# Exploring the arXMLiv dataset

arXMLiv 08.2018 - An HTML5 dataset for arXiv.org Data

In [None]:
import os

# Root directory of the local dataset copy; the directory scan below starts here.
DATA_BASE_DIR = "/Volumes/Backup/no_problem"

Let's start by getting an overview of our dataset structure.

In [None]:
# Collect every sub-directory and every HTML document below DATA_BASE_DIR.
files = []
directories = []
for root, dir_names, file_names in os.walk(DATA_BASE_DIR):
    directories.extend(os.path.join(root, name) for name in dir_names)
    # Match on the extension rather than on a substring, so that names which
    # merely contain ".html" (e.g. "paper.html.bak") are not picked up.
    files.extend(
        os.path.join(root, name)
        for name in file_names
        if name.endswith('.html')
    )

print(len(files))
print(files[:10])



In [None]:
# Show how many directories were found, followed by a sample of their paths.
print(len(directories), directories[:10], sep="\n")

In [None]:
 # Dump the raw markup of the first collected document.
 # The encoding is passed explicitly because open() otherwise falls back to a
 # platform-dependent default; UTF-8 is assumed for these HTML5 files - TODO confirm.
 # NOTE(review): this cell starts with one space of indentation; run as a plain
 # Python script that raises IndentationError - consider dedenting.
 with open(files[0], "rt", encoding="utf-8") as file:
     print(file.read())

We have 337 folders containing 150701 HTML5 documents, taking up 60.25 gigabytes of storage. The only useful metadata associated with these files is their [arXiv identifier](https://arxiv.org/help/arxiv_identifier), which is used as the filename.



## Text extraction



In [None]:
from bs4 import BeautifulSoup

def read_file(file, encoding="utf-8"):
    """Return the complete text content of a file.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        The decoded file content as a single string.
    """
    # Use a separate name for the handle so the path argument is not shadowed.
    with open(file, "rt", encoding=encoding) as handle:
        return handle.read()

# Keep the raw markup around; its length is the baseline for the size comparison.
raw_file = read_file(files[0])
# Parse with the parser that ships with the standard library.
soup = BeautifulSoup(raw_file, "html.parser")
# Print only the text nodes of the document, without any markup.
print(soup.get_text())




In [None]:
# Length of the extracted text relative to the raw markup (in characters).
print(f"Size reduction {len(soup.get_text()) / len(raw_file):f}")

We can extract the text relatively easily with BeautifulSoup. The text looks quite usable at first glance, and even this simple preprocessing reduced the content to 29% of its original size.
But a closer look reveals artifacts like ```POSTSUBSCRIPT:start italic-nu POSTSUBSCRIPT:end OPEN:( italic-t CLOSE:)```. Additional postprocessing is needed.

Let's have a look at the unusual HTML tags in the file.


In [None]:
def unusual_tags(soup):
    """Print the tag types in *soup* that are not ordinary page structure.

    Two lines are printed: the list of distinct uncommon tag names in order
    of first appearance, and a list holding the first element found for
    each of those names.

    Args:
        soup: Parsed document; anything whose ``find_all()`` yields elements
            with a ``name`` attribute works (e.g. a ``BeautifulSoup`` object).
    """
    # Very common structural tags are skipped to reduce noise in the output.
    usual_tags = {'html', 'head', 'title', 'meta', 'body', 'div', 'article', 'p', 'section', 'span'}
    # Maps tag name -> first element seen; dicts preserve insertion order.
    first_seen = {}

    for tag in soup.find_all():
        # De-duplicate by tag name so that each type is reported once, with
        # its first occurrence serving as the example.
        if tag.name not in usual_tags and tag.name not in first_seen:
            first_seen[tag.name] = tag

    print(list(first_seen))
    print(list(first_seen.values()))

# Report the uncommon tags of the document held in `soup`.
unusual_tags(soup)




Skimming the list of unusual tags shows the repeated occurrence of ```MathML``` tags. Dropping these should further clean up the text.

In [None]:
# Re-parse the first document from scratch, then detach every <math> element
# from the tree so its content no longer shows up in the extracted text.
soup = BeautifulSoup(read_file(files[0]), "html.parser")
for math_tag in soup.find_all("math"):
    math_tag.extract()

# Check which uncommon tags remain after the removal.
unusual_tags(soup)

# Plain text of the document without the <math> elements.
reduced_text = soup.get_text()

In [None]:
# Length of the text extracted from the current `soup` relative to the raw markup.
print(f"Size reduction {len(soup.get_text()) / len(raw_file):f}")

In [None]:
# Show the text extracted from the current state of `soup`.
print(soup.get_text())