# Exploring the arXMLiv dataset

arXMLiv 08.2018 - An HTML5 dataset for arXiv.org Data

In [None]:
import os

# Root directory of the unpacked arXMLiv dataset. It can be overridden without
# editing the notebook by setting the ARXMLIV_DATA_DIR environment variable;
# the fallback is the original hard-coded location.
DATA_BASE_DIR = os.environ.get("ARXMLIV_DATA_DIR", "/Volumes/Backup/no_problem")

Let's start by getting an overview of our dataset structure.

In [None]:
# Walk the dataset once and collect every sub-directory and every HTML document.
files = []
directories = []
for root, dir_names, file_names in os.walk(DATA_BASE_DIR):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # dir_names in place also fixes the order of descent, so `files` is stable
    # across runs and machines and the seeded random.sample() used later in
    # this notebook really is repeatable.
    dir_names.sort()
    for dir_name in dir_names:
        directories.append(os.path.join(root, dir_name))
    for file_name in sorted(file_names):
        # Match on the extension only; a substring test would also pick up
        # names such as "paper.html.bak".
        if file_name.endswith('.html'):
            files.append(os.path.join(root, file_name))

number_of_files = len(files)

print(number_of_files)
print(files[:10])



In [None]:
# Number of sub-directories found below DATA_BASE_DIR, followed by the first ten paths as a sample.
print(len(directories))
print(directories[:10])

In [None]:
 # Dump the raw markup of the first collected document.
 with open(files[0], "rt") as file:
     print(file.read())

We have 337 folders containing 150701 HTML5 documents taking up 60.25 gigabytes of storage. The only useful metadata associated with these files is their [arXiv identifier](https://arxiv.org/help/arxiv_identifier), which is used as the filename.



## Text extraction



In [None]:
from bs4 import BeautifulSoup

def read_file(file, encoding="utf-8"):
    """Return the complete text content of a file.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default encoding. NOTE(review): assumes the dataset's HTML files
            are UTF-8 encoded -- confirm.

    Returns:
        The decoded file content as a single string.
    """
    # Use a separate name for the handle so the path argument is not shadowed.
    with open(file, "rt", encoding=encoding) as handle:
        return handle.read()

# Parse the first document and print its text content with the markup stripped.
# raw_file is kept so later cells can compare text length against the raw length.
raw_file = read_file(files[0])
soup = BeautifulSoup(raw_file, features="html.parser")
print(soup.get_text())




In [None]:
# The printed value is extracted-text length divided by raw length,
# i.e. the fraction that remains (smaller means a stronger reduction).
print("Size reduction {:f}".format(len(soup.get_text()) / len(raw_file)))

We can extract the text relatively easily with BeautifulSoup. The text looks quite usable at first glance, and even this simple preprocessing reduced the content to 29% of its original size.
But a closer look reveals artifacts like ```POSTSUBSCRIPT:start italic-nu POSTSUBSCRIPT:end OPEN:( italic-t CLOSE:)```. Additional postprocessing is needed.

Let's have a look at the unusual HTML tags in the file.


In [None]:
def unusual_tags(soup):
    """Print the tag types in `soup` that are not ordinary structural HTML.

    Every tag returned by ``soup.find_all()`` is inspected. Each tag name
    outside a small allow-list is reported once, together with the first tag
    of that type as an example.

    Args:
        soup: Parsed document. Only ``find_all()`` and the ``name`` attribute
            of the returned tags are used.

    Returns:
        None. Two lists are printed: the distinct unusual tag names in order
        of first appearance, and the first tag encountered for each name.
    """
    # preload of some very common tags to reduce noise in the output
    usual_tags = {'html', 'head', 'title', 'meta', 'body', 'div', 'article', 'p', 'section', 'span'}
    unusual_tag_types = []
    unusual_content = []
    seen_names = set()

    for tag in soup.find_all():
        name = tag.name
        if name in usual_tags or name in seen_names:
            continue
        # Deduplicate on the tag *name*. Comparing the name against the list
        # of collected tag objects never matches, which would report every
        # single occurrence instead of one example per tag type.
        seen_names.add(name)
        unusual_tag_types.append(name)
        unusual_content.append(tag)

    print(unusual_tag_types)
    print(unusual_content)

# Report the non-standard tags of the first document.
unusual_tags(soup)




Skimming the list of unusual tags shows the repeated occurrence of ```MathML``` tags and a footer that only contains the note that LaTeXML was used for the conversion. Dropping these should further clean up the text.

In [None]:
# Re-parse the first document so the removals below start from unmodified markup.
soup = BeautifulSoup(read_file(files[0]), features="html.parser")
# Detach every <math> and <footer> element from the tree. The loop variable is
# called `script` although no <script> tags are involved.
for script in soup(["math", "footer"]):
    script.extract()

# List the unusual tags again, now on the reduced tree.
unusual_tags(soup)

# NOTE(review): reduced_text is not referenced again in the visible part of the notebook.
reduced_text = soup.get_text()



In [None]:
# Same ratio as before, measured on the reduced tree: remaining text length
# divided by the raw length of the first document.
print("Size reduction {:f}".format(len(soup.get_text()) / len(raw_file)))

In [None]:
# Text of the first document after <math> and <footer> were removed.
print(soup.get_text())

## Combining the techniques 

Let's use these techniques and combine them with some normal string cleaning to create a cleanup function.

In [None]:
import re

def get_text(file):
    """Extract normalised plain text from one HTML document.

    The document is parsed, its ``<math>`` and ``<footer>`` elements are
    dropped, and the remaining text is lower-cased and reduced to ASCII
    letters, colons and single spaces.

    Args:
        file: Path of the HTML document.

    Returns:
        The cleaned text as a single-line string.
    """
    soup = BeautifulSoup(read_file(file), features="html.parser")
    for node in soup(["math", "footer"]):
        node.extract()

    text = soup.get_text().lower()
    # Drop contraction suffixes, since they are all stopwords (I've -> I).
    # The look-behind requires a word character in front of the apostrophe so
    # that an opening quote is not mistaken for a contraction: 'dark' has to
    # become "dark" instead of being deleted together with its quote.
    text = re.sub(r"(?<=\w)(?:n't|'[a-z]+)", '', text)
    # Drop everything that is not an ASCII letter, whitespace or a colon.
    text = re.sub(r'[^a-zA-Z\s:]', '', text)
    # Collapse every whitespace run (newlines, tabs, repeated spaces) into a
    # single space so the result is one line with uniform separators.
    text = re.sub(r"\s+", " ", text)

    return text

#print(get_text(files[0]))


# Smoke test: run the cleanup on the first ten documents; the results are discarded.
for file in files[0:10]:
    get_text(file)


In [None]:
import spacy 
import re
import sys

from matplotlib import pyplot as plt
from nltk.stem import PorterStemmer
from tqdm.auto import tqdm

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# Load the small English spaCy pipeline with the ner, tagger and parser components switched off.
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'tagger', 'parser'])
stopwords = nlp.Defaults.stop_words # stopword collection from spaCy's defaults for the loaded English pipeline
stemmer = PorterStemmer() # stemmer from NLTK

def remove_stopwords_and_stem(text):
    """Tokenise `text`, drop stopwords and one-character tokens, stem the rest.

    Uses the notebook-level `nlp` pipeline for tokenisation, the `stopwords`
    collection for filtering and `stemmer` for stemming.

    Args:
        text: Text to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmed_tokens = [
        stemmer.stem(token.text)
        for token in nlp(text)
        if token.text not in stopwords and len(token.text) > 1
    ]
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r"\s{2,}", " ", " ".join(stemmed_tokens))

# Show the cleaned, stopword-free and stemmed text of the first document.
print(remove_stopwords_and_stem(get_text(files[0])))
        

## Gather information about the dataset

Let's apply the created cleanup functions onto a subset of the documents and get a feeling for the dataset by running some analysis and trying to model the properties of the complete dataset.
We use the random.sample()-function with a fixed seed to get repeatable results.

A pickled result of analysing 5000 documents is stored alongside this notebook to speed up execution.

In [None]:
from collections import Counter
from copy import deepcopy
import random 
import pandas as pd

AMOUNT_OF_DOCUMENTS = 10
SEED = 42
# Seed the generator so the same documents are drawn on every run.
random.seed(SEED)
chosen_files = random.sample(files, AMOUNT_OF_DOCUMENTS)
# Allow inputs of up to 2,000,000 characters -- presumably because some
# documents exceed the pipeline's default limit.
nlp.max_length = 2000000

# One row of measurements per processed document.
rows = []

# Token frequencies accumulated over all documents processed so far.
complete_counter = Counter()

for file in chosen_files:
    raw_file = read_file(file)
    raw_length = len(raw_file)

    text = get_text(file)
    text_length = len(text)

    cleaned_text = remove_stopwords_and_stem(text)
    cleaned_length = len(cleaned_text)

    # split() without an argument yields no tokens for an empty document and
    # ignores leading/trailing blanks; split(" ") would return [''] and count
    # the empty string as a word and as a vocabulary entry.
    tokens = cleaned_text.split()
    amount_of_words = len(tokens)

    counter = Counter(tokens)
    unique_words = len(counter)

    # Vocabulary size after this document: distinct tokens seen so far.
    complete_counter.update(counter)
    total_words = len(complete_counter)

    rows.append([raw_length, text_length, cleaned_length, amount_of_words, unique_words, total_words])

df = pd.DataFrame(rows, columns=["raw_length", "text_length", "cleaned_length", "amount_of_words", "unique_words", "total_words"])
df



    


In [None]:
# Replace the small frame built above with the cached 5000-document result.
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle you created yourself.
# NOTE(review): complete_counter still reflects the small in-notebook sample, not this cached frame.
df= pd.read_pickle("results_5000_documents_seed_42")

def compression_factor(cleaned, raw):
    """Return the size of `cleaned` relative to `raw` (``cleaned / raw``).

    The division is applied as-is, so it works for plain numbers as well as
    for the DataFrame columns it is called with in this notebook.
    """
    ratio = cleaned / raw
    return ratio

# Per-document ratio of extracted-text length to raw HTML length.
df['compression_factor'] = compression_factor(df['text_length'], df['raw_length'])
df

We are measuring different properties of the dataset.
The ```_length``` fields store the string length in characters after reading the file (```raw_length```), after cleaning it (```text_length```), and after removing the stopwords and stemming it (```cleaned_length```).

```amount_of_words```, ```unique_words``` and ```total_words``` are computed on the stemmed strings. ```total_words``` is not simply the sum of ```unique_words```: it is the number of keys stored in a single ```collections.Counter``` instance that accumulates the tokens of all files processed so far.

Looking at the distributions shows nothing unexpected. Every attribute is roughly normally distributed with a long tail to the right.


In [None]:

# One 100-bin histogram per column except total_words, arranged in a 6x1 grid.
ax = df.loc[:, df.columns != 'total_words'].hist(layout=(6,1),bins=100, figsize=(8,15))


In [None]:
import collections
# Length in characters of every distinct token in complete_counter.
word_length = [len(x) for x in complete_counter.keys()]
word_length_df = pd.DataFrame(word_length)

# Use explicit integer bin edges so that each token length gets its own bar.
# With bins=max-min the edges stop at the maximum, and because the last bin is
# closed on both sides it merges the two largest lengths; that bin count is
# also 0, and therefore invalid, when all tokens have the same length.
bin_edges = list(range(min(word_length), max(word_length) + 2))
word_length_df.hist(bins=bin_edges)
plt.title("token length distribution")


The number of unique tokens seems to resemble a logarithmic growth curve. This matches the intuition that each additional document can only add so many new tokens to the set of already seen tokens.


In [None]:
# Line plot of the total_words column: the distinct-token count row by row.
df.loc[:, df.columns == 'total_words'].plot.line(layout=(1,1), figsize=(10,6))


In [None]:
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import numpy as np

# Observed values of the total_words column, one entry per row of df.
yn = df['total_words'].to_numpy()

# One x position per row. Deriving the range from the data instead of a
# hard-coded 5000 keeps x and yn the same length for any sample size.
x = np.arange(len(yn))

def logFunc( x, a, b, c, d ):
    """Evaluate the logarithmic model ``a * ln(b * x + c) + d``."""
    argument = b*x + c
    return a*np.log(argument) + d

# Fit the four parameters of logFunc to the observed growth curve.
popt, pcov = curve_fit(logFunc, x, yn)

print(popt)

# Observed curve and fitted model in one figure.
plt.plot(x, yn, label="Original data")
plt.plot(x, logFunc(x, *popt), label="Fitted curve")
plt.title('Unique tokens prediction')
plt.xlabel('Number of documents')
plt.ylabel('Number of unique tokens')
plt.legend(loc='upper left')
plt.show()

def predict_unique_token_count(x):
    """Evaluate the fitted logarithmic model at `x`, using the notebook-level `popt`."""
    a, b, c, d = popt[0], popt[1], popt[2], popt[3]
    return logFunc(x, a, b, c, d)



We can fit a function and try to estimate the total amount of unique tokens in the dataset

In [None]:
# Extrapolate the fitted curve from the sample to the number of collected HTML files.
unique_tokens = predict_unique_token_count(number_of_files)
print("The number of unique tokens for the complete dataset is modeled to be around {}".format(unique_tokens))

We can repeat this approach to get an estimation of the total amount of tokens contained in the dataset.

In [None]:
# Running total of amount_of_words over the rows of df, plotted as a line.
df.loc[:, df.columns == 'amount_of_words'].cumsum().plot.line(layout=(1,1), figsize=(10,6))
# NOTE: this rebinds yn (previously the total_words values) to the running total.
yn=df.loc[:, df.columns == 'amount_of_words'].cumsum().values.flatten()


In [None]:
from numpy.polynomial import Polynomial

# Fit a degree-1 polynomial (a straight line) to the running token total.
# window and domain are both set to the x range, so no rescaling of x takes
# place between the two. full=True makes fit() return a second value,
# which is bound to `stats` here.
cmin, cmax = min(x), max(x)
pfit, stats = Polynomial.fit(x, yn, 1, full=True, window=(cmin, cmax),
                                                    domain=(cmin, cmax))

# Overlay the fitted line on the observed running total.
plt.plot(x,yn, label="Original data")
plt.plot(x, pfit(x), label="Fitted curve")
plt.legend(loc='upper left')
plt.title('Amount of tokens')
plt.xlabel('Number of documents')
plt.ylabel('Number of tokens')
plt.show()



In [None]:
# Extrapolate the linear fit to the number of documents in the dataset.
# The fit maps a document count to a cumulative token count, so it has to be
# evaluated at number_of_files, not at the predicted vocabulary size.
amount_of_tokens = pfit(number_of_files)
print("The total number of tokens for the complete dataset is modeled to be around {}".format(amount_of_tokens))