###### Last preprocessing step. Removes all non-relevant articles from our corpus. Removes non-relevant information from arxiv-full-database.
###### Adds word count for each context to the data frame
###### Saves the result in database-files

In [None]:
import pandas as pd
import numpy as np
import time
import nltk
import glob
import os
from tqdm.notebook import tqdm

In [None]:
def flatten(list):
    """Concatenate every item of every element of ``list`` into one string.

    Each element of ``list`` is iterated in order and its items are joined,
    without any separator, into a single string. An empty input yields ``""``.
    """
    # NOTE: the parameter name shadows the built-in ``list``; it is kept so
    # that the function's signature stays exactly as callers know it.
    pieces = []
    for sublist in list:
        pieces.extend(sublist)
    return "".join(pieces)
# Timestamp of this run (local time, minute resolution); appended to the
# names of the files written further down.
timestr = time.strftime("%Y%m%d-%H%M", time.localtime())
# Directory that receives the processed database files.
path = '/Users/Svesketerning/Google-Drev/experiments/database-files'

In [None]:
# Load the full arXiv metadata table.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
df_arxiv = pd.read_pickle(r'arxiv-full-database-20210427-1638.pickle')
keep = ['title','abstract','authors_parsed',
        'numauthors','categories_parsed','numcats','maincat',
       'nummathcats','mainmathcat']
# Same list as `keep` plus 'numpages'; it is not used in this cell.
keep_og = ['title','abstract','authors_parsed','numpages',
        'numauthors','categories_parsed','numcats','maincat',
       'nummathcats','mainmathcat']
# Keep only relevant columns. The explicit copy makes df_arxiv an independent
# frame, so the column assignment below is unambiguous.
df_arxiv = df_arxiv[keep].copy()

# Rename the maincat values 'cs.IT' -> 'math.IT' and 'math-ph' -> 'math.MP'.
# The result is assigned back to the column: calling
# df_arxiv['maincat'].replace(..., inplace=True) only modifies a temporary
# object under pandas Copy-on-Write and would leave df_arxiv unchanged.
df_arxiv['maincat'] = df_arxiv['maincat'].replace({'cs.IT': 'math.IT', 'math-ph': 'math.MP'})
# Remove articles without mainmathcat=maincat
df_arxiv = df_arxiv.drop(df_arxiv[df_arxiv.maincat != df_arxiv.mainmathcat].index)

# Load the per-article contexts; every column except 'arxivid' is a context.
df_contexts = pd.read_feather('arxiv-contexts-20210428-1144.feather')
contexts = [x for x in df_contexts.columns if x!='arxivid']
for c in contexts:
    # Collapse each cell into a single string with flatten().
    df_contexts[c] = df_contexts[c].apply(flatten)
df_contexts = df_contexts.set_index('arxivid')
df_contexts.index.names = ['id'] # ArXiv id as index

# Drop context rows whose index is not in df_arxiv.
article_list = df_arxiv.index.tolist()
df_contexts = df_contexts[df_contexts.index.isin(article_list)]
# Outcome is all articles with maincat a mathematics category alongside contexts
# Outcome is all articles with maincat a mathematics category alongside contexts

In [None]:
# Word count of each context for each article, merged into df_arxiv.
# Rows are walked positionally with itertuples() instead of
# df_contexts[c][index]: an integer key on a Series whose index holds article
# ids relies on pandas' deprecated positional fallback.
contextwordcount_df = [] # One record of per-context word counts per article
for article_id, texts in zip(df_contexts.index,
                             df_contexts[contexts].itertuples(index=False, name=None)):
    counts_article = [len(nltk.word_tokenize(text)) for text in texts]
    # Not generic: the five context names are listed by hand and matched to
    # the columns in `contexts` by position.
    contextwordcount_df.append({'id': article_id,
                                'outer': counts_article[0], 'theorem': counts_article[1],
                                'meta': counts_article[2], 'proof': counts_article[3],
                                'other': counts_article[4]})
# Build the frame in one step from the list of records. Passing the column
# names explicitly keeps this working when there are no articles at all.
contextwordcount_df = pd.DataFrame(
    contextwordcount_df,
    columns=['id', 'outer', 'theorem', 'meta', 'proof', 'other'])
contextwordcount_df = contextwordcount_df.set_index('id')
# merge to df_arxiv (index-on-index; only ids present in both frames remain)
df_arxiv = df_arxiv.merge(contextwordcount_df, left_index=True, right_index=True)

In [None]:
# Move the index of both frames back into ordinary columns before writing
# them out as feather files.
df_contexts = df_contexts.reset_index()
df_arxiv = df_arxiv.reset_index()
# File names are "<name><timestr>.feather" inside the `path` directory.
df_contexts.to_feather(f"{path}/arxiv_contexts{timestr}.feather")
df_arxiv.to_feather(f"{path}/arxiv_extended_database{timestr}.feather")

In [None]:
# Also checks for Ramos Corpus
# Load the extended-database file with the largest os.path.getctime value.
LatestDatabaseFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/arxiv_extended_database*'),key=os.path.getctime)
df_arxiv = pd.read_feather(LatestDatabaseFile)

# NOTE: df_arxiv_ramos is not created in this cell; it is assigned in the
# Ramos metadata cell further down the notebook, which has to be run first.

# Categories in order of first appearance in df_arxiv; they become the rows.
categories = df_arxiv['mainmathcat'].unique().tolist()

# Count the articles per category once per corpus instead of filtering the
# whole frame for every category. Categories missing from a corpus get 0.
count_2020 = df_arxiv['mainmathcat'].value_counts().reindex(categories, fill_value=0)
count_2009 = df_arxiv_ramos['mainmathcat'].value_counts().reindex(categories, fill_value=0)

# NOTE(review): 12990 and 9686 look like the total article counts of the two
# corpora - confirm, and consider len(df_arxiv) / len(df_arxiv_ramos).
# The table is built in one step; DataFrame.append no longer exists in
# pandas 2.x.
cat_df = pd.DataFrame({'Count 2020': count_2020,
                       'Count 2009': count_2009,
                       'pct 2020': count_2020/12990,
                       'pct 2009': count_2009/9686},
                      columns=['Count 2020','Count 2009','pct 2020','pct 2009'])
cat_df.index.name = None

In [None]:
import textdistance

# Category labels of cat_df ordered by ascending share in each corpus.
list_2009 = cat_df.sort_values(by='pct 2009').index.tolist()
list_2020 = cat_df.sort_values(by='pct 2020').index.tolist()

# textdistance.jaro_winkler applied to the two orderings; the value is not
# assigned to anything.
textdistance.jaro_winkler(list_2020, list_2009)

# Sum of the absolute differences between the two share columns, divided
# by 32.
# NOTE(review): 32 is presumably the number of categories - confirm.
pct_gap = np.abs(cat_df['pct 2020'] - cat_df['pct 2009'])
sum(pct_gap) / 32

### RamosMejia2009 Corpus check 
#### using my Pipeline (removing all articles where maincat != mainmathcat)

In [None]:
# Read the Ramos corpus contexts; every column except 'arxivid' is a context.
df_ramos = pd.read_feather(r'corpus-MejiaRamos2019a-contexts-20210516-1831.feather')
contexts = [column for column in df_ramos.columns if column != 'arxivid']
# Collapse each context cell into a single string with flatten().
for c in contexts:
    df_ramos[c] = df_ramos[c].apply(flatten)
# Use the arXiv id as the index and call that index 'id'.
df_ramos = df_ramos.set_index('arxivid').rename_axis('id')
df_ramos

In [None]:
# Rename the maincat values 'cs.IT' -> 'math.IT' and 'math-ph' -> 'math.MP'.
# The result is assigned back to the column: calling
# df_arxiv['maincat'].replace(..., inplace=True) only modifies a temporary
# object under pandas Copy-on-Write and would leave df_arxiv unchanged.
df_arxiv['maincat'] = df_arxiv['maincat'].replace({'cs.IT': 'math.IT', 'math-ph': 'math.MP'})
# Drop every article whose maincat differs from its mainmathcat.
df_arxiv = df_arxiv.drop(df_arxiv[df_arxiv.maincat != df_arxiv.mainmathcat].index)

In [None]:
# Load the Ramos corpus metadata.
df_arxiv_ramos = pd.read_feather(r'corpus-MejiaRamos2019a-metadata.feather')
# Rename the maincat values 'cs.IT' -> 'math.IT' and 'math-ph' -> 'math.MP'.
# The result is assigned back to the column: an in-place replace on
# df_arxiv_ramos['maincat'] only modifies a temporary object under pandas
# Copy-on-Write and would leave the frame unchanged.
df_arxiv_ramos['maincat'] = df_arxiv_ramos['maincat'].replace({'cs.IT': 'math.IT', 'math-ph': 'math.MP'})
# Drop every article whose maincat differs from its mainmathcat.
df_arxiv_ramos = df_arxiv_ramos.drop(df_arxiv_ramos[df_arxiv_ramos.maincat != df_arxiv_ramos.mainmathcat].index)


# Load the Ramos corpus contexts; every column except 'arxivid' is a context.
df_ramos = pd.read_feather(r'corpus-MejiaRamos2019a-contexts-20210516-1831.feather')
contexts = [x for x in df_ramos.columns if x!='arxivid']
for c in contexts:
    # Collapse each cell into a single string with flatten().
    df_ramos[c] = df_ramos[c].apply(flatten)
df_ramos = df_ramos.set_index('arxivid')
df_ramos.index.names = ['id'] # ArXiv id as index

# Keep only the context rows whose id is still present in the metadata.
article_list = df_arxiv_ramos.arxivid.tolist()
df_ramos = df_ramos[df_ramos.index.isin(article_list)]

In [None]:
# Word count of each context for each article, merged into df_arxiv_ramos.
# Rows are walked positionally with itertuples() instead of
# df_ramos[c][index]: an integer key on a Series whose index holds article
# ids relies on pandas' deprecated positional fallback.
contextwordcount_df = [] # One record of per-context word counts per article
for article_id, texts in tqdm(zip(df_ramos.index,
                                  df_ramos[contexts].itertuples(index=False, name=None)),
                              total=len(df_ramos.index)):
    counts_article = [len(nltk.word_tokenize(text)) for text in texts]
    # Not generic: the five context names are listed by hand and matched to
    # the columns in `contexts` by position.
    contextwordcount_df.append({'id': article_id,
                                'outer': counts_article[0], 'theorem': counts_article[1],
                                'meta': counts_article[2], 'proof': counts_article[3],
                                'other': counts_article[4]})
# Build the frame in one step from the list of records. Passing the column
# names explicitly keeps this working when there are no articles at all.
contextwordcount_df = pd.DataFrame(
    contextwordcount_df,
    columns=['id', 'outer', 'theorem', 'meta', 'proof', 'other'])
contextwordcount_df = contextwordcount_df.set_index('id')
# merge to df_arxiv_ramos: index the metadata by arXiv id (named 'id') and
# join index-on-index; only ids present in both frames remain.
df_arxiv_ramos.set_index('arxivid', inplace=True)
df_arxiv_ramos.index.names = ['id']
df_arxiv_ramos = df_arxiv_ramos.merge(contextwordcount_df, left_index=True, right_index=True)



In [None]:
# Write both Ramos frames to the `path` directory as feather files, with the
# index moved back into an ordinary column first. The frames themselves are
# left untouched (reset_index() returns a new frame).
ramos_metadata_out = df_arxiv_ramos.reset_index()
ramos_metadata_out.to_feather(f"{path}/ramos2009a_arxiv_extended_database{timestr}.feather")
ramos_contexts_out = df_ramos.reset_index()
ramos_contexts_out.to_feather(f"{path}/ramos2009a_arxiv_contexts{timestr}.feather")

In [None]:
# One [category, number of articles] pair per distinct mainmathcat value in
# df_arxiv_ramos, in order of first appearance.
numb_cat2 = [
    [category, len(df_arxiv_ramos[df_arxiv_ramos['mainmathcat'] == category])]
    for category in df_arxiv_ramos['mainmathcat'].unique().tolist()
]