# Base Experiments 

In [27]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob
import os
import time
import dataframe_image as dfi
import statsmodels.api as sm
import statsmodels.formula.api as smf


# Root directory of the experiment data; output locations in later cells are
# built by appending sub-paths to it.
path = '/Users/Svesketerning/Google-Drev/experiments'
# Timestamp (YYYYMMDD-HHMM) taken once here and appended to output file names.
timestr = time.strftime("%Y%m%d-%H%M")

## Functions for counts by branch (mainmathcat), by context, and by branch and context

### Context 

In [4]:
def raw_wordcount_context(focusword_df):
    """Count the rows of ``focusword_df`` that fall in each context.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Must have a ``context`` column.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with one integer column per context, in
        the order ``outer``, ``theorem``, ``meta``, ``proof``, ``other``.
        Rows whose context is not one of these five are not counted.
    """
    # Fixed context vocabulary; it also fixes the column order of the result.
    context_names = ('outer', 'theorem', 'meta', 'proof', 'other')
    counts = {
        name: len(focusword_df[focusword_df['context'] == name])
        for name in context_names
    }
    return pd.DataFrame(counts, index=[0])
def freq_wordcount_context(focusword_df, df_arxiv=None):
    """Frequency (per million words) of focus-word rows in each context.

    For every context the number of rows of ``focusword_df`` with that
    context is divided by the sum of the same-named column of ``df_arxiv``
    and scaled by 10**6.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Must have a ``context`` column.
    df_arxiv : pandas.DataFrame, optional
        Must have one numeric column per context (``outer``, ``theorem``,
        ``meta``, ``proof``, ``other``).  When omitted, the module-level
        ``df_arxiv`` is used, which is what this function always did before
        the parameter existed.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with one float column per context.  A
        context whose column sum is not positive yields NaN (kept numeric so
        that later ``round``/``to_latex`` calls treat the column as floats).
    """
    if df_arxiv is None:
        # Backward compatibility: existing callers pass only focusword_df and
        # rely on the notebook-level df_arxiv being loaded.
        df_arxiv = globals()['df_arxiv']

    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    freqs = {}
    for name in contexts:
        hits = len(focusword_df[focusword_df['context'] == name])
        total_words = df_arxiv[name].sum()
        if total_words > 0:
            freqs[name] = hits / total_words * 10**6
        else:
            # No words recorded for this context: frequency is undefined.
            freqs[name] = float('nan')
    return pd.DataFrame(freqs, index=[0])

### Branch

In [20]:
def raw_wordcount_branch(focusword_df):
    """Count the rows of ``focusword_df`` per branch (``mainmathcat``).

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Must have a ``mainmathcat`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``mainmathcat`` and ``focuswords`` (row count of that
        branch), one row per distinct branch in order of first appearance,
        with a default 0..n-1 index.  An empty input yields an empty frame
        with the same two columns instead of raising.
    """
    rows = [
        {'mainmathcat': branch,
         'focuswords': len(focusword_df[focusword_df['mainmathcat'] == branch])}
        for branch in focusword_df['mainmathcat'].unique()
    ]
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=['mainmathcat', 'focuswords'])
def freq_wordcount_branch(focusword_df, df_arxiv):
    """Frequency (per million words) of focus-word rows in each branch.

    For every distinct ``mainmathcat`` in ``focusword_df`` the number of its
    rows is divided by the total of the five context columns over the rows of
    ``df_arxiv`` with the same ``mainmathcat``, and scaled by 10**6.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Must have a ``mainmathcat`` column.
    df_arxiv : pandas.DataFrame
        Must have ``mainmathcat`` and the numeric columns ``outer``,
        ``theorem``, ``meta``, ``proof``, ``other``.

    Returns
    -------
    pandas.DataFrame
        Columns ``mainmathcat`` and ``focuswords`` (the frequency), one row
        per branch in order of first appearance, default 0..n-1 index.  A
        branch whose word total is not positive yields NaN.  An empty input
        yields an empty frame with the same two columns instead of raising.
    """
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    rows = []
    for branch in focusword_df['mainmathcat'].unique():
        hits = len(focusword_df[focusword_df['mainmathcat'] == branch])
        # Total words of this branch, summed over papers and contexts.
        total_words = df_arxiv.loc[df_arxiv['mainmathcat'] == branch, contexts].sum().sum()
        if total_words > 0:
            freq = hits / total_words * 10**6
        else:
            # Branch has no recorded words: frequency is undefined.
            freq = float('nan')
        rows.append({'mainmathcat': branch, 'focuswords': freq})
    return pd.DataFrame(rows, columns=['mainmathcat', 'focuswords'])

### Context & Branch 

In [18]:
def raw_wordcount(focusword_df, df_arxiv=None):
    """Count the rows of ``focusword_df`` per branch and context.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Must have ``mainmathcat`` and ``context`` columns.
    df_arxiv : optional
        Not used by this function; accepted (and now optional) only so that
        calls with one or two arguments both work.

    Returns
    -------
    pandas.DataFrame
        Columns ``mainmathcat``, ``outer``, ``theorem``, ``meta``, ``proof``,
        ``other``.  One row per distinct branch in order of first appearance,
        followed by a final row holding the column totals, whose
        ``mainmathcat`` is NaN.  Counts are integers.  An empty input yields
        just the totals row (all zeros) instead of raising.
    """
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    rows = []
    for branch in focusword_df['mainmathcat'].unique():
        in_branch = focusword_df[focusword_df['mainmathcat'] == branch]
        row = {'mainmathcat': branch}
        for name in contexts:
            row[name] = len(in_branch[in_branch['context'] == name])
        rows.append(row)

    # Totals row.  Built here rather than with DataFrame.append, which was
    # removed in pandas 2.0; the branch label is left missing as before.
    totals = {'mainmathcat': float('nan')}
    for name in contexts:
        totals[name] = sum(row[name] for row in rows)
    rows.append(totals)

    return pd.DataFrame(rows, columns=['mainmathcat'] + contexts)
def freq_wordcount(focusword_df, df_arxiv):
    """Frequency (per million words) of focus-word rows per branch and context.

    Each cell is the number of rows of ``focusword_df`` with that branch and
    context, divided by the sum of the context column over the rows of
    ``df_arxiv`` with the same branch, scaled by 10**6.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Must have ``mainmathcat`` and ``context`` columns.
    df_arxiv : pandas.DataFrame
        Must have ``mainmathcat`` and the numeric columns ``outer``,
        ``theorem``, ``meta``, ``proof``, ``other``.

    Returns
    -------
    pandas.DataFrame
        Columns ``mainmathcat`` plus one float column per context.  One row
        per distinct branch of ``focusword_df`` in order of first appearance,
        followed by a row labelled ``'Total Freq.'`` computed over all rows
        of both frames.  Any cell whose denominator is not positive is NaN;
        this now also holds for the total row, which previously divided
        without a guard.
    """
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']

    def per_million(hits, total_words):
        # Frequency is undefined when no words were recorded.
        if total_words > 0:
            return hits / total_words * 10**6
        return float('nan')

    rows = []
    for branch in focusword_df['mainmathcat'].unique():
        focus_in_branch = focusword_df[focusword_df['mainmathcat'] == branch]
        corpus_in_branch = df_arxiv[df_arxiv['mainmathcat'] == branch]
        row = {'mainmathcat': branch}
        for name in contexts:
            hits = len(focus_in_branch[focus_in_branch['context'] == name])
            row[name] = per_million(hits, corpus_in_branch[name].sum())
        rows.append(row)

    # Overall frequency per context.  The denominator is the whole of
    # df_arxiv, including branches that have no focus-word rows.
    total_row = {'mainmathcat': 'Total Freq.'}
    for name in contexts:
        hits = len(focusword_df[focusword_df['context'] == name])
        total_row[name] = per_million(hits, df_arxiv[name].sum())
    rows.append(total_row)

    # Built in one step; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['mainmathcat'] + contexts)

### Explanation base data, dataframes and LaTeX

In [28]:
# Load the arXiv word-count database: among the files matching the glob, pick
# the one with the largest os.path.getctime. max() raises ValueError if the
# glob matches nothing.
LatestDatabaseFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/arxiv_extended_database*'),key=os.path.getctime)
df_arxiv = pd.read_feather(LatestDatabaseFile)

# Same selection rule for the raw "explain" focus-word file.
LatestExplanationFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/explain_fullraw*'),key=os.path.getctime)
explanation_df = pd.read_feather(LatestExplanationFile)

In [6]:
# Saves 6 feather files and 6 LaTeX tables: raw counts and per-million
# frequencies, each by context & branch, by context only, and by branch only.
# File names are path + sub-directory + table name + timestr.

explain_contextbranch_raw = raw_wordcount(explanation_df, df_arxiv)
explain_contextbranch_raw.to_feather(path+'/explanation-data/explain_contextbranch_raw'+timestr+'.feather')
# Nicer in TeX: integer counts and a label on the totals row. The feather file
# above is written before these cosmetic changes.
explain_contextbranch_raw[['outer', 'theorem', 'meta', 'proof', 'other']] = explain_contextbranch_raw[['outer', 'theorem', 'meta', 'proof', 'other']].astype(int)
# raw_wordcount puts the totals in the last row; address it by position so the
# label lands there regardless of how many branches the data contains.
explain_contextbranch_raw.loc[explain_contextbranch_raw.index[-1],'mainmathcat'] = 'Total'
explain_contextbranch_raw.to_latex(path+'/tex/explain_contextbranch_raw'+timestr+'.tex', index = False)

explain_context_raw = raw_wordcount_context(explanation_df)
explain_context_raw.to_feather(path+'/explanation-data/explain_context_raw'+timestr+'.feather')
explain_context_raw.to_latex(path+'/tex/explain_context_raw'+timestr+'.tex')

explain_branch_raw = raw_wordcount_branch(explanation_df)
explain_branch_raw.to_feather(path+'/explanation-data/explain_branch_raw'+timestr+'.feather')
explain_branch_raw.to_latex(path+'/tex/explain_branch_raw'+timestr+'.tex')

# freq_wordcount and freq_wordcount_branch need the word-count database as
# their second argument.
explain_contextbranch_freq = freq_wordcount(explanation_df, df_arxiv)
explain_contextbranch_freq.to_feather(path+'/explanation-data/explain_contextbranch_freq'+timestr+'.feather')
explain_contextbranch_freq.to_latex(path+'/tex/explain_contextbranch_freq'+timestr+'.tex')

explain_context_freq = freq_wordcount_context(explanation_df)
explain_context_freq.to_feather(path+'/explanation-data/explain_context_freq'+timestr+'.feather')
explain_context_freq.to_latex(path+'/tex/explain_context_freq'+timestr+'.tex')

explain_branch_freq = freq_wordcount_branch(explanation_df, df_arxiv)
explain_branch_freq.to_feather(path+'/explanation-data/explain_branch_freq'+timestr+'.feather')
explain_branch_freq.to_latex(path+'/tex/explain_branch_freq'+timestr+'.tex')

#### Better context table with total and frequency

In [9]:
# Stack the raw per-context counts on top of the per-million frequencies so
# both appear in one two-row table.
explain_context_freq_total = pd.concat([raw_wordcount_context(explanation_df), 
            freq_wordcount_context(explanation_df)])
# Label the two rows: 'Total' is the raw count row, 'Frequency' the
# per-million row.
explain_context_freq_total.insert(0, 'method',['Total','Frequency'])
explain_context_freq_total = explain_context_freq_total.set_index('method').round(2)
explain_context_freq_total.to_latex(path+'/tex/explain_context_count_and_freq'+timestr+'.tex')

### Statistical tests 

#### p-values across all branches.

In [6]:
# Reload previously saved count tables from disk: for each glob, the file with
# the largest os.path.getctime is read. max() raises ValueError if nothing
# matches.
LatestExplanationContextBranchRawFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/explanation-data/explain_contextbranch_raw*'),key=os.path.getctime)
explain_contextbranch_raw = pd.read_feather(LatestExplanationContextBranchRawFile)

LatestExplanationBranchRawFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/explanation-data/explain_branch_raw*'),key=os.path.getctime)
explain_branch_raw = pd.read_feather(LatestExplanationBranchRawFile)

In [22]:
# Pairwise significance tests between branches.
#
# Step 1: total number of words per branch (summed over all contexts) from the
# arXiv database, for every branch that occurs in explanation_df.
contexts = ['outer','theorem', 'meta', 'proof', 'other']
result = []
for i in explanation_df['mainmathcat'].unique():
    branch_count = df_arxiv.loc[df_arxiv['mainmathcat'] == i, contexts].sum().sum()
    result.append({'mainmathcat': i, 'count': branch_count})
branch_count_df = pd.DataFrame(result, columns=['mainmathcat', 'count'])

# Step 2: one row per branch with its focus-word count and its word total.
branch_merged_df = pd.merge(explain_branch_raw,branch_count_df, on = ['mainmathcat'])
branches = branch_merged_df['mainmathcat'].unique().tolist()

# Step 3: per branch, the contingency row [focus words, all other words].
# Computed once here instead of once per pair inside the double loop below.
# .item() raises if a branch does not map to exactly one merged row.
contingency_rows = {}
for i in branches:
    branch_row = branch_merged_df[branch_merged_df['mainmathcat'] == i]
    focus = branch_row['focuswords'].item()
    contingency_rows[i] = [focus, branch_row['count'].item() - focus]

# Step 4: log-likelihood (G) test on the 2x2 table of every ordered pair of
# branches, without continuity correction. The diagonal compares a branch
# with itself.
result = []
for i in branches:
    a = contingency_rows[i]
    pvalues = []
    for j in branches:
        b = contingency_rows[j]
        chi2, pvalue, dof, ex = stats.chi2_contingency([a,b], correction=False, lambda_="log-likelihood")
        pvalues.append(pvalue)
    result.append(pvalues)

# Saves the p-value matrix as a png, with values <= 0.05 (after rounding to
# four decimals) shown in red.
pvalues_df = pd.DataFrame(result, columns = branches, index = branches)
pvalues_df = pvalues_df.round(4)
pvalues_styler = pvalues_df.style
# Styler.applymap was renamed to Styler.map in newer pandas; use whichever
# this installation provides.
color_cells = getattr(pvalues_styler, 'map', None) or pvalues_styler.applymap
pvaluesred_df = color_cells(lambda x: 'color: red' if x <= 0.05 else 'color: black')
dfi.export(pvaluesred_df, path+'/figures/pvalues_df.png',max_cols = -1)

In [25]:
# Groups of branches for which a p-value sub-table is wanted.
alex_jon_categories = [['math.NT'],['math.CT'],['math.AG'],['math.CO'],['math.CV'],
                       ['math.MP'],['math.PR'],['math.AP','math.CA','math.DS','math.SG'],
                       ['math.KT','math.GT','math.AT'],['math.HO','math.GM'],
                       ['math.OC','math.IT', 'math.NA','math.ST'],
                       ['math.OC','math.IT', 'math.NA','math.ST','math.MP'],
                       ['math.MG','math.DG'],['math.FA','math.SP','math.OA'],
                       ['math.GN','math.LO'],['math.RT','math.GR','math.RA','math.AC','math.QA']]
# The first seven groups are single branches and are skipped; only groups from
# index 7 onwards get a table.
for i in alex_jon_categories[7:]:
    # Square sub-matrix of p-values restricted to the group (columns, then rows).
    df = pvalues_df[i].loc[i]
    # NOTE(review): str(i) puts the list repr (brackets, quotes, commas) into
    # the file name — confirm this is intended.
    df.to_latex(path+'/tex/pvalue'+str(i)+timestr+'.tex')


In [26]:
# p-value sub-table for one hand-picked set of branches: the same list selects
# both the columns and the rows, giving a square table.
quant_cat = ['math.AT','math.CT','math.DS','math.FA','math.PR']
# The column selector must be quant_cat as well; the name used before
# (quali_cat) is never defined.
df = pvalues_df[quant_cat].loc[quant_cat]
df.to_latex(path+'/tex/pvalue'+str(quant_cat)+timestr+'.tex')

Unnamed: 0,math.AT,math.CT,math.DS,math.FA,math.PR
math.AT,1.0,0.0009,0.0,0.0,0.0
math.CT,0.0009,1.0,0.0158,0.0,0.694
math.DS,0.0,0.0158,1.0,0.0,0.0
math.FA,0.0,0.0,0.0,1.0,0.0
math.PR,0.0,0.694,0.0,0.0,1.0


## Ramos Experiments

In [9]:
# Load the three Ramos data sets. For each glob the file with the largest
# os.path.getctime is read; max() raises ValueError if nothing matches.
LatestRamosExplanationFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/explain_ramos_fullraw*'),key=os.path.getctime)
ramos_explanation_df = pd.read_feather(LatestRamosExplanationFile)

LatestRamosFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/ramos2009a_arxiv_contexts*'),key=os.path.getctime)
df_ramos_contexts = pd.read_feather(LatestRamosFile)

LatestRamosDatabaseFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/ramos2009a_arxiv_extended_database*'),key=os.path.getctime)
df_arxiv_ramos = pd.read_feather(LatestRamosDatabaseFile)

In [25]:
# Raw per-branch focus-word counts for the Ramos data, saved as feather and as
# a LaTeX table.
ramos_explain_branch_raw = raw_wordcount_branch(ramos_explanation_df)
ramos_explain_branch_raw.to_feather(path+'/explanation-data/ramos_explain_branch_raw'+timestr+'.feather')
ramos_explain_branch_raw.to_latex(path+'/tex/ramos_explain_branch_raw'+timestr+'.tex')

##### Sparsely documented ad hoc code for testing specific p-values, e.g. using the Pease et al. key

In [158]:
# Word total per context in the Ramos database, summed over the branches that
# occur in explanation_df (note: explanation_df, not ramos_explanation_df).
contexts = ['outer','theorem', 'meta', 'proof', 'other']
result = []
for j in contexts:
    branch_count = 0
    for i in explanation_df['mainmathcat'].unique():
        branch_count += df_arxiv_ramos.loc[(df_arxiv_ramos['mainmathcat'] == i)][j].sum()
    result.append({'context': j, 'count': branch_count})
# Bare expression; it only displays something when it is the last statement
# of a notebook cell.
result
# NOTE(review): the five literals are presumably the per-context counts
# computed above, typed in by hand — confirm.
print(43162407+4089833+2442936+10369089+4033801)

64098066


In [None]:
# Compare two branches with a 2x2 table. Relies on branch_merged_df from the
# p-value cell (columns 'focuswords' and 'count').
i,j = ['math.HO','math.GM']

# Row for branch i: [focus words, all other words].
a = [branch_merged_df[branch_merged_df['mainmathcat'] == i]['focuswords'].item(),
     branch_merged_df[branch_merged_df['mainmathcat'] == i]['count'].item()
     -branch_merged_df[branch_merged_df['mainmathcat'] == i]['focuswords'].item()]

# Row for branch j, same layout.
b = [branch_merged_df[branch_merged_df['mainmathcat'] == j]['focuswords'].item(),
     branch_merged_df[branch_merged_df['mainmathcat'] == j]['count'].item()
     -branch_merged_df[branch_merged_df['mainmathcat'] == j]['focuswords'].item()]

cont_tbl = [a,b]
# statsmodels 2x2 contingency-table analysis; print its summary table.
t22 = sm.stats.Table2x2(cont_tbl)
print(t22.summary())

In [160]:
# Display the Ramos database frame (notebook cell output).
df_arxiv_ramos

Unnamed: 0,id,mainmathcat,maincat,outer,theorem,meta,proof,other
0,0901.0015,math.IT,math.IT,2140,585,305,706,125
1,0901.0019,math.DG,math.DG,5105,109,33,36,127
2,0901.0020,math.QA,math.QA,9037,496,150,334,292
3,0901.0021,math.AG,math.AG,2299,225,26,1860,56
4,0901.0022,math.AP,math.AP,5212,436,16,618,66
...,...,...,...,...,...,...,...,...
9681,0908.2751,math.RA,math.RA,4861,683,51,0,51
9682,0908.2942,math.AP,math.AP,7214,0,0,0,3327
9683,0908.3174,math.CO,math.CO,2696,391,636,1012,262
9684,0908.3636,math.NA,math.NA,1433,0,0,0,147


In [None]:
# Load the Pease raw file: the match with the largest os.path.getctime.
# max() raises ValueError if the glob matches nothing.
LatestFullRawPease = max(glob.iglob(path+'/pease-data/pease_fullraw*'), key=os.path.getctime)
pease_df = pd.read_feather(LatestFullRawPease)

In [37]:
# Convert per-million frequencies back to counts: frequency / 10**6 * words.
# NOTE(review): the literals are hard-coded; their source is not shown here.
outer = 2721.133298/10**6*56794590
meta = 4105.898081/10**6*4254638
# proof is computed but not used below.
proof = 8009.22507/10**6*19417956
# Combined per-million frequency of outer and meta taken together.
print((outer+meta)/(56794590+4254638) *10**6)
cat1, cat2 = 'math.HO','math.GM'
branch_count1 = 0
branch_count2 = 0
# Word totals of the two branches over all contexts; `contexts` must already
# be defined by an earlier cell.
for j in contexts:
    branch_count1 += df_arxiv.loc[(df_arxiv['mainmathcat'] == cat1)][j].sum()
    branch_count2 += df_arxiv.loc[(df_arxiv['mainmathcat'] == cat2)][j].sum()
# Rows are [focus-word rows, total words] per branch.
# NOTE(review): other cells use [focus words, total - focus words] as the
# second entry — confirm which layout is intended here.
explanation_tuple = [[len(explanation_df[explanation_df.mainmathcat==cat1]),branch_count1],
[len(explanation_df[explanation_df.mainmathcat==cat2]),branch_count2]]

2817.640216430051


In [170]:
# Ad hoc 2x2 comparisons with hand-entered numbers.
# NOTE(review): rows look like [focus-word count, word total]; the second
# entry is not reduced by the first as in the branch p-value cell — confirm.
temp_tuple = [[8593,64098066],[11291,94923935]]
print(sm.stats.Table2x2(temp_tuple).summary())
# Gap between the sum of two hand-entered word totals and 64098066.
print('difference words',30892695+31289569-64098066)
ramos_expla = [4970+5087,30892695+31289569]
my_expla = [11291,94923935]
# Bare expression; no effect unless it is the last statement of a cell.
[ramos_expla,my_expla]
print(sm.stats.Table2x2([ramos_expla,my_expla]).summary())
# Result is discarded (not printed and not the cell's last expression).
sm.stats.Table2x2([[156,209],[113,252]]).summary()
from scipy.stats import chi2_contingency
# Pearson chi-square test without continuity correction on the same small
# table; p is the cell's displayed value.
chi2, p, dof, ex = chi2_contingency([[156,209],[113,252]], correction = False)
p

               Estimate   SE   LCB   UCB  p-value
-------------------------------------------------
Odds ratio        1.127       1.096 1.159   0.000
Log odds ratio    0.120 0.014 0.092 0.148   0.000
Risk ratio        1.127       1.096 1.159   0.000
Log risk ratio    0.120 0.014 0.092 0.148   0.000
-------------------------------------------------
difference words -1915802
               Estimate   SE   LCB   UCB  p-value
-------------------------------------------------
Odds ratio        1.360       1.324 1.397   0.000
Log odds ratio    0.307 0.014 0.280 0.334   0.000
Risk ratio        1.360       1.324 1.397   0.000
Log risk ratio    0.307 0.014 0.280 0.334   0.000
-------------------------------------------------


0.0009697487127103603

In [44]:
# Check context percentages
# Grand total of words in df_arxiv over all five context columns.
total_my = df_arxiv[['outer','theorem','meta','proof','other']].sum().sum()

outer pct: 0.5983168523302368
outer pct: 0.6733808006001304
theorem pct: 0.0651204461761936
theorem pct: 0.06380587208356646
meta pct: 0.04482155106612468
meta pct: 0.03811247596768364
proof pct: 0.20456332746846198
proof pct: 0.1617691398052478
other pct: 0.0871778229589829
other pct: 0.06293171154337168


126.9057747369476

In [89]:
# Count the ids in df_arxiv that contain "20MM." for MM = 01..12 and print
# each count followed by the pattern.
# NOTE(review): presumably ids have the form "YYMM.number", so this counts
# papers per month of 2020 — confirm.
for i in range(1,13):
    # Zero-padded month, e.g. '2005.' for i == 5 and '2010.' for i == 10.
    id_pattern = '20' + str(i).zfill(2) + '.'
    # regex=False makes the dot match literally; as a regex it matched any
    # character, so '2005.' would also hit ids that merely contain '2005'
    # followed by another digit.
    print(df_arxiv['id'].str.contains(id_pattern, regex=False).sum(), id_pattern)

0 2001.
0 2002.
0 2003.
0 2004.
2084 2005.
2306 2006.
2263 2007.
2010 2008.
2278 2009.
2049 2010.
0 2011.
0 2012.
