# Base Experiments 

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob
import os
import time
import dataframe_image as dfi
path = '/Users/Svesketerning/Google-Drev/experiments'
timestr = time.strftime("%Y%m%d-%H%M")

## Functions for counts by context, by branch (mainmathcat), and by context and branch

### Context 

In [2]:
def raw_wordcount_context(focusword_df):
    """Count the rows of *focusword_df* that fall in each context.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Table with a ``context`` column. Every row is counted once
        (presumably one row per focus-word occurrence -- confirm against
        the code that produces the table).

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with one count column per context:
        ``outer``, ``theorem``, ``meta``, ``proof`` and ``other``.
        Rows whose context is none of these are not counted.
    """
    context_names = ['outer', 'theorem', 'meta', 'proof', 'other']
    counts = {
        name: len(focusword_df.loc[focusword_df['context'] == name])
        for name in context_names
    }
    return pd.DataFrame(counts, index=[0])
def freq_wordcount_context(focusword_df, arxiv_df=None):
    """Return the focus-word frequency per million words for each context.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Table with a ``context`` column; every row counts as one hit.
    arxiv_df : pandas.DataFrame, optional
        Reference table with one numeric column per context; the column
        sum is the denominator (presumably word counts per paper --
        confirm). Defaults to the module-level ``df_arxiv``.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with the columns ``outer``,
        ``theorem``, ``meta``, ``proof`` and ``other``. A context whose
        denominator is not positive yields ``NaN``.
    """
    if arxiv_df is None:
        # Keep the notebook's original behaviour: fall back to the global table.
        arxiv_df = df_arxiv
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    frequencies = {}
    for context in contexts:
        hits = len(focusword_df.loc[focusword_df['context'] == context])
        words = arxiv_df[context].sum()
        # NaN (not None) keeps the column numeric so later rounding works.
        frequencies[context] = hits / words * 10**6 if words > 0 else np.nan
    return pd.DataFrame(frequencies, index=[0])

### Branch

In [3]:
def raw_wordcount_branch(focusword_df):
    """Count the rows of *focusword_df* per branch (``mainmathcat``).

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Table with a ``mainmathcat`` column; every row counts as one hit.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``mainmathcat`` value, in order of first
        appearance, with the columns ``mainmathcat`` and ``focuswords``
        (the row count). An empty input gives an empty frame with the
        same two columns instead of raising.
    """
    branch_column = focusword_df['mainmathcat']
    rows = [
        {'mainmathcat': branch,
         'focuswords': int((branch_column == branch).sum())}
        for branch in branch_column.unique()
    ]
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=['mainmathcat', 'focuswords'])
def freq_wordcount_branch(focusword_df, arxiv_df=None):
    """Return the focus-word frequency per million words for each branch.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Table with a ``mainmathcat`` column; every row counts as one hit.
    arxiv_df : pandas.DataFrame, optional
        Reference table with a ``mainmathcat`` column and one numeric
        column per context (``outer``, ``theorem``, ``meta``, ``proof``,
        ``other``); their combined sum per branch is the denominator
        (presumably word counts -- confirm). Defaults to the module-level
        ``df_arxiv``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``mainmathcat`` value of *focusword_df*, in
        order of first appearance, with the columns ``mainmathcat`` and
        ``focuswords`` (hits per million). A branch whose denominator is
        not positive yields ``NaN``. An empty input gives an empty frame
        with the same two columns instead of raising.
    """
    if arxiv_df is None:
        # Keep the notebook's original behaviour: fall back to the global table.
        arxiv_df = df_arxiv
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    rows = []
    for branch in focusword_df['mainmathcat'].unique():
        hits = (focusword_df['mainmathcat'] == branch).sum()
        # Sum over the branch's rows first, then across all context columns.
        words = arxiv_df.loc[arxiv_df['mainmathcat'] == branch, contexts].sum().sum()
        frequency = hits / words * 10**6 if words > 0 else np.nan
        rows.append({'mainmathcat': branch, 'focuswords': frequency})
    return pd.DataFrame(rows, columns=['mainmathcat', 'focuswords'])

### Context & Branch 

In [4]:
# Counts the raw focusword count based on context and branch
def raw_wordcount(focusword_df):
    """Count the rows of *focusword_df* per branch and context.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Table with ``mainmathcat`` and ``context`` columns; every row
        counts as one hit.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``mainmathcat`` value (order of first
        appearance) with the columns ``mainmathcat``, ``outer``,
        ``theorem``, ``meta``, ``proof`` and ``other``, followed by a
        final row holding the column totals. The totals row has no
        branch label, so its ``mainmathcat`` is ``NaN``.
    """
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    rows = []
    for branch in focusword_df['mainmathcat'].unique():
        in_branch = focusword_df['mainmathcat'] == branch
        row = {'mainmathcat': branch}
        for context in contexts:
            in_context = focusword_df['context'] == context
            row[context] = int((in_branch & in_context).sum())
        rows.append(row)
    counts = pd.DataFrame(rows, columns=['mainmathcat'] + contexts)
    # DataFrame.append was removed in pandas 2.0; build the totals row as a
    # one-row frame and concatenate it instead.
    totals = counts[contexts].sum().to_frame().T
    return pd.concat([counts, totals], ignore_index=True)
# Counts the frequency (per million word) of focusword based on context and branch
def freq_wordcount(focusword_df, arxiv_df=None):
    """Return focus-word frequencies per million words by branch and context.

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        Table with ``mainmathcat`` and ``context`` columns; every row
        counts as one hit.
    arxiv_df : pandas.DataFrame, optional
        Reference table with a ``mainmathcat`` column and one numeric
        column per context (``outer``, ``theorem``, ``meta``, ``proof``,
        ``other``) whose sums are the denominators (presumably word
        counts -- confirm). Defaults to the module-level ``df_arxiv``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``mainmathcat`` value of *focusword_df*
        (order of first appearance) with the columns ``mainmathcat``,
        ``outer``, ``theorem``, ``meta``, ``proof`` and ``other``,
        followed by a ``'Total Freq.'`` row computed over all branches.
        Any cell whose denominator is not positive is ``NaN``.
    """
    if arxiv_df is None:
        # Keep the notebook's original behaviour: fall back to the global table.
        arxiv_df = df_arxiv
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    rows = []
    for branch in focusword_df['mainmathcat'].unique():
        branch_hits = focusword_df.loc[focusword_df['mainmathcat'] == branch, 'context']
        arxiv_in_branch = arxiv_df['mainmathcat'] == branch
        row = {'mainmathcat': branch}
        for context in contexts:
            hits = (branch_hits == context).sum()
            words = arxiv_df.loc[arxiv_in_branch, context].sum()
            row[context] = hits / words * 10**6 if words > 0 else np.nan
        rows.append(row)

    # The overall row must use the table passed in, not a notebook global,
    # and gets the same zero-denominator guard as the per-branch cells.
    total_row = {'mainmathcat': 'Total Freq.'}
    for context in contexts:
        hits = (focusword_df['context'] == context).sum()
        words = arxiv_df[context].sum()
        total_row[context] = hits / words * 10**6 if words > 0 else np.nan
    rows.append(total_row)

    # DataFrame.append was removed in pandas 2.0; build the frame in one go.
    return pd.DataFrame(rows, columns=['mainmathcat'] + contexts)

### Explanation base data, dataframes and Latex 

In [5]:
# Pick, for each input table, the matching file with the largest
# os.path.getctime value and load it.
LatestDatabaseFile = max(
    glob.glob('/Users/Svesketerning/Google-Drev/experiments/database-files/arxiv_extended_database*'),
    key=os.path.getctime,
)
df_arxiv = pd.read_feather(LatestDatabaseFile)

LatestExplanationFile = max(
    glob.glob('/Users/Svesketerning/Google-Drev/experiments/database-files/explain_fullraw*'),
    key=os.path.getctime,
)
explanation_df = pd.read_feather(LatestExplanationFile)

In [None]:
# Saves 6 feather files and 6 LaTeX tables
def _save_explanation_table(table, stem):
    """Write *table* as a timestamped feather file and LaTeX table.

    The files go to ``<path>/explanation-data/<stem><timestr>.feather``
    and ``<path>/tex/<stem><timestr>.tex``.
    """
    table.to_feather(path + '/explanation-data/' + stem + timestr + '.feather')
    table.to_latex(path + '/tex/' + stem + timestr + '.tex')


# Each table is saved right after it is computed, so a failure in a later
# computation still leaves the earlier files on disk.
explain_contextbranch_raw = raw_wordcount(explanation_df)
_save_explanation_table(explain_contextbranch_raw, 'explain_contextbranch_raw')

explain_context_raw = raw_wordcount_context(explanation_df)
_save_explanation_table(explain_context_raw, 'explain_context_raw')

explain_branch_raw = raw_wordcount_branch(explanation_df)
_save_explanation_table(explain_branch_raw, 'explain_branch_raw')

explain_contextbranch_freq = freq_wordcount(explanation_df)
_save_explanation_table(explain_contextbranch_freq, 'explain_contextbranch_freq')

explain_context_freq = freq_wordcount_context(explanation_df)
_save_explanation_table(explain_context_freq, 'explain_context_freq')

explain_branch_freq = freq_wordcount_branch(explanation_df)
_save_explanation_table(explain_branch_freq, 'explain_branch_freq')

#### Better context table with total and frequency

In [71]:
# Stack the raw per-context counts on top of the per-million frequencies,
# label the two rows through a 'method' index and round to two decimals.
explain_context_freq_total = (
    pd.concat([
        raw_wordcount_context(explanation_df),
        freq_wordcount_context(explanation_df),
    ])
    .assign(method=['Total', 'Frequency'])
    .set_index('method')
    .round(2)
)
explain_context_freq_total.to_latex(
    path + '/tex/explain_context_count_and_freq' + timestr + '.tex'
)

### Statistical tests 

#### p-values across all branches.

In [None]:
# Reload the saved raw-count tables: for each, take the matching file with
# the largest os.path.getctime value.
LatestExplanationContextBranchRawFile = max(
    glob.glob('/Users/Svesketerning/Google-Drev/experiments/explanation-data/explain_contextbranch_raw*'),
    key=os.path.getctime,
)
explain_contextbranch_raw = pd.read_feather(LatestExplanationContextBranchRawFile)

LatestExplanationBranchRawFile = max(
    glob.glob('/Users/Svesketerning/Google-Drev/experiments/explanation-data/explain_branch_raw*'),
    key=os.path.getctime,
)
explain_branch_raw = pd.read_feather(LatestExplanationBranchRawFile)
# Bare expression: displays the table when run as a notebook cell.
explain_branch_raw

In [None]:
# Counting words in each branch, sourced from above functions
contexts = ['outer','theorem', 'meta', 'proof', 'other']
result = []
for i in explanation_df['mainmathcat'].unique():
    branch_count = 0
    for j in contexts:
        branch_count += df_arxiv.loc[(df_arxiv['mainmathcat'] == i)][j].sum()
    result.append({'mainmathcat': i, 'count': branch_count})
branch_count_df = pd.concat([pd.DataFrame(result[i], index=['mainmathcat']) for i in range(len(result))])
branch_count_df 
explain_branch_raw
branch_merged_df = pd.merge(explain_branch_raw,branch_count_df, on = ['mainmathcat'])
result = []
for i in branch_merged_df['mainmathcat'].unique():
    a = [branch_merged_df[branch_merged_df['mainmathcat'] == i]['focuswords'].item(),
             branch_merged_df[branch_merged_df['mainmathcat'] == i]['count'].item()]
    pvalues = []
    for j in branch_merged_df['mainmathcat'].unique():
        b = [branch_merged_df[branch_merged_df['mainmathcat'] == j]['focuswords'].item(),
             branch_merged_df[branch_merged_df['mainmathcat'] == j]['count'].item()]
        oddsratio, pvalue = stats.fisher_exact([a,b])
        pvalues.append(pvalue)
    result.append(pvalues)
    
# Saves p-value thingy as png 
pvalues_df = pd.DataFrame(result,
             columns = branch_merged_df['mainmathcat'].unique().tolist(),
             index = branch_merged_df['mainmathcat'].unique().tolist()) 
pvalues_df = pvalues_df.style.applymap(lambda x: 'color: red' if x <= 0.05 and x <=5 else 'color: black')
dfi.export(pvalues_df, path+'/figures/pvalues_df.png',max_cols = -1)