In [23]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import nltk
import glob
import os
import time
from tqdm.notebook import tqdm

import spacy
nlp = spacy.load("en_core_web_sm")
# prefer_gpu() https://spacy.io/api/top-level#spacy.prefer_gpu?

path = '/Users/Svesketerning/Google-Drev/experiments'
timestr = time.strftime("%Y%m%d-%H%M")

In [2]:
def freq_wordcount_branch(focusword_df):
    result = []
    contexts = ['outer','theorem', 'meta', 'proof', 'other'] # Non functional!
    for i in focusword_df['mainmathcat'].unique():
        values_context = []
        a = len(focusword_df.loc[(focusword_df['mainmathcat'] == i)])
        b = df_arxiv.loc[(df_arxiv['mainmathcat'] == i)][['outer','theorem','meta',
                                                               'proof','other']].sum().sum()
        if b>0:
            c = a/b * 10**6
        else:
            c = None
        result.append({'mainmathcat': i, 'focuswords': c})
    df = pd.concat([pd.DataFrame(result[i], index=[0]) for i in range(len(result))])
    df = df.reset_index()
    df = df.drop('index', axis = 1)
    return df
#from explain_words_extraction.ipynb import word_extract
def word_extract(dataframe_context,focus_words):
    result = []
    #contexts = [x for x in dataframe_context.columns]
    contexts = ['outer', 'theorem', 'meta', 'proof', 'other']
    for index in tqdm(range(len(dataframe_context.index))):
        for c in contexts:
            sentences = nltk.sent_tokenize(dataframe_context[c][index])
            for sent in sentences:
                sent = sent.lower()
                overlap = [x for x in focus_words if x in sent]
                if len(overlap)>0:
                    arc_id = df_contexts['id'][index]
                    result.append({'id':arc_id, # NOTE THAT YOU HAVE TO LOAD ARXIV DATABASE FILE AS df_arxiv!!
                                   'mainmathcat': df_arxiv.loc[df_arxiv['id'] == arc_id]['mainmathcat'].item(),
                                   'context': c, 'sentence': sent, 'focuswords': ",".join(overlap)})
    df = pd.DataFrame(result)
    df = df.set_index('id')
    return df 
# From explanations-theorems-proofs - extract rows with sentences that have a focusword
def sentence_extract(df,focuswords, contexts = None, remove_focuswords = False):
    if contexts is not None: # Focus on context
        df = df[df['context'].isin(contexts)]
    for i in tqdm(df.index):
        sent = df.loc[i,'sentence']
        tokenize_sent = nltk.word_tokenize(sent)
        if any(x in focuswords for x in tokenize_sent) is remove_focuswords:
            df = df.drop(index=i)
    return df 
def adj_nouns(df, focus_noun, contexts = None): # Looks for adjectives around a noun in "noun"_raw-df files
    result = []
    df = df.reset_index()
    if contexts is not None: # Focus on context
        df = df[df['context'].isin(contexts)]
    for i in tqdm(df.index):
        doc = nlp(df.loc[i,'sentence'])
        for chunk in doc.noun_chunks:
            if chunk.root.text in focus_noun:
                for token in chunk:
                    if token.pos_ in 'ADJ':
                        result.append({'id':df.loc[i,'id'],'mainmathcat': df.loc[i,'mainmathcat'],
                                       'context': df.loc[i,'context'], 'sentence': df.loc[i,'sentence'],
                                       'focuswords': df.loc[i,'focuswords'],'adjective': token.text})
    df = pd.concat([pd.DataFrame(result[i], index=[0]) for i in range(len(result))])
    #df = df.set_index('id')
    return df

In [3]:
# Taken from explain words extraction
# Load latest context and ArXiv database file
LatestContextFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/arxiv_contexts*'),key=os.path.getctime)
df_contexts = pd.read_feather(LatestContextFile)

LatestDatabaseFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/arxiv_extended_database*'),key=os.path.getctime)
df_arxiv = pd.read_feather(LatestDatabaseFile)

### Looks at adjectives connected to proofs 
#### Using the adj_nouns function

In [4]:
# Creates and saves the proof_fullraw in database-files
proof_word = ['proof']
proof_raw_df = word_extract(df_contexts,proof_word) # Computational heavy
proof_raw_df = proof_raw_df.reset_index()
proof_raw_df.to_feather(path+'/database-files/proof_fullraw'+timestr+'.feather')

# Creates and saves the understa_fullraw in database-files
#understa_raw_df = word_extract(df_contexts,['understa']) 
#understa_raw_df = understa_raw_df.reset_index()
#understa_raw_df = proof_raw_df.reset_index()
#understa_raw_df.to_feather(path+'/database-files/understa_fullraw'+timestr+'.feather')


HBox(children=(FloatProgress(value=0.0, max=12990.0), HTML(value='')))




In [5]:
# Load latest understa and proof file
LatestUnderstaFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/understa_fullraw*'),key=os.path.getctime)
understa_raw_df = pd.read_feather(LatestUnderstaFile)

LatestProofFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/proof_fullraw*'),key=os.path.getctime)
proof_raw_df = pd.read_feather(LatestProofFile)

In [39]:
proofadj_df = adj_nouns(proof_raw_df,contexts = ['outer','meta','other'],focus_noun = ['proof'])

HBox(children=(FloatProgress(value=0.0, max=87956.0), HTML(value='')))




In [46]:
#list(proofadj_df[proofadj_df['adjective'].isin(['theoretic','theoretical'])].sentence)


In [6]:
# Using adj_nouns and makes a dataframe of counts. 
proofadj_df = adj_nouns(proof_raw_df,contexts = ['outer','meta','other'],focus_noun = ['proof'])
count_adj = proofadj_df['adjective'].value_counts().to_frame()
count_adj.index.name = 'adjective'
count_adj = count_adj.rename(columns={'adjective': 'count'})
count_adj['pct'] = ((count_adj['count'] / count_adj['count'].sum())*100).round(2)
count_adj = count_adj.reset_index()
count_adj.to_feather(path+'/armchair/count_pct_adj_df'+timestr+'.feather')

HBox(children=(FloatProgress(value=0.0, max=87956.0), HTML(value='')))




In [47]:
LatestCountAdjectiveFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/armchair/count_pct_adj_df*'),key=os.path.getctime)
count_adj = pd.read_feather(LatestCountAdjectiveFile)

# Picked from criteria count>=20. - HARDCODED, BUT ALSO CATEGORIES CHOSEN BY HAND
meta = ['new', 'alternative', 'above', 'same', 'different', 'original', 
 'first', 'previous', 'similar', 'alternate', 'second', 'following', 'classical', 'standard', 
 'such','main', 'actual', 'whole']

how_adj = ['direct','complete','full', 'formal','constructive', 
           'independent', 'inductive','bijective', 'uniqueness','theoretical', 'theoretic']


what_adj = ['simple','short','detailed', 'elementary','rigorous','simpler',
            'combinatorial','geometric', 'algebraic','shorter', 'easy', 'general', 
            'quick', 'analytic','simplified', 'elegant', 'unified','technical',
            'straightforward', 'explicit','brief', 'probabilistic', 
            'mathematical']


a = count_adj.loc[count_adj['adjective'].isin(meta)].sum(numeric_only = True).to_frame(name = 'Meta adjectives').transpose()
b = count_adj.loc[count_adj['adjective'].isin(how_adj)].sum(numeric_only = True).to_frame(name = 'Objective adjectives').transpose()
c = count_adj.loc[count_adj['adjective'].isin(what_adj)].sum(numeric_only = True).to_frame(name = 'Subjective adjectives').transpose()
adj_compund = a.append([b,c])
adj_compund.drop('pct', 1, inplace = True)
total = adj_compund['count'].sum()
adj_compund['Percentage'] = [adj_compund['count'][0]/total*100,
                      adj_compund['count'][1]/total*100,
                      adj_compund['count'][2]/total*100]
adj_compund['Percentage'] = adj_compund['Percentage'].round(1)


# Saves both as tables in /tex folder
count_adj.to_latex(path+'/tex/count_adjectives_freq'+timestr+'.tex')
adj_compund.to_latex(path+'/tex/adjectives_classified_freq'+timestr+'.tex')
# Saves both of them to /armchair folder as dataframes
adj_compund = adj_compund.reset_index()

adj_compund.to_feather(path+'/armchair/count_pct_categorized_adj_df'+timestr+'.feather')

In [6]:
#count_adj = count_adj.set_index('adjective')
f1 = ['ingenious', 'striking', 'inspired', 'creative', # Aesthetic
     'beautiful', 'profound', 'elegant','deep',
     'innovative','sublime','charming','clever','appealing',
     'pleasing','enlightning','insightful','bold','strong',
     'subtle','delicate','ambitous','cute','sharp']
f2 = ['practical','informative','applicable','intuitive',  #Usefullness
     'natural','illustrative','efficient','useful',
     'explanatory','conceptual','plausible','fruitful']
f3 = ['difficult','dense','intricate','confusing', # Intricacy
     'tedious','elaborate','simple','abstract','non-trivial'] 
f4 = ['precise','careful','rigorous','accurate','clear',
     'meticulous','polished','unambigous'] # Precision 

data = {'Factors': ['Aesthetic', 'Usefulness', 'Intricacy', 'Precision'],
        'Counts': [count_adj[count_adj['adjective'].isin(f1)]['count'].sum(),
                   count_adj[count_adj['adjective'].isin(f2)]['count'].sum(),
                   count_adj[count_adj['adjective'].isin(f3)]['count'].sum(),
                   count_adj[count_adj['adjective'].isin(f4)]['count'].sum()]
       }
ramos_comparison_df = pd.DataFrame.from_dict(data)
ramos_comparison_df.to_latex(path+'/tex/ramos_comparison'+timestr+'.tex', index = False)
ramos_comparison_df

Unnamed: 0,Factors,Counts
0,Aesthetic,73
1,Usefulness,39
2,Intricacy,481
3,Precision,218


In [24]:
intricacy =['simple', 'short', 'elementary', 'simpler', 
 'easy', 'quick', 'shorter', 'simplified', 'straightforward', 'brief']
aesthetic = ['elegant', 'technical', 'geometric', 'algebraic', 
             'analytic', 'combinatorial', 'probabilistic', 'mathematical'] 
precision = ['detailed', 'rigorous', 'formal', 'general', 'explicit']
usefulness = ['unified']
data = {'Factors': ['Aesthetic', 'Intricacy', 'Precision', 'Usefulness'],
        'Counts': [count_adj[count_adj['adjective'].isin(aesthetic)]['count'].sum(),
                   count_adj[count_adj['adjective'].isin(intricacy)]['count'].sum(),
                   count_adj[count_adj['adjective'].isin(precision)]['count'].sum(),
                  count_adj[count_adj['adjective'].isin(usefulness)]['count'].sum()]
       }
ramos_comparison_df2 = pd.DataFrame.from_dict(data)
ramos_comparison_df2.to_latex(path+'/tex/ramos_comparison2'+timestr+'.tex', index = False)
ramos_comparison_df2

Unnamed: 0,Factors,Counts
0,Aesthetic,474
1,Intricacy,1621
2,Precision,755
3,Usefulness,34


### Is it proofs or figures/diagrams that provides understanding?

In [25]:
# Load explanation_df
LatestExplanationFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/explain_fullraw*'),key=os.path.getctime)
explanation_df = pd.read_feather(LatestExplanationFile)


In [26]:
# Times figure and proof are mentioned in explanation contexts
figure_sent_df = sentence_extract(explanation_df, focuswords = ['figure','figures'],
                                   contexts = None, remove_focuswords = False)
proof_sent_df = sentence_extract(explanation_df, focuswords = ['proof','proofs'], 
                                    contexts = None, remove_focuswords = False)

HBox(children=(FloatProgress(value=0.0, max=11291.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11291.0), HTML(value='')))




In [71]:
#figure_raw_df = word_extract(df_contexts,['figure']) # All the times figure is mentioned - in order to normalize
#proof_only_raw_df = word_extract(df_contexts,['proof'])


proof_only_raw_df = proof_only_raw_df.reset_index()
figure_raw_df = figure_raw_df.reset_index()
proof_only_raw_df.to_feather(path+'/database-files/proof_only_fullraw'+timestr+'.feather') # save as feather
figure_raw_df.to_feather(path+'/database-files/figure_fullraw'+timestr+'.feather')

In [27]:
LatestProofONLYFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/proof_only_fullraw*'),key=os.path.getctime)
proof_only_raw_df = pd.read_feather(LatestProofONLYFile)

LatesFigureFile = max(glob.iglob('/Users/Svesketerning/Google-Drev/experiments/database-files/figure_fullraw*'),key=os.path.getctime)
figure_raw_df = pd.read_feather(LatesFigureFile)

#### We now have everything we need - times proof/figure is mentioned in expla/understa contexts and how many times they are used in total. We can see it is more often authors uses a explanation word in regards to a proof than to a figure, but nothing is significant.

In [28]:
figure_mention = len(figure_raw_df)
proof_mention = len(proof_only_raw_df) # How many times proof and proofs are used - in order to normalize
figure_explanation = len(figure_sent_df)
proof_explanation = len(proof_sent_df)
# Everything we need

In [34]:
# G-test
cont_tbl = [[proof_explanation,proof_mention],[figure_explanation,figure_mention]]
t22 = sm.stats.Table2x2(cont_tbl)
print(t22.summary(),proof_explanation/proof_mention*100,figure_explanation/figure_mention*100)

               Estimate   SE   LCB    UCB  p-value
--------------------------------------------------
Odds ratio        1.165        0.979 1.386   0.085
Log odds ratio    0.153 0.089 -0.021 0.327   0.085
Risk ratio        1.164        0.979 1.384   0.085
Log risk ratio    0.152 0.088 -0.021 0.325   0.085
-------------------------------------------------- 0.5434362323629283 0.466434973477227


### Use of Obvious - Random

In [320]:
obvious_raw_df = word_extract(df_contexts,['obvious'])
obvious_freq_df = freq_wordcount_branch(obvious_raw_df)

obvious_raw_df = word_extract(df_contexts,['obvious'])
obvious_freq_df = freq_wordcount_branch(obvious_raw_df)
pd.set_option('display.max_rows', 40)
obvious_freq_df.sort_values('focuswords')

Unnamed: 0,mainmathcat,focuswords
17,math.MG,83.690412
0,math.PR,93.221894
4,math.AP,94.814024
10,math.GT,95.132317
25,math.NA,95.837193
1,math.OC,111.919972
7,math.RT,112.144805
11,math.HO,112.585847
2,math.ST,113.373482
12,math.DS,116.093658
