In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import nltk
import glob
import os
import time

import spacy
# Root folder of the experiment files (used below when saving results) and a
# minute-resolution timestamp that is appended to the names of saved files.
path = '/Users/Svesketerning/Google-Drev/experiments'
timestr = time.strftime("%Y%m%d-%H%M", time.localtime())

# spaCy English pipeline; adj_nouns() calls it on every sentence.
# prefer_gpu() https://spacy.io/api/top-level#spacy.prefer_gpu?
nlp = spacy.load("en_core_web_sm")

In [2]:
def freq_wordcount_branch(focusword_df, arxiv_df=None):
    """Return the focus-word rate per million for every main math category.

    For each distinct value of ``focusword_df['mainmathcat']`` the number of
    rows of ``focusword_df`` in that category is divided by the grand total of
    the 'outer', 'theorem', 'meta', 'proof' and 'other' columns over the
    ``arxiv_df`` rows of the same category, then multiplied by 10**6.
    (Presumably those five columns hold per-paper word counts - TODO confirm.)

    Parameters
    ----------
    focusword_df : pandas.DataFrame
        One row per focus-word hit; must have a 'mainmathcat' column.
    arxiv_df : pandas.DataFrame, optional
        Frame with 'mainmathcat' and the five context columns. When omitted,
        the notebook-level ``df_arxiv`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'mainmathcat' and 'focuswords' with a default integer index,
        in order of first appearance of each category. 'focuswords' is NaN
        when the category total is not positive. An empty input gives an empty
        frame with the same two columns instead of raising.
    """
    if arxiv_df is None:
        arxiv_df = df_arxiv  # notebook global loaded from the arXiv database file
    context_cols = ['outer', 'theorem', 'meta', 'proof', 'other']

    records = []
    for cat in focusword_df['mainmathcat'].unique():
        n_hits = int((focusword_df['mainmathcat'] == cat).sum())
        n_words = arxiv_df.loc[arxiv_df['mainmathcat'] == cat, context_cols].sum().sum()
        # A category absent from arxiv_df sums to 0; report it as missing.
        rate = n_hits / n_words * 10**6 if n_words > 0 else float('nan')
        records.append({'mainmathcat': cat, 'focuswords': rate})

    # Build the frame once; explicit columns keep the schema for empty input.
    return pd.DataFrame(records, columns=['mainmathcat', 'focuswords'])
#from explain_words_extraction.ipynb import word_extract
def word_extract(dataframe_context, focus_words, arxiv_df=None):
    """Collect every sentence that contains one of ``focus_words``.

    Each text column of ``dataframe_context`` (every column except 'id') is
    split into sentences with ``nltk.sent_tokenize``. A sentence is lowercased
    and kept when at least one focus word occurs in it as a *substring*, so
    'understa' matches 'understand' and 'understanding'. Focus words are not
    lowercased, so they have to be given in lowercase to match.

    Parameters
    ----------
    dataframe_context : pandas.DataFrame
        One row per paper: an 'id' column plus one text column per context.
    focus_words : list of str
        Substrings to look for.
    arxiv_df : pandas.DataFrame, optional
        Frame with 'id' and 'mainmathcat' columns. When omitted, the
        notebook-level ``df_arxiv`` is used, so that frame must be loaded.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'id' with columns 'mainmathcat', 'context', 'sentence'
        (lowercased) and 'focuswords' (comma-joined matches). Empty, with the
        same schema, when nothing matches.

    Raises
    ------
    ValueError
        From ``Series.item()`` when a paper with a hit does not have exactly
        one row in ``arxiv_df``.
    """
    if arxiv_df is None:
        arxiv_df = df_arxiv  # notebook global loaded from the arXiv database file

    # 'id' identifies the paper; it is not a text context to be searched.
    text_columns = [col for col in dataframe_context.columns if col != 'id']

    records = []
    for pos in range(len(dataframe_context)):
        # Read the id from the frame that was passed in, not from a global.
        arc_id = dataframe_context['id'].iloc[pos]
        mainmathcat = None  # looked up lazily: only papers with a hit need it
        for col in text_columns:
            text = dataframe_context[col].iloc[pos]
            if not isinstance(text, str):
                continue  # missing context text: nothing to tokenize
            for sent in nltk.sent_tokenize(text):
                sent = sent.lower()
                overlap = [word for word in focus_words if word in sent]
                if not overlap:
                    continue
                if mainmathcat is None:
                    mainmathcat = arxiv_df.loc[arxiv_df['id'] == arc_id, 'mainmathcat'].item()
                records.append({'id': arc_id, 'mainmathcat': mainmathcat,
                                'context': col, 'sentence': sent,
                                'focuswords': ",".join(overlap)})

    columns = ['id', 'mainmathcat', 'context', 'sentence', 'focuswords']
    return pd.DataFrame(records, columns=columns).set_index('id')
# From explanations-theorems-proofs - extract rows with sentences that have a focusword
def sentence_extract(df, focuswords, contexts = None, remove_focuswords = False):
    """Filter rows by whether their sentence contains a focus word as a token.

    Each value of ``df['sentence']`` is split with ``nltk.word_tokenize`` and
    counts as a match when any token is in ``focuswords`` (whole-token
    comparison, unlike the substring search in ``word_extract``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'sentence' column, and a 'context' column when ``contexts``
        is given.
    focuswords : list of str
        Tokens to look for.
    contexts : list of str, optional
        If given, only rows whose 'context' is in this list are considered.
    remove_focuswords : bool, default False
        False keeps the matching rows; True keeps the non-matching rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with their original index labels.
    """
    if contexts is not None: # Focus on context
        df = df[df['context'].isin(contexts)]

    # Normalise the flag so truthy values such as 1 behave like True; the
    # former identity test (`is`) silently ignored them.
    drop_matches = bool(remove_focuswords)

    # One boolean mask instead of dropping row by row, which copied the frame
    # on every drop.
    keep = [
        any(token in focuswords for token in nltk.word_tokenize(sent)) != drop_matches
        for sent in df['sentence']
    ]
    return df[pd.Series(keep, index=df.index, dtype=bool).to_numpy()]
def adj_nouns(df, focus_noun, contexts = None): # Looks for adjectives around a noun in "noun"_raw-df files
    """List the adjectives inside noun chunks whose root is a focus noun.

    Every sentence is parsed with the notebook-level spaCy pipeline ``nlp``.
    For each noun chunk whose root token text is in ``focus_noun``, one output
    row is produced per token of the chunk tagged as an adjective ('ADJ').

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns 'id', 'mainmathcat', 'context', 'sentence' and
        'focuswords' after ``reset_index()``.
    focus_noun : list of str
        Root nouns to look for, compared with the token text as is.
    contexts : list of str, optional
        If given, only rows whose 'context' is in this list are parsed.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'mainmathcat', 'context', 'sentence', 'focuswords' and
        'adjective' with a default integer index. Empty, with the same
        columns, when no adjective is found.
    """
    df = df.reset_index()
    if contexts is not None: # Focus on context
        df = df[df['context'].isin(contexts)]

    columns = ['id', 'mainmathcat', 'context', 'sentence', 'focuswords', 'adjective']
    records = []
    for _, row in df.iterrows():
        doc = nlp(row['sentence'])
        for chunk in doc.noun_chunks:
            if chunk.root.text not in focus_noun:
                continue
            for token in chunk:
                # Exact tag comparison: `in 'ADJ'` was a substring test that
                # also accepted an empty tag.
                if token.pos_ == 'ADJ':
                    records.append({'id': row['id'], 'mainmathcat': row['mainmathcat'],
                                    'context': row['context'], 'sentence': row['sentence'],
                                    'focuswords': row['focuswords'], 'adjective': token.text})

    # Build the frame once; explicit columns keep the schema for empty results.
    return pd.DataFrame(records, columns=columns)

In [8]:
# Taken from explain words extraction
# Load latest context and ArXiv database file.
# The glob patterns are built from `path` (set in the first cell) so the data
# location is defined in one place, matching how the save cells build names.
# "Latest" means the largest os.path.getctime among the matching files.
LatestContextFile = max(glob.iglob(path + '/database-files/arxiv_contexts*'), key=os.path.getctime)
df_contexts = pd.read_feather(LatestContextFile)

LatestDatabaseFile = max(glob.iglob(path + '/database-files/arxiv_extended_database*'), key=os.path.getctime)
df_arxiv = pd.read_feather(LatestDatabaseFile)

### Looks at adjectives connected to proofs 
#### Using the adj_nouns function

In [15]:
# Creates and saves the proof_fullraw in database-files
proof_word = ['proof']
proof_raw_df = word_extract(df_contexts,proof_word) # Computational heavy
proof_raw_df = proof_raw_df.reset_index()  # move 'id' from the index back into a column
proof_raw_df.to_feather(path+'/database-files/proof_fullraw'+timestr+'.feather')

# Creates and saves the understa_fullraw in database-files
understa_raw_df = word_extract(df_contexts,['understa'])
understa_raw_df = understa_raw_df.reset_index()
# The former `understa_raw_df = proof_raw_df.reset_index()` was removed here:
# it replaced the 'understa' sentences with the proof sentences, so the
# understa_fullraw file was written with proof data.
understa_raw_df.to_feather(path+'/database-files/understa_fullraw'+timestr+'.feather')


In [325]:
# Load latest understa and proof file.
# The glob patterns are built from `path` (set in the first cell) instead of
# repeating the absolute folder, so they match the names used when saving.
LatestUnderstaFile = max(glob.iglob(path + '/database-files/understa_fullraw*'), key=os.path.getctime)
understa_raw_df = pd.read_feather(LatestUnderstaFile)

LatestProofFile = max(glob.iglob(path + '/database-files/proof_fullraw*'), key=os.path.getctime)
proof_raw_df = pd.read_feather(LatestProofFile)

In [None]:
# Adjectives attached to the noun "proof" in the outer/meta/other contexts,
# turned into a frame of occurrence counts: index 'adjective', column 'count'.
proofadj_df = adj_nouns(proof_raw_df, focus_noun=['proof'], contexts=['outer', 'meta', 'other'])
count_adj = (
    proofadj_df['adjective']
    .value_counts()
    .rename_axis('adjective')
    .to_frame(name='count')
)

In [262]:
# Share of each adjective among all counted adjectives, in percent.
count_adj['pct'] = ((count_adj['count'] / count_adj['count'].sum())*100).round(2) 

# Picked from criteria pct>1. - HARDCODED, BUT ALSO CATEGORIES CHOSEN BY HAND
meta = ['new','alternate','same','different','alternative',
        'first','similar','above','previous','second','following']
how_adj = ['direct','complete','full','constructive','bijective','combinatorial']
what_adj = ['simple','short','detailed','elementary','simpler','rigorous','original','formal']

# Sum 'count' and 'pct' over each hand-made category; every result is a
# single-row frame labelled with the category name. `.loc` raises KeyError if
# a listed adjective is missing from count_adj.
a = count_adj.loc[meta].sum().to_frame(name = 'meta adjectives').transpose()
b = count_adj.loc[how_adj].sum().to_frame(name = 'Obj. adjectives').transpose()
c = count_adj.loc[what_adj].sum().to_frame(name = 'Subj. adjectives').transpose()
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order.
adj_compund = pd.concat([a, b, c])

# Saves both of them to /armchair folder as dataframes
count_adj = count_adj.reset_index()
adj_compund = adj_compund.reset_index()
count_adj.to_feather(path+'/armchair/count_pct_adj_df'+timestr+'.feather')
adj_compund.to_feather(path+'/armchair/count_pct_categorized_adj_df'+timestr+'.feather')
# Saves both as tables in /tex folder
count_adj.to_latex(path+'/tex/count_adjectives_freq'+timestr+'.tex')
adj_compund.to_latex(path+'/tex/adjectives_classified_freq'+timestr+'.tex')

### Is it proofs or figures/diagrams that provide understanding?

In [3]:
# Load explanation_df from the latest explain_fullraw file.
# The glob pattern is built from `path` (set in the first cell) instead of
# repeating the absolute folder.
LatestExplanationFile = max(glob.iglob(path + '/database-files/explain_fullraw*'), key=os.path.getctime)
explanation_df = pd.read_feather(LatestExplanationFile)


In [4]:
# Rows of explanation_df whose sentence contains the token figure(s) or
# proof(s). All contexts are searched and matching rows are kept, which are
# the defaults of sentence_extract.
figure_sent_df = sentence_extract(explanation_df, ['figure', 'figures'])
proof_sent_df = sentence_extract(explanation_df, ['proof', 'proofs'])

In [21]:
# Every sentence in the contexts that mentions figure / proof; these totals
# are used to normalize the explanation counts.
figure_raw_df = word_extract(dataframe_context=df_contexts, focus_words=['figure'])
proof_raw_df = word_extract(dataframe_context=df_contexts, focus_words=['proof'])

#### We now have everything we need: the number of times proof/figure is mentioned in expla/understa contexts and how many times each is used in total. Authors use an explanation word in connection with a proof more often than with a figure, but the difference is not statistically significant.

In [22]:
# Row counts of the four sentence frames.
# *_mention: how many times figure / proof (and proofs) are used at all - in
# order to normalize. *_explanation: how many of the explanation sentences
# mention them.
figure_mention, proof_mention = figure_raw_df.shape[0], proof_raw_df.shape[0]
figure_explanation, proof_explanation = figure_sent_df.shape[0], proof_sent_df.shape[0]

In [None]:
# Same selection as for the explanation sentences, applied to understa_raw_df:
# rows whose sentence contains the token proof(s) / figure(s), and how many
# such rows there are.
proof_understa_sent_df = sentence_extract(understa_raw_df, ['proof', 'proofs'])
figure_understa_sent_df = sentence_extract(understa_raw_df, ['figure', 'figures'])
figure_understand = figure_understa_sent_df.shape[0]
proof_understand = proof_understa_sent_df.shape[0]

In [23]:
# Fisher exact test on the 2x2 table
#   rows:    figure, proof
#   columns: count in explanation sentences, count of all mentions
# NOTE(review): the explanation sentences look like a subset of the mentions;
# a conventional contingency table would use (mentions - explanation) in the
# second column - confirm that the table below is the intended one.
oddsratio, pvalue = stats.fisher_exact([
    [figure_explanation, figure_mention],
    [proof_explanation, proof_mention],
])
print('explanation', pvalue)
#oddsratio, pvalue = stats.fisher_exact([[figure_understand, figure_mention], [proof_understand, proof_mention]])
#print('understand',pvalue)

explanation 0.2350031704992473


In [26]:
# Widen the display limits so whole sentences are readable, then show the
# figure sentences. The seed only matters if the .sample(10) below is enabled.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = None
np.random.seed(42)
figure_sent_df  # .sample(10)



Unnamed: 0,id,mainmathcat,context,sentence,focuswords
115,2009.04307,math.CV,outer,theorem can be written as follows:\n\nthe following figures (figures and ) explain numerically the result of proposition .,explain
146,2009.03215,math.AC,outer,"this includes theorem a which relates the monomial-free restricted matching field ideals with the initial ideals of schubert varieties, theorems b and c which characterize the family of binomial, zero and non-binomial ideals and theorem which is a non-inductive reformulation of theorem c. to explain these results clearly, we give examples and use figure to give a visual representation of these results.",explain
161,2009.10024,math.CT,outer,"figure 2: subbimodules of \n\n\n\n\n\n\nin section , we explained how weakly exact structures give rise to bifunctors.",explain
207,2009.13916,math.NA,outer,"an explanation comes from the analysis of the dynamic pattern representation vs the position within the dome-grid and the flux distribution, as provided in figure .",explanation
212,2009.09216,math.ST,outer,"we represent these data on the complex plane in figure , using orange triangles for the low-speed wind values as explained above.",explain
288,2009.03233,math.DG,outer,we explain this in figure .,explain
354,2009.08251,math.OC,outer,we also choose for simplified explanation and throughout the figure consistent scaling was used.,explanation
368,2009.13127,math.CV,outer,we explain how the local dynamics near\nthe singular set of we described above stitch together by studying\nthe real-analytic foliation of the sphere induced by the\nreal-time flow (figure ).,explain
411,2009.02503,math.CO,outer,the way to replace the vertices in should be clearly explained by figure .,explain
570,2009.13314,math.GT,outer,this is explained in detail in example and illustrated in figure .,explain


In [319]:
#figure_raw_df = figure_raw_df.reset_index()
#sentence_extract(proof_raw_df, focuswords = ['illustrate','illustrates','illustrating','illustrated'],
#                                   contexts = None, remove_focuswords = False)
# Sentences containing "obvious", and their rate per main math category.
obvious_raw_df = word_extract(dataframe_context=df_contexts, focus_words=['obvious'])
obvious_freq_df = freq_wordcount_branch(focusword_df=obvious_raw_df)

### Use of Obvious - Random

In [320]:
# obvious_raw_df and obvious_freq_df are built in the cell directly above;
# repeating the heavy word_extract pass here would only reproduce them.
pd.set_option('display.max_rows', 40)
obvious_freq_df.sort_values('focuswords')

Unnamed: 0,mainmathcat,focuswords
17,math.MG,83.690412
0,math.PR,93.221894
4,math.AP,94.814024
10,math.GT,95.132317
25,math.NA,95.837193
1,math.OC,111.919972
7,math.RT,112.144805
11,math.HO,112.585847
2,math.ST,113.373482
12,math.DS,116.093658


In [32]:
# Adjectives attached to the noun explanation(s), searched in all five contexts.
test = adj_nouns(
    explanation_df,
    ['explanation', 'explanations'],
    ['outer', 'meta', 'other', 'proof', 'theorem'],
)

In [33]:
# Occurrence count per adjective: index 'adjective', single column 'count'.
test_adj = (
    test['adjective']
    .value_counts()
    .rename_axis('adjective')
    .to_frame(name='count')
)
test_adj

Unnamed: 0_level_0,count
adjective,Unnamed: 1_level_1
detailed,19
further,9
more,8
brief,8
simple,5
possible,5
geometric,4
satisfactory,4
intuitive,3
alternative,3
