In [1]:
import pandas as pd

In [2]:
import numpy as np

In [36]:
from importlib.machinery import SourceFileLoader

import importlib.util
import sys

# Load the project helper module `../lda_tools.py` and expose it as `tools`.
# `SourceFileLoader.load_module()` is deprecated, so this follows the importlib
# recipe for importing a source file directly.
_tools_spec = importlib.util.spec_from_file_location("tools", "../lda_tools.py")
tools = importlib.util.module_from_spec(_tools_spec)
# Register the module before executing it (load_module() did this implicitly)
# so that the `from tools import ...` statement below resolves to this object.
sys.modules["tools"] = tools
try:
    _tools_spec.loader.exec_module(tools)
except BaseException:
    # Do not leave a half-initialised module behind if loading fails.
    sys.modules.pop("tools", None)
    raise

from tools import write_to_file

In [4]:
# Read the cleaned article metadata CSV into `df`; every later cell derives from it.
df = pd.read_csv(filepath_or_buffer='../data/prisoners_dilemma_articles_meta_data_clean.csv')

In [29]:
# Keep only the four columns used by the topic-modelling steps below.
data = df.loc[:, ['abstract', 'unique_key', 'title', 'date']]

In [31]:
# Collapse rows that are identical across all selected columns, keeping the
# first occurrence of each.
data = data.drop_duplicates(keep='first')

In [32]:
# Renumber the rows 0..n-1 after de-duplication; the old index is discarded
# rather than kept as a column (drop=True).
data = data.reset_index(drop=True)

In [33]:
# Cut-off year: the filter on `data['date']` below keeps articles dated up to
# and including this year.
year = 2018

In [40]:
from nltk.corpus import stopwords
import gensim.corpora as corpora
# English stop-word list from NLTK's `stopwords` corpus; it is passed to
# `tools.clean_words` when the abstracts are cleaned.
# NOTE: requires the NLTK "stopwords" corpus to be available locally.
stop_words = stopwords.words('english')

In [41]:
# Restrict to articles dated no later than the cut-off year.
yearly_data = data.loc[data['date'] <= int(year)]

# Tokenise the abstracts, then clean them with the project helpers
# (presumably stop-word removal and lemmatisation, judging by the names --
# see lda_tools.py for the exact steps).
yearly_words = [*tools.sentences_to_words(yearly_data['abstract'].values)]
yearly_lemmatized_words = tools.clean_words(yearly_words, stop_words)

# Build the gensim vocabulary and convert every cleaned document into its
# bag-of-words representation.
dictionary = corpora.Dictionary(yearly_lemmatized_words)
yearly_corpus = list(map(dictionary.doc2bow, yearly_lemmatized_words))

In [44]:
import os

# Path to the MALLET launcher handed to `tools.compute_coherence_values`.
# The location is machine-specific, so it can be overridden by setting the
# MALLET_PATH environment variable (a convention of this notebook); when the
# variable is unset or empty the original hard-coded location is used.
mallet_path = os.environ.get('MALLET_PATH') or '/Users/storm/rsc/mallet-2.0.8/bin/mallet'

In [45]:
# Search settings forwarded to `tools.compute_coherence_values`.
# NOTE(review): presumably the candidate numbers of topics are
# range(start, limit, step) -- confirm against lda_tools.py.
limit = 8
start = 6
step= 1

In [46]:
# Run the project's coherence search over the cut-off-year corpus.
# Presumably it fits one model per candidate number of topics and returns
# the models alongside their coherence scores -- confirm in lda_tools.py.
coherence_search_kwargs = dict(
    limit=limit,
    mallet_path=mallet_path,
    dictionary=dictionary,
    corpus=yearly_corpus,
    texts=yearly_lemmatized_words,
    step=step,
    start=start,
)
model_list, coherence_values = tools.compute_coherence_values(**coherence_search_kwargs)

In [53]:
# Number of distinct abstracts in the raw metadata (a missing value, if any,
# counts as one distinct entry).
df['abstract'].nunique(dropna=False)

2303

In [62]:
# Distinct values of the `date` column, in ascending order.
years = sorted(df['date'].unique())

In [63]:
# Ten evenly spaced cut-off points spanning the first to the last year.
periods = np.linspace(start=min(years), stop=max(years), num=10)

In [67]:
# Build one topic-summary table per cumulative period.  `periods` holds the
# evenly spaced cut-off years; the first two are skipped and every remaining
# cut-off selects the matching `topics_up_to_<year>.csv` file.
period_start = int(periods[0])  # first year covered; replaces the hard-coded 1951

tables = []
# The loop variable is deliberately not called `year`, so the cut-off year
# defined earlier in the notebook is not overwritten with a float.
for period_end in periods[2:]:
    topic_in_year = pd.read_csv(f'../data/topics_up_to_{int(period_end)}.csv')

    # Documents per dominant topic, most frequent topic first.
    topic_counts = topic_in_year['Dominant_Topic'].value_counts()
    # Derived from the counts (not from .unique()) so the length always matches
    # the rows of the table, even if some rows have no dominant topic.
    number_of_topics = len(topic_counts)

    topic_contribution = round(topic_counts / topic_counts.sum(), 4)

    # Look the keywords up by topic id.  value_counts() orders topics by
    # frequency while .unique() returns keywords in order of first appearance,
    # so pairing the two positionally can attach keywords to the wrong topic.
    # Assumes every row of a topic carries the same keyword string; the first
    # one seen is used.
    keywords_by_topic = (topic_in_year
                         .drop_duplicates(subset='Dominant_Topic')
                         .set_index('Dominant_Topic')['Topic_Keywords'])
    topic_keywords = keywords_by_topic.reindex(topic_counts.index)

    # 'Dominant_Topic' is relabelled 0..n-1 by frequency rank, as before.
    # The numeric columns are kept as plain Python objects, as in the original
    # transposed construction, so each value is rendered as-is on export.
    table = pd.DataFrame({
        'Dominant_Topic': range(number_of_topics),
        'Num of Documents': topic_counts.to_numpy(dtype=object),
        'Percentage of Documents': topic_contribution.to_numpy(dtype=object),
        'Topic_Keywords': topic_keywords.to_numpy(dtype=object),
        'Period': f'{period_start}-{int(period_end)}',
    })

    tables.append(table)

In [68]:
# Stack the per-period tables into one frame with a fresh 0..n-1 index.
table = pd.concat(tables, ignore_index=True)

In [69]:
# Sanity check: list the period labels present in the combined table.
table.loc[:, 'Period'].unique()

array(['1951-1965', '1951-1973', '1951-1980', '1951-1988', '1951-1995',
       '1951-2003', '1951-2010', '1951-2018'], dtype=object)

In [70]:
# Preview the combined table with the keyword column first.
table.loc[:, ['Topic_Keywords', 'Dominant_Topic', 'Num of Documents', 'Percentage of Documents', 'Period']]

Unnamed: 0,Topic_Keywords,Dominant_Topic,Num of Documents,Percentage of Documents,Period
0,"problem, technology, divert, euler, subsystem,...",0,3,0.375,1951-1965
1,"interpret, requirement, programme, evolution, ...",1,2,0.25,1951-1965
2,"equipment, agency, conjecture, development, un...",2,1,0.125,1951-1965
3,"variation, celebrated, trend, untried, change,...",3,1,0.125,1951-1965
4,"give, good, modern, trace, technique, ambiguit...",4,1,0.125,1951-1965
...,...,...,...,...,...
66,"behavior, social, human, decision, study, expe...",1,482,0.1951,1951-2018
67,"individual, group, good, social, punishment, l...",2,428,0.1733,1951-2018
68,"game, strategy, player, agent, play, dilemma, ...",3,380,0.1538,1951-2018
69,"population, evolutionary, dynamic, model, sele...",4,351,0.1421,1951-2018


In [71]:
# Export the combined table as LaTeX via the project's `write_to_file` helper.
# The column-width option is raised while rendering -- presumably so the long
# keyword strings are not truncated in the output.
with pd.option_context("max_colwidth", 1000):
    latex_table = table[[
        'Period',
        'Dominant_Topic',
        'Topic_Keywords',
        'Num of Documents',
        'Percentage of Documents',
    ]].to_latex(index=False)
    write_to_file(metric=latex_table, filename="topics_per_year_table.tex")