In [1]:
import pandas as pd
import numpy as np
from mc4.algorithm import mc4_aggregator
import spacy
from spacy.tokens import Doc
from os.path import exists
import pickle



In [2]:
class WhitespaceTokenizer:
    """Tokenizer that splits text on single space characters only.

    Every space-separated piece becomes exactly one token. An empty piece
    (produced by leading, trailing or consecutive spaces) is turned into a
    literal ``" "`` token so that no zero-length token is ever created.
    """

    def __init__(self, vocab):
        # Vocabulary handed to every ``Doc`` this tokenizer builds.
        self.vocab = vocab

    def __call__(self, text):
        """Return a ``Doc`` built from the space-separated pieces of ``text``."""
        pieces = text.split(" ")

        # Empty pieces become a " " token that carries no trailing space;
        # every other piece is assumed to be followed by one.
        words = [piece if piece != "" else " " for piece in pieces]
        spaces = [piece != "" for piece in pieces]

        if words[-1] == " ":
            # The text ended with a space: drop the placeholder token it produced.
            words.pop()
            spaces.pop()
        else:
            # The final real token has nothing after it.
            spaces[-1] = False

        return Doc(self.vocab, words=words, spaces=spaces)

In [3]:
# Blank English pipeline: it only contains the components added below.
nlp = spacy.blank("en")
# Lemmatizer in "lookup" mode.
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
# NOTE(review): initialize() presumably loads the lemma lookup tables, which
# would require the spacy-lookups-data package to be installed — confirm.
nlp.initialize()
# Use the single-space tokenizer defined above instead of spaCy's default one.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
# Raise the pipeline's max_length setting to 5,000,000.
nlp.max_length = 5000000

In [4]:
# Decade buckets used as file-name suffixes and dictionary keys.
# 1900_1909 ... 2000_2009 are regular ten-year spans; the last bucket is
# irregular (2010_2020) and is therefore appended explicitly.
year_list = ['{}_{}'.format(start, start + 9) for start in range(1900, 2010, 10)]
year_list.append('2010_2020')

In [5]:
# Country terms; each name is used verbatim as a directory / file-name stem
# and as a dictionary key.
country_list = [
    'china',
    'north korea',
    'south korea',
    'canada',
    'united kingdom',
    'germany',
]

In [6]:
# Concept terms; each name is used verbatim as a directory / file-name stem
# and as a dictionary key.
concept_list = [
    'autocracy',
    'autocratic',
    'dictator',
    'dictatorship',
    'authoritarianism',
    'democracy',
]

In [7]:
def combine_and_clean_dfs(clust_df, term_df, nlp):
    """Join cluster assignments with attention words and clean the words.

    Parameters
    ----------
    clust_df : pandas.DataFrame
        One row per item, one column per cluster; the column holding the
        row-wise maximum becomes the item's cluster.
        # NOTE(review): assumes every column is a cluster score (no stray
        # index column from the CSV) — confirm against how the file is written.
    term_df : pandas.DataFrame
        Must provide ``top_word`` and ``word_weight`` columns; joined to
        ``clust_df`` on the row index (left join).
    nlp : callable
        Called once with all words joined by single spaces. It must return a
        sequence with exactly one token per word; each token needs
        ``lemma_``, ``is_stop`` and ``len()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``cluster``, ``prob`` plus the columns of ``term_df``, where
        ``top_word`` holds the lemma (or ``''`` for stop words and tokens of
        two characters or fewer) and ``word_weight`` is ``0.0`` wherever the
        word was blanked.

    Raises
    ------
    ValueError
        If ``nlp`` does not return one token per word, because words and
        weights could then no longer be matched by position.
    """
    df = pd.DataFrame({
        'cluster': clust_df.idxmax(axis=1),
        'prob': clust_df.max(axis=1),
    })
    df = df.merge(term_df, how='left', left_index=True, right_index=True)

    # Missing words are replaced by the stop word 'and' so that they are
    # blanked out by the stop-word filter below.
    df['top_word'] = df['top_word'].fillna('and')

    top_word = df['top_word'].tolist()
    word_weight = df['word_weight'].tolist()

    doc = nlp(' '.join(top_word))
    if len(doc) != len(top_word):
        raise ValueError(
            'tokenizer returned {} tokens for {} words; words must not be '
            'empty or contain spaces'.format(len(doc), len(top_word))
        )

    clean_words = [
        token.lemma_ if not token.is_stop and len(token) > 2 else ''
        for token in doc
    ]
    clean_weights = [
        weight if word != '' else 0.0
        for word, weight in zip(clean_words, word_weight)
    ]
    df['top_word'] = clean_words
    df['word_weight'] = clean_weights

    return df

In [8]:
def gather_cluster_dfs(df):
    """Summarise the top words of every cluster in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``cluster``, ``prob``, ``top_word`` and
        ``word_weight`` (one row per word occurrence).

    Returns
    -------
    dict
        ``{'full_df': df, 'clusters': {cluster: summary}}`` where each
        summary holds

        * ``mean_prob``  - mean of ``prob`` over the cluster's rows,
        * ``clust_size`` - number of rows in the cluster,
        * ``top_20_df``  - DataFrame with the columns ``top_20_att``
          (words of the highest-weighted individual occurrences) and
          ``top_20_sum`` / ``top_20_mean`` / ``top_20_count`` (distinct
          words ranked by summed weight, mean weight and occurrence count).
          The empty-string word is excluded from the three aggregated
          rankings, and ``top_20_att`` is truncated to the number of
          distinct non-empty words so all columns have equal length.
    """
    head = 20
    df_dict = {'full_df': df, 'clusters': {}}

    grouped = df.groupby('cluster')
    mean_prob = grouped['prob'].mean()

    # .tolist() yields plain Python keys/floats for the result dictionary.
    for clust, prob in zip(mean_prob.index.tolist(), mean_prob.tolist()):
        members = grouped.get_group(clust)
        words = members['top_word'].tolist()
        weights = members['word_weight'].tolist()

        # Highest-weighted occurrences, best first (ties: word, descending).
        top_idx = np.argsort(weights)[-head:]
        ranked = sorted(((weights[i], words[i]) for i in top_idx), reverse=True)
        top_att = [word for _, word in ranked]

        # Per-word statistics; string aggregator names avoid the pandas
        # deprecation of passing builtin/NumPy callables to ``agg``.
        stats = (
            pd.DataFrame({'word': words, 'weight': weights})
            .groupby('word')['weight']
            .agg(['sum', 'mean', 'count'])
            .reset_index()
        )
        stats = stats[stats['word'] != '']

        top_df = pd.DataFrame()
        top_df['top_20_att'] = top_att[:len(stats)]
        for stat in ('sum', 'mean', 'count'):
            best = stats.sort_values(by=stat, ascending=False).head(head)
            top_df['top_20_' + stat] = best['word'].tolist()

        df_dict['clusters'][clust] = {
            'mean_prob': prob,
            'clust_size': len(words),
            'top_20_df': top_df,
        }

    return df_dict

In [11]:
def _gather_term_words(term_list, year_list, att_root, bgmm_root, nlp):
    """Build ``{term: {year: cluster summary}}`` for one family of terms.

    A term/decade pair is only processed when both its attention-word CSV
    (under ``att_root``) and its cluster CSV (under ``bgmm_root``) exist;
    otherwise the decade is silently left out of the term's dictionary.
    """
    gathered = {}
    for term in term_list:
        gathered[term] = {}
        for year in year_list:
            file_name = term + '/' + term + '_' + year + '.csv'
            att_csv = att_root + file_name
            bgmm_csv = bgmm_root + file_name
            if not (exists(att_csv) and exists(bgmm_csv)):
                continue
            terms_df = pd.read_csv(att_csv)
            bg_df = pd.read_csv(bgmm_csv)
            df = combine_and_clean_dfs(bg_df, terms_df, nlp)
            gathered[term][year] = gather_cluster_dfs(df)
    return gathered


def gather_all_cluster_words(country_list, concept_list, year_list, nlp):
    """Load and summarise the cluster words of every country and concept.

    Parameters
    ----------
    country_list, concept_list : list of str
        Terms to load; each name is also the directory and file-name stem.
    year_list : list of str
        Decade suffixes of the CSV file names.
    nlp : callable
        Pipeline forwarded to ``combine_and_clean_dfs``.

    Returns
    -------
    tuple of dict
        ``(country_dict, concept_dict)``, each mapping
        ``term -> year -> result of gather_cluster_dfs``. Every requested
        term is present as a key; decades with a missing CSV are absent.

    Notes
    -----
    All paths are relative to the current working directory.
    """
    country_dict = _gather_term_words(
        country_list, year_list,
        './crs_attention_words/countries/',
        './crs_cluster_csvs/bgmm/countries/',
        nlp,
    )
    concept_dict = _gather_term_words(
        concept_list, year_list,
        './crs_attention_words/concepts/',
        './crs_cluster_csvs/bgmm/concepts/',
        nlp,
    )
    return country_dict, concept_dict

In [12]:
# Read the per-decade CSVs under ./crs_attention_words/ and ./crs_cluster_csvs/bgmm/
# (relative to the working directory). Term/decade pairs with a missing file are
# skipped without any message, so check the resulting keys if data looks incomplete.
country_dict, concept_dict = gather_all_cluster_words(country_list, concept_list, year_list, nlp)

In [13]:
# Load the concept embeddings (indexed below as concept_embeddings[term][year]).
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were produced by a trusted run of this project.

with open('./crs_embeds/concept_embeds.pkl', 'rb') as handle:
    concept_embeddings = pickle.load(handle)

In [14]:
# Load the country embeddings (indexed below as country_embeddings[term][year]).
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were produced by a trusted run of this project.

with open('./crs_embeds/country_embeds.pkl', 'rb') as handle:
    country_embeddings = pickle.load(handle)

In [15]:
def aggregate_cluster_embeds(term_dict, term_list, term_embeds, year_list):
    """Average the embeddings of each cluster, per term and decade.

    Parameters
    ----------
    term_dict : dict
        ``term -> year -> {'full_df': DataFrame, ...}``; ``full_df`` needs a
        ``cluster`` column. Decades missing from ``term_dict[term]`` are
        skipped.
    term_list : list
        Terms to process; each one is printed as it is started.
    term_embeds : dict
        ``term -> year -> embeddings`` convertible with ``np.array``.
        # NOTE(review): rows of ``full_df`` are matched to embeddings through
        # the DataFrame index, so this assumes a default 0..n-1 index — confirm.
    year_list : list
        Decades to process.

    Returns
    -------
    dict
        ``term -> year -> cluster -> {'cluster_mean': ndarray}``. Clusters
        are inserted in order of first appearance in ``full_df``.
    """
    agg_dict = {}
    for term in term_list:
        agg_dict[term] = {}
        print(term)
        for year in year_list:
            if year not in term_dict[term]:
                continue
            agg_dict[term][year] = {}
            df = term_dict[term][year]['full_df']
            embed_list = term_embeds[term][year]

            # dict.fromkeys de-duplicates while keeping a reproducible order
            # (a set would make the key order vary between runs).
            clusters = list(dict.fromkeys(df['cluster'].tolist()))
            # Convert once per decade instead of once per cluster.
            embeds = np.array(embed_list) if clusters else None

            for clust in clusters:
                embed_idxs = df.index[df['cluster'] == clust].tolist()
                agg_dict[term][year][clust] = {
                    'cluster_mean': embeds[embed_idxs].mean(axis=0),
                }

    return agg_dict
                

In [16]:
# Per-cluster mean embedding for every country and decade present in country_dict.
# Each term is printed as it is processed.
agg_country_clust = aggregate_cluster_embeds(country_dict, country_list, country_embeddings, year_list)

china
north korea
south korea
canada
united kingdom
germany


In [17]:
# Per-cluster mean embedding for every concept and decade present in concept_dict.
# Each term is printed as it is processed.
agg_concept_clust = aggregate_cluster_embeds(concept_dict, concept_list, concept_embeddings, year_list)

autocracy
autocratic
dictator
dictatorship
authoritarianism
democracy


In [21]:
def aggregate_all_embeds(term_list, term_embeds, year_list):
    """Average all embeddings of each term per decade, ignoring NaNs.

    Parameters
    ----------
    term_list : list
        Terms to process; each must be a key of ``term_embeds``.
    term_embeds : dict
        ``term -> year -> embeddings`` convertible with ``np.array``.
    year_list : list
        Decades to process; decades missing from ``term_embeds[term]`` are
        skipped.

    Returns
    -------
    dict
        ``term -> year -> mean vector`` (``np.nanmean`` over axis 0). When
        the embeddings of a decade cannot be averaged, a message is printed
        and the decade keeps an empty ``{}`` placeholder instead of a vector.
    """
    agg_dict = {}
    for term in term_list:
        agg_dict[term] = {}
        for year in year_list:
            if year not in term_embeds[term]:
                continue
            # Placeholder that survives if the aggregation below fails.
            agg_dict[term][year] = {}
            embed_list = term_embeds[term][year]
            try:
                agg_dict[term][year] = np.nanmean(np.array(embed_list), axis=0)
            except Exception:
                # Best effort: report and continue. Catching Exception (not a
                # bare except) lets KeyboardInterrupt / SystemExit propagate.
                print(term + ': ' + year + ' not found')
    return agg_dict

In [22]:
# One mean embedding per country and decade. A "<term>: <year> not found" line is
# printed for every decade whose embeddings could not be averaged.
agg_country = aggregate_all_embeds(country_list, country_embeddings, year_list)

north korea: 1900_1909 not found
north korea: 1910_1919 not found
north korea: 1920_1929 not found
north korea: 1930_1939 not found
south korea: 1900_1909 not found
south korea: 1910_1919 not found
south korea: 1920_1929 not found
south korea: 1930_1939 not found


In [23]:
# One mean embedding per concept and decade. A "<term>: <year> not found" line is
# printed for every decade whose embeddings could not be averaged.
agg_concept = aggregate_all_embeds(concept_list, concept_embeddings, year_list)

dictatorship: 1910_1919 not found
authoritarianism: 1900_1909 not found
authoritarianism: 1910_1919 not found


In [26]:
def add_cluster_terms(term_dict, agg_clust_dict, term_list, year_list):
    """Attach cluster size, mean probability and aggregated term ranks.

    Parameters
    ----------
    term_dict : dict
        ``term -> year -> {'clusters': {cluster: summary}}`` where each
        summary has ``clust_size``, ``mean_prob`` and ``top_20_df`` (with the
        columns ``top_20_sum``, ``top_20_mean`` and ``top_20_count``).
    agg_clust_dict : dict
        ``term -> year -> cluster -> dict``; **modified in place**.
    term_list, year_list : list
        Terms and decades to process; decades missing from
        ``term_dict[term]`` are skipped.

    Returns
    -------
    dict
        The same ``agg_clust_dict`` object. Every processed cluster gains
        ``clust_size`` and ``mean_prob``; clusters with a non-empty
        ``top_20_df`` also gain ``term_ranks``, the result of
        ``mc4_aggregator`` sorted ascending by value.
    """
    rank_sources = (
        ('sums', 'top_20_sum'),
        ('means', 'top_20_mean'),
        ('counts', 'top_20_count'),
    )

    for term in term_list:
        for year in year_list:
            if year not in term_dict[term]:
                continue
            for key, value in term_dict[term][year]['clusters'].items():
                entry = agg_clust_dict[term][year][key]
                entry['clust_size'] = value['clust_size']
                entry['mean_prob'] = value['mean_prob']

                top_df = value['top_20_df']
                if len(top_df) == 0:
                    # Nothing to rank for this cluster.
                    continue

                rankings = {
                    column: top_df[source].tolist()
                    for column, source in rank_sources
                }

                # Rows: every word appearing in any ranking, in a reproducible
                # first-seen order (a set would vary between runs).
                all_terms = list(dict.fromkeys(
                    word for ranking in rankings.values() for word in ranking
                ))
                rank_table = pd.DataFrame(index=all_terms, columns=list(rankings))

                for column, ranking in rankings.items():
                    # A word's rank is the position of its first occurrence.
                    first_position = {}
                    for position, word in enumerate(ranking):
                        first_position.setdefault(word, position)
                    for word, position in first_position.items():
                        # Single .loc[row, col] write: chained indexing
                        # (.loc[row][col] = ...) may update a temporary copy
                        # and leave the table empty.
                        rank_table.loc[word, column] = position

                agg_ranks = mc4_aggregator(rank_table, header_row=0, index_col=0)
                agg_ranks = dict(sorted(agg_ranks.items(), key=lambda item: item[1]))
                entry['term_ranks'] = agg_ranks

    return agg_clust_dict

In [27]:
# add_cluster_terms modifies its second argument in place and returns it, so
# agg_concept_terms is the same object as agg_concept_clust.
agg_concept_terms = add_cluster_terms(concept_dict, agg_concept_clust, concept_list, year_list)

In [28]:
# add_cluster_terms modifies its second argument in place and returns it, so
# agg_country_terms is the same object as agg_country_clust.
agg_country_terms = add_cluster_terms(country_dict, agg_country_clust, country_list, year_list)

In [29]:
# Save the per-decade country aggregates (agg_country) as a pickle.
# open(..., 'wb') does not create directories: ./crs_agg_vectors/ must already exist.

with open('./crs_agg_vectors/country_year_agg.pkl', 'wb') as handle:
    pickle.dump(agg_country, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
# Save the per-decade concept aggregates (agg_concept) as a pickle.
# open(..., 'wb') does not create directories: ./crs_agg_vectors/ must already exist.

with open('./crs_agg_vectors/concept_year_agg.pkl', 'wb') as handle:
    pickle.dump(agg_concept, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [31]:
# Save the per-cluster country aggregates (agg_country_terms) as a pickle.
# open(..., 'wb') does not create directories: ./crs_agg_vectors/ must already exist.

with open('./crs_agg_vectors/country_clust_agg.pkl', 'wb') as handle:
    pickle.dump(agg_country_terms, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [32]:
# Save the per-cluster concept aggregates (agg_concept_terms) as a pickle.
# open(..., 'wb') does not create directories: ./crs_agg_vectors/ must already exist.

with open('./crs_agg_vectors/concept_clust_agg.pkl', 'wb') as handle:
    pickle.dump(agg_concept_terms, handle, protocol=pickle.HIGHEST_PROTOCOL)