In [1]:
import os
import json
from IPython.utils import io

import nltk
from tqdm import tqdm
import gensim
from gensim.models import Word2Vec

from dataset import CongressDataset

# Dimensionality of the trained word vectors (passed to Word2Vec as vector_size).
EMBEDDING_SIZE = 300
# Number of nearest neighbours requested per seed word before thresholding.
DICTIONARY_MAX_SIZE = 100
# If fewer than this many neighbours pass the threshold, the top
# DICTIONARY_MIN_SIZE neighbours are used instead, regardless of score.
DICTIONARY_MIN_SIZE = 20
# A neighbour is kept only when its similarity score is strictly greater than this.
DICTIONARY_SIMILARITY_THRESHOLD = 0.5

# Output directories (relative to the working directory) for the per-period
# Word2Vec models and the per-period JSON dictionaries.
MODEL_SAVE_PATH = os.path.join("data", "embedding_models")
DICTIONARY_SAVE_PATH = os.path.join("data", "dictionaries")

In [2]:
# Seed words, one per moral foundation. calculate_dictionary() looks up the
# nearest embedding neighbours of each of these and uses the seed word itself
# as the key in the resulting dictionary.
moral_foundation_words = ["care", "fairness", "authority", "loyalty", "purity"]

In [3]:
# The two most recent periods have irregular lengths and are listed by hand.
# Each tuple is (first_year, last_year), both inclusive.
dictionary_time_periods = [(2007, 2016), (2001, 2006)]

# Everything from start_year up to (but not including) end_year is split into
# consecutive 4-year windows: (1873, 1876), (1877, 1880), ..., (1997, 2000).
start_year = 1873
end_year = 2001

# Drive the windows from start_year itself (previously a duplicated literal was
# used, so changing start_year had no effect).
dictionary_time_periods.extend(
    (period_start, period_start + 3)
    for period_start in range(start_year, end_year, 4)
)

# Sanity check: total number of periods and the ten most recent 4-year windows.
print(len(dictionary_time_periods))
print(dictionary_time_periods[-10:])


34
[(1961, 1964), (1965, 1968), (1969, 1972), (1973, 1976), (1977, 1980), (1981, 1984), (1985, 1988), (1989, 1992), (1993, 1996), (1997, 2000)]


In [4]:
def format_data_for_gensim(data_item):
    """Extract the ``"sentence"`` field from a single dataset record.

    Used as the callback for ``dataset.map(...)`` before the dataset's
    ``data`` is handed to Word2Vec.

    Args:
        data_item: A mapping that has a ``"sentence"`` key.
            NOTE(review): presumably the value is a list of tokens, which is
            what gensim expects per sentence - confirm against CongressDataset.

    Returns:
        The value stored under ``"sentence"``, unchanged.

    Raises:
        KeyError: If ``data_item`` has no ``"sentence"`` key.
    """
    sentence = data_item["sentence"]
    return sentence

In [5]:
def calculate_dictionary(
    model,
    foundation_words=None,
    max_size=None,
    min_size=None,
    similarity_threshold=None,
):
    """Build a moral-foundation dictionary from a trained embedding model.

    For every seed word, the nearest neighbours in the embedding space are
    fetched and those scoring strictly above the similarity threshold are
    kept. If fewer than ``min_size`` neighbours pass, the top ``min_size``
    neighbours are used instead, regardless of score.

    Args:
        model: Object exposing ``model.wv.most_similar(word, topn=...)`` that
            returns ``(word, score)`` pairs ranked from most to least similar
            (e.g. a trained gensim ``Word2Vec``).
        foundation_words: Seed words to expand. Defaults to the notebook-level
            ``moral_foundation_words``.
        max_size: Maximum neighbours considered for thresholding. Defaults to
            ``DICTIONARY_MAX_SIZE``.
        min_size: Minimum words guaranteed per seed word (when the model can
            supply that many). Defaults to ``DICTIONARY_MIN_SIZE``.
        similarity_threshold: Exclusive lower bound on the similarity score.
            Defaults to ``DICTIONARY_SIMILARITY_THRESHOLD``.

    Returns:
        dict mapping each seed word to its list of dictionary words, ordered
        from most to least similar.

    Raises:
        Whatever ``most_similar`` raises for an unknown word.
        NOTE(review): gensim is expected to raise ``KeyError`` when a seed word
        is not in the model vocabulary (possible with a high ``min_count`` on
        small periods) - confirm, and decide whether callers should skip it.
    """
    # Defaults are resolved at call time so the notebook configuration cells
    # remain the single source of truth when no override is given.
    if foundation_words is None:
        foundation_words = moral_foundation_words
    if max_size is None:
        max_size = DICTIONARY_MAX_SIZE
    if min_size is None:
        min_size = DICTIONARY_MIN_SIZE
    if similarity_threshold is None:
        similarity_threshold = DICTIONARY_SIMILARITY_THRESHOLD

    moral_foundation_dictionary = {}

    for moral_foundation in foundation_words:
        # A single ranked query serves both the thresholded list and the
        # fallback: the top-k neighbours are the first k entries of any larger
        # ranked result, so no second similarity search is needed.
        ranked_neighbours = model.wv.most_similar(
            moral_foundation, topn=max(max_size, min_size)
        )

        # Keep only the neighbours (among the top max_size) that clear the cutoff.
        final_dictionary_words = [
            word
            for (word, sim_score) in ranked_neighbours[:max_size]
            if sim_score > similarity_threshold
        ]

        # Too few confident matches: fall back to the top min_size neighbours.
        if len(final_dictionary_words) < min_size:
            final_dictionary_words = [
                word for (word, _score) in ranked_neighbours[:min_size]
            ]

        moral_foundation_dictionary[moral_foundation] = final_dictionary_words

    return moral_foundation_dictionary

In [6]:
# dataset = CongressDataset(date_range=dictionary_time_periods[3])
# dataset.map(format_data_for_gensim)

In [7]:
# time_period = dictionary_time_periods[3]
# model = gensim.models.Word2Vec(dataset.data, vector_size=EMBEDDING_SIZE, window=5, min_count=80)
# model_save_path = os.path.join(MODEL_SAVE_PATH, f"{time_period[0]}_{time_period[1]}.model")
# model.save(model_save_path)   

In [8]:
# moral_dictionary = calculate_dictionary(model)
# dictionary_save_path = os.path.join(DICTIONARY_SAVE_PATH, f"{time_period[0]}_{time_period[1]}.json")
# with open(dictionary_save_path, "w+") as f:
#     json.dump(moral_dictionary, f)

In [9]:
# Train one Word2Vec model and derive one moral-foundation dictionary per time
# period. Both artefacts are cached on disk, so re-running this cell only does
# the work that is still missing.

# Make sure the output directories exist; saving fails otherwise on a fresh checkout.
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
os.makedirs(DICTIONARY_SAVE_PATH, exist_ok=True)

# enumerate(..., start=1) gives a 1-based counter for the "i / total" progress message.
for time_period_index, time_period in enumerate(dictionary_time_periods, start=1):
    dictionary_save_path = os.path.join(DICTIONARY_SAVE_PATH, f"{time_period[0]}_{time_period[1]}.json")
    model_save_path = os.path.join(MODEL_SAVE_PATH, f"{time_period[0]}_{time_period[1]}.model")

    # Both artefacts already on disk: nothing to do for this period.
    if os.path.exists(dictionary_save_path) and os.path.exists(model_save_path):
        continue

    print(f"Creating dictionary for {time_period[0]} - {time_period[1]}. {time_period_index} / {len(dictionary_time_periods)}")

    if os.path.exists(model_save_path):
        # Only the dictionary is missing (e.g. a previous run failed after the
        # model was saved): reuse the saved model instead of retraining it.
        model = gensim.models.Word2Vec.load(model_save_path)
    else:
        # Silence whatever the dataset prints while loading and mapping.
        with io.capture_output():
            dataset = CongressDataset(date_range=time_period)
            dataset.map(format_data_for_gensim)

        # window=5 context words; words seen fewer than 50 times are dropped
        # from the vocabulary.
        model = gensim.models.Word2Vec(dataset.data, vector_size=EMBEDDING_SIZE, window=5, min_count=50)
        model.save(model_save_path)

    moral_dictionary = calculate_dictionary(model)
    with open(dictionary_save_path, "w") as f:
        json.dump(moral_dictionary, f)

Creating dictionary for 2007 - 2016. 3 / 34
