In [1]:
import os
import json
from IPython.utils import io
import time
import math

import gensim
from tqdm import tqdm
import nltk
import gensim.downloader as api
import csv

from dataset import CongressDataset
from token_map import TokenMap, create_re_from_formatted_dictionary

In [2]:
# Enable the Intel(R) Extension for Scikit-learn patches for this kernel
# (the call prints a confirmation banner).
from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
# --- Embedding / training hyper-parameters -----------------------------------
EMBEDDING_SIZE = 300  # handed to Word2Vec as vector_size
NUM_EPOCHS = 30  # epochs used by the incremental model.train(...) calls
WORD_MIN_COUNT = 100  # handed to Word2Vec as min_count

# --- Dictionary-expansion switches -------------------------------------------
# DICTIONARY_SIMILARITY_THRESHOLD and NUM_RECURRENCES are only referenced from
# commented-out code in calculate_dictionary.
DICTIONARY_SIMILARITY_THRESHOLD = 0.5
NUM_RECURRENCES = 3
# When False, calculate_dictionary drops candidates that the token map reports as
# unique to the additional (wiki) corpus.
USE_WIKI = False
# When True, calculate_dictionary always keeps seed terms that are core terms.
MAINTAIN_CORE_WORDS = False

# --- Output locations (everything lives under ./data/<DICT_ID>/) --------------
DICT_ID = "no_wiki_continuous_v1"
PRETRAINED_MODEL_NAME = "pretrained_wiki_model.model"
DICTIONARY_SAVE_PATH = os.path.join(os.path.curdir, "data", DICT_ID, "json_dicts")
MODELS_SAVE_PATH = os.path.join(os.path.curdir, "data", DICT_ID, "embedding_models")

os.makedirs(DICTIONARY_SAVE_PATH, exist_ok=True)
os.makedirs(MODELS_SAVE_PATH, exist_ok=True)

In [4]:
# Dictionary Fine Tuning
# NOTE(review): LARGE_CLUSTER_WEIGHT and NUM_ENSAMBLE_RERUNS are not referenced by any
# code visible in this notebook — presumably leftovers from the clustering experiments
# that are commented out below; confirm before removing.
LARGE_CLUSTER_WEIGHT = 0.02
NUM_ENSAMBLE_RERUNS = 1
# NUM_CLUSTERS_TO_CHECK = range(2, 13)
# NUM_CLUSTER_RERUNS = 20

# calculate_median_sim_score multiplies a candidate's score by (1 + PREVIOUS_TERM_BOOST)
# when the candidate is already a member of the dictionary being expanded.
PREVIOUS_TERM_BOOST = 3
# Fractional bonus for candidates that show up repeatedly: used as a (1 + boost) multiplier
# per repeat in calculate_dictionary and as (1 + n * boost) in
# convert_list_of_scores_to_final_scores.
NUM_RECURRENCE_BOOST = 0.03

In [5]:
# Build (or load from the JSON cache) the list of unique tokens found in the
# "wiki-english-20171001" gensim dataset. `unique_tokens` is always a list, so the
# fresh-build and cache-hit paths hand the same type to downstream cells.
FORMATTED_WIKI_TOKENS_PATH = os.path.join(os.path.curdir, "data", DICT_ID, "formatted_wiki_unique_tokens_corpus.json")
if not os.path.exists(FORMATTED_WIKI_TOKENS_PATH):
    wiki_corpus = api.load("wiki-english-20171001")

    wiki_token_set = set()
    for article in tqdm(wiki_corpus):
        for section in article["section_texts"]:
            for sentence in nltk.sent_tokenize(section):
                wiki_token_set.update(nltk.word_tokenize(sentence))

    # Release the corpus handle before materialising the token list.
    del wiki_corpus

    # Sort so the cached file (and anything derived from its order) is reproducible;
    # iterating a set of strings directly is not stable across interpreter runs.
    unique_tokens = sorted(wiki_token_set)
    del wiki_token_set

    with open(FORMATTED_WIKI_TOKENS_PATH, "w", encoding="utf-8") as f:
        json.dump({"tokens": unique_tokens}, f)
else:
    with open(FORMATTED_WIKI_TOKENS_PATH, "r", encoding="utf-8") as f:
        unique_tokens = json.load(f)["tokens"]

In [6]:
# NOTE(review): this unconditionally discards the wiki tokens built/loaded by the previous
# cell, so TokenMap below receives an empty additional_corpus. Presumably intentional while
# USE_WIKI is False — confirm, and consider guarding it with `if not USE_WIKI:`.
unique_tokens = []

In [7]:
# Input corpus consumed by TokenMap.
DATA_PATH = os.path.join(os.path.curdir, "data", "cr_speech_sentences_with_speaker_and_date.csv")

# Hand-curated seed files live under ./data/<DICT_ID>/hand_curated/.
FORMATTED_DICT_PATH = os.path.join(os.path.curdir, "data", DICT_ID, "hand_curated", "formatted_hand_curated_dict.json")
CORE_TERMS_PATH = os.path.join(os.path.curdir, "data", DICT_ID, "hand_curated", "core_terms.json")

# Location handed to TokenMap as its load path (the captured output of this cell shows
# the map being loaded from disk).
token_map_load_path = os.path.join(os.path.curdir, "data", DICT_ID, "token_map")

# Regex derived from the hand-curated dictionary file; passed straight to TokenMap.
dictionary_re = create_re_from_formatted_dictionary(FORMATTED_DICT_PATH)

token_map = TokenMap(
    DATA_PATH,
    token_map_load_path,
    dictionary_re=dictionary_re,
    additional_corpus=unique_tokens,
)



Loading token map from disk...


In [8]:
# Store transformed WIKI CORPUS
# Each CSV row is one tokenized wiki sentence, written as the token ids that
# token_map assigns to the lower-cased words.

TRANSFORMED_WIKI_CORPUS_PATH = os.path.join(os.path.curdir, "data", DICT_ID, "transformed_wiki_corpus.csv")

if not os.path.exists(TRANSFORMED_WIKI_CORPUS_PATH):
    # Load the corpus before touching the filesystem so a failed download cannot
    # leave an empty file behind that later runs would mistake for a finished cache.
    wiki_corpus = api.load("wiki-english-20171001")

    # Write to a temporary sibling and rename at the end: an interrupted run then
    # never produces a truncated file at the final cache path.
    partial_corpus_path = TRANSFORMED_WIKI_CORPUS_PATH + ".partial"

    # newline="" is required by the csv module to avoid doubled line endings on
    # platforms that translate "\n".
    with open(partial_corpus_path, "w", newline="") as f:
        writer = csv.writer(f)

        for article in tqdm(wiki_corpus):
            for section in article["section_texts"]:
                for sentence in nltk.sent_tokenize(section):
                    words = nltk.word_tokenize(sentence)
                    writer.writerow([token_map.get_token_id_from_token(word.lower()) for word in words])

    os.replace(partial_corpus_path, TRANSFORMED_WIKI_CORPUS_PATH)
    del wiki_corpus



In [9]:
def _train_wiki_chunk(current_model, sentences, save_path):
    """Train (first chunk) or incrementally update (later chunks) the wiki Word2Vec model.

    Args:
        current_model: the Word2Vec model built so far, or None for the first chunk.
        sentences: list of sentences, each a list of integer token ids.
        save_path: where the model is checkpointed after the chunk is processed.

    Returns:
        The trained/updated model.
    """
    if current_model is None:
        print("Training model")
        # NOTE(review): this initial fit uses gensim's default epoch count while the
        # updates below use NUM_EPOCHS — kept as in the original; confirm it is intended.
        current_model = gensim.models.Word2Vec(sentences, workers=12, vector_size=EMBEDDING_SIZE, window=5, min_count=WORD_MIN_COUNT)
    else:
        print("Updating model")
        current_model.build_vocab(sentences, update=True)
        current_model.train(sentences, total_examples=len(sentences), epochs=NUM_EPOCHS)
    current_model.save(save_path)
    return current_model


pretrained_model_path = os.path.join(MODELS_SAVE_PATH, PRETRAINED_MODEL_NAME)

if os.path.exists(pretrained_model_path):
    print("Loading pretrained model")
    model = gensim.models.Word2Vec.load(pretrained_model_path)
else:
    # Number of sentences held in memory per training chunk.
    MAX_WIKI_TRAIN_SIZE = 10_000_000
    model = None

    transformed_wiki_data = []
    with open(TRANSFORMED_WIKI_CORPUS_PATH, "r", newline="") as f:
        for row in tqdm(csv.reader(f)):
            transformed_wiki_data.append([int(token_id) for token_id in row])

            if len(transformed_wiki_data) >= MAX_WIKI_TRAIN_SIZE:
                # The original update branch trained on an always-empty list
                # (`transformed_wiki_corpus`); train on the chunk actually read.
                model = _train_wiki_chunk(model, transformed_wiki_data, pretrained_model_path)
                transformed_wiki_data = []

    # Do not drop the final, partially filled chunk (it is the *only* chunk when
    # the corpus holds fewer than MAX_WIKI_TRAIN_SIZE sentences).
    if transformed_wiki_data:
        model = _train_wiki_chunk(model, transformed_wiki_data, pretrained_model_path)
    del transformed_wiki_data

    if model is None:
        raise RuntimeError(f"No sentences found in {TRANSFORMED_WIKI_CORPUS_PATH}; cannot pretrain the model.")

Loading pretrained model


In [10]:
start_year = 1873
end_year = 2000
PERIOD_LENGTH_YEARS = 20

# Consecutive PERIOD_LENGTH_YEARS-year windows (inclusive bounds) starting at start_year.
# The last window is clamped to end_year instead of being patched by hand afterwards.
dictionary_time_periods = [
    (period_start, min(period_start + PERIOD_LENGTH_YEARS - 1, end_year))
    for period_start in range(start_year, end_year - 1, PERIOD_LENGTH_YEARS)
]

# Most recent period first.
dictionary_time_periods.sort(reverse=True)

# The period after end_year is prepended explicitly, so it is the first one processed.
dictionary_time_periods = [(2001, 2020)] + dictionary_time_periods

print(len(dictionary_time_periods))
print(dictionary_time_periods[-10:])

8
[(2001, 2020), (1993, 2000), (1973, 1992), (1953, 1972), (1933, 1952), (1913, 1932), (1893, 1912), (1873, 1892)]


In [11]:
# Load the starting dictionaries
with open(FORMATTED_DICT_PATH, "r") as dict_file:
    curr_dicts = json.load(dict_file)

# The core-terms file stores its terms under the "core" key.
with open(CORE_TERMS_PATH, "r") as core_file:
    core_terms = set(json.load(core_file)["core"])

print(core_terms)


# Same core terms, expressed as token ids from token_map.
translated_core_terms = {token_map.get_token_id_from_token(core_term) for core_term in core_terms}

print(translated_core_terms)


{'discrimination', 'judicious', 'chaste', 'poverty', 'obligation', 'forgive', 'suffering', 'cheat', 'obedience', 'defiance', 'disloyal', 'collegial', 'patriotism', 'biased', 'prejudiced', 'disrespect', 'patriot', 'hated', 'belong', 'ethical', 'honest', 'honesty', 'untrustworthy', 'dishonorable', 'compliance', 'crime', 'prideful', 'alienated', 'unfair', 'equity', 'lie', 'American', 'equality', 'belonging', 'authoritative', 'suffered', 'disrespectful', 'corruption', 'illegitimate', 'cheated', 'virtuous', 'pornographic', 'treason', 'lusty', 'agitated', 'forgiven', 'member', 'modesty', 'hateful', 'sexual', 'disrespected', 'impartially', 'membership', 'duty', 'betray', 'suffer', 'impartiality', 'disloyalty', 'pervert', 'partisan', 'leader', 'party', 'wise', 'lust', 'disobey', 'undemocratic', 'betrayal', 'prejudice', 'charity', 'segregated', 'patriotic', 'hurt', 'pornography', 'nation', 'obeyed', 'traitors', 'attacked', 'abuse', 'defy', 'virtue', 'graft', 'compassion', 'belongs', 'authority'

In [12]:
def format_data_for_gensim(data_item):
    """Return the value stored under the "sentence" key of one dataset item."""
    sentence = data_item["sentence"]
    return sentence

In [13]:
def calculate_average_sim_score(word, dictionary_terms, model):
    """Mean similarity between `word` and every other scorable dictionary term.

    Args:
        word: key of the candidate word in `model.wv`.
        dictionary_terms: iterable of keys of the current dictionary terms.
        model: object exposing `model.wv.similarity(a, b)`.

    Returns:
        The average of `model.wv.similarity(word, term)` over all terms other than
        `word` for which the lookup does not raise KeyError, or 0.0 when no term
        could be scored (the original raised ZeroDivisionError in that case).
    """
    sim_scores = []

    for term in dictionary_terms:
        if term == word:
            continue
        try:
            sim_scores.append(model.wv.similarity(word, term))
        except KeyError:
            # The model has no vector for one of the two keys; skip this pair.
            continue

    if not sim_scores:
        return 0.0

    return sum(sim_scores) / len(sim_scores)

In [14]:
def calculate_median_sim_score(word, dictionary_terms, model):
    """Median similarity between `word` and the other dictionary terms, with a membership boost.

    Args:
        word: key of the candidate word in `model.wv`.
        dictionary_terms: collection of keys of the current dictionary terms.
        model: object exposing `model.wv.similarity(a, b)`.

    Returns:
        The upper median (element at index len // 2 of the sorted scores) of
        `model.wv.similarity(word, term)` over all terms other than `word` whose
        lookup does not raise KeyError. If `word` is itself in `dictionary_terms`
        the value is multiplied by (1 + PREVIOUS_TERM_BOOST). Returns 0.0 when no
        term could be scored (the original raised IndexError in that case).
    """
    all_sim_scores = []

    for term in dictionary_terms:
        if term == word:
            continue
        try:
            all_sim_scores.append(model.wv.similarity(word, term))
        except KeyError:
            # The model has no vector for one of the two keys; skip this pair.
            continue

    if not all_sim_scores:
        return 0.0

    all_sim_scores.sort()
    median_score = all_sim_scores[len(all_sim_scores) // 2]

    if word in dictionary_terms:
        # Boost the score of words that are already in the dictionary
        median_score = median_score * (1 + PREVIOUS_TERM_BOOST)

    return median_score

In [15]:
# Sanity check of the flag: True here means calculate_dictionary will take its
# `if not USE_WIKI:` branch and filter candidates unique to the additional corpus.
print(not USE_WIKI)

True


In [16]:
def calculate_dictionary(model, dictionary_terms, previous_dictionary_size: int, core_terms: set):
    """Expand a dictionary by ranking the nearest neighbours of its current terms.

    For every seed term, the `previous_dictionary_size` most similar words are
    fetched from `model.wv`. A word's score is its median similarity to the seed
    terms (see calculate_median_sim_score) the first time it is seen, and is
    multiplied by (1 + NUM_RECURRENCE_BOOST) each further time it shows up as a
    neighbour. The highest scoring words fill the new dictionary.

    Args:
        model: embedding model exposing `model.wv.most_similar`.
        dictionary_terms: token ids of the current dictionary terms.
        previous_dictionary_size: number of neighbours fetched per seed term and
            the maximum size of the new dictionary.
        core_terms: token ids that are always kept when MAINTAIN_CORE_WORDS is
            set (only seed terms that are core terms are kept).

    Returns:
        (terms, scores): a list of selected token ids and a dict mapping each of
        them to its score (math.inf for retained core terms). Tokens that map to
        the token map's unknown token are removed and not back-filled.
    """
    new_term_scores = {}

    for term in dictionary_terms:
        try:
            top_matching_words = model.wv.most_similar(term, topn=previous_dictionary_size)

            if not USE_WIKI:
                top_matching_words = [
                    (word, sim_score)
                    for word, sim_score in top_matching_words
                    if not token_map.token_id_unique_to_additional_corpus(word)
                ]
                if len(top_matching_words) < previous_dictionary_size:
                    print(f"We removed {previous_dictionary_size - len(top_matching_words)} words.")

            for word, _sim_score in top_matching_words:
                if word not in new_term_scores:
                    # Only needs to be calculated once per candidate
                    new_term_scores[word] = calculate_median_sim_score(word, dictionary_terms, model)
                else:
                    # Boost the score of words that appear more times
                    new_term_scores[word] = new_term_scores[word] * (1 + NUM_RECURRENCE_BOOST)
        except KeyError:
            # Best effort, as in the original: a seed term the model does not know
            # (or any other KeyError while processing it) skips the rest of that term.
            continue

    # Candidates from best to worst score (stable for ties).
    ranked_candidates = sorted(new_term_scores.items(), key=lambda item: item[1], reverse=True)

    final_dictionary_scores = {}
    final_dictionary_terms = set()
    if MAINTAIN_CORE_WORDS:
        for term in dictionary_terms:
            if term in core_terms:
                final_dictionary_terms.add(term)
                final_dictionary_scores[term] = math.inf

    # Add words based on similarity score until the dictionary is full
    for term, score in ranked_candidates:
        if term in final_dictionary_terms:
            continue
        if len(final_dictionary_terms) >= previous_dictionary_size:
            break
        final_dictionary_terms.add(term)
        final_dictionary_scores[term] = score

    # Remove the unknown token if it is in the dictionary: it is meaningless.
    if token_map.unkown_id in final_dictionary_terms:
        final_dictionary_terms.remove(token_map.unkown_id)
        final_dictionary_scores.pop(token_map.unkown_id, None)

    # Iterate over a snapshot: removing from the set while iterating over it
    # raises "RuntimeError: Set changed size during iteration".
    for term in list(final_dictionary_terms):
        if token_map.get_token_from_id(term) == token_map.unkown_token:
            print("Removed unknown token from dictionary")
            final_dictionary_scores.pop(term, None)
            final_dictionary_terms.remove(term)

    return list(final_dictionary_terms), final_dictionary_scores

In [17]:
def convert_list_of_scores_to_final_scores(list_scores_dict):
    """Collapse per-term score lists into one score per term and rank the terms.

    Each term's final score is the mean of its scores multiplied by
    (1 + n * NUM_RECURRENCE_BOOST), where n is the number of scores recorded for
    the term. Terms with an empty score list are skipped.

    Args:
        list_scores_dict: dict mapping a token id to a list of scores.

    Returns:
        (final_dictionary, final_scores): two parallel lists sorted by final score
        in descending order — the terms translated back to tokens via token_map,
        and their final scores.
    """
    final_scores = {}

    for term, score_list in list_scores_dict.items():
        num_recurrences = len(score_list)
        if num_recurrences == 0:
            # Nothing to average; avoids a ZeroDivisionError.
            continue

        # Each term gets a percentage boost over its average score that grows with
        # the number of times it occurred.
        # TODO: Consider using the median here instead of the average
        final_scores[term] = (sum(score_list) / num_recurrences) * (1 + (num_recurrences * NUM_RECURRENCE_BOOST))

    # Sort once and derive both outputs from the same ranking.
    ranked = sorted(final_scores.items(), key=lambda item: item[1], reverse=True)
    final_dictionary = [token_map.get_token_from_id(term) for term, _ in ranked]
    ranked_scores = [score for _, score in ranked]
    return final_dictionary, ranked_scores

In [18]:
def combine_ensamble_dicts(ensamble_dicts, desired_dict_sizes):
    """Build the final dictionary of every moral foundation from its collected scores.

    For each foundation the per-term score lists are ranked with
    convert_list_of_scores_to_final_scores and the best
    `desired_dict_sizes[foundation]` terms are kept.

    Returns:
        dict mapping each foundation name to its list of kept terms, best first.
    """
    combined_dict = {}

    for moral_foundation, score_lists_by_term in ensamble_dicts.items():
        ranked_terms, _ranked_scores = convert_list_of_scores_to_final_scores(score_lists_by_term)
        size_limit = desired_dict_sizes[moral_foundation]
        combined_dict[moral_foundation] = ranked_terms[:size_limit]

    return combined_dict

In [19]:
# Walk the time periods in list order (most recent first). For each period the shared
# `model` is either loaded from its checkpoint or further trained on that period's
# speeches, then every dictionary in `curr_dicts` is re-derived from the model and the
# result becomes the seed for the next period.
for i, time_period in enumerate(dictionary_time_periods):
    term_counts_and_scores_by_dictionary = {}
    desired_dict_sizes = {}
    print(f"The current time period is {time_period}.  {i+1}/{len(dictionary_time_periods)}")
    model_save_path = os.path.join(MODELS_SAVE_PATH, f"{time_period[0]}_{time_period[1]}.model")

    if not os.path.exists(model_save_path):
        # The period is trained in two halves, one dataset at a time.
        # NOTE(review): both halves include middle_date; if CongressDataset treats
        # date_range as inclusive on both ends that year is trained twice — confirm.
        middle_date = (time_period[0] + time_period[1]) // 2
        first_time_period = (time_period[0], middle_date)
        second_time_period = (middle_date, time_period[1])

        for half_period in (first_time_period, second_time_period):
            dataset = CongressDataset(token_map=token_map, date_range=half_period)
            dataset.map(format_data_for_gensim)

            model.build_vocab(dataset.data, update=True)
            model.train(dataset.data, epochs=NUM_EPOCHS, total_examples=len(dataset.data))
            del dataset

        model.save(model_save_path)
    else:
        model = gensim.models.Word2Vec.load(model_save_path)

    for dict_name, dict_terms in curr_dicts.items():
        desired_dict_sizes[dict_name] = len(dict_terms)

        # Keep only the seed terms the current model has a vector for
        # (one token-map lookup per term).
        translated_terms = []
        for term in dict_terms:
            token_id = token_map.get_token_id_from_token(term)
            if token_id in model.wv:
                translated_terms.append(token_id)

        if not translated_terms:
            # Previously this case failed inside an (unused) embedding lookup with an
            # obscure error; fail with an actionable message instead.
            raise ValueError(f"None of the terms of dictionary '{dict_name}' are in the model vocabulary for {time_period}.")

        # Calculate the new dictionary terms
        _, new_term_scores = calculate_dictionary(model, translated_terms, len(dict_terms), translated_core_terms)

        # One score per term for this period, stored as a list because
        # combine_ensamble_dicts expects score lists.
        term_counts_and_scores_by_dictionary[dict_name] = {term: [score] for term, score in new_term_scores.items()}

    curr_dicts = combine_ensamble_dicts(term_counts_and_scores_by_dictionary, desired_dict_sizes)

    # Save the new completed dictionary
    new_dict_name = f"{time_period[0]}-{time_period[1]}_recursive_dict.json"
    new_dict_path = os.path.join(DICTIONARY_SAVE_PATH, new_dict_name)
    with open(new_dict_path, "w") as f:
        json.dump(curr_dicts, f)

The current time period is (2001, 2020).  1/8
The current time period is (1993, 2000).  2/8
The current time period is (1973, 1992).  3/8
The current time period is (1953, 1972).  4/8
The current time period is (1933, 1952).  5/8
The current time period is (1913, 1932).  6/8
The current time period is (1893, 1912).  7/8
The current time period is (1873, 1892).  8/8
