In [19]:
import os
import json
from IPython.utils import io
import time

import gensim
from tqdm import tqdm

from dataset import CongressDataset
from token_map import TokenMap, create_re_from_formatted_dictionary

In [20]:
# Enable the Intel extension's accelerated scikit-learn implementations.
# This runs before the notebook's first `sklearn` import (the clustering cell
# further down), so those imports pick up the patched estimators.
from sklearnex import patch_sklearn;
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [21]:
# Word2Vec settings: embedding dimensionality and the minimum corpus frequency
# a token needs before it is given a vector.
EMBEDDING_SIZE = 300
# NOTE(review): DICTIONARY_SIMILARITY_THRESHOLD and NUM_RECURRENCES are only
# referenced from commented-out code in this notebook.
DICTIONARY_SIMILARITY_THRESHOLD = 0.5
WORD_MIN_COUNT = 100
NUM_RECURRENCES = 3

# Directory that receives one "<start>-<end>_recursive_dict.json" per time period.
DICTIONARY_SAVE_PATH = os.path.join(os.path.curdir, "data", "ensamble_fine_tuning", "json_dicts")
os.makedirs(DICTIONARY_SAVE_PATH, exist_ok=True)
# Directory where trained Word2Vec models are cached and re-loaded from.
# NOTE(review): this sits under "ensamble_cluster_smart_dicts" while the
# dictionaries go to "ensamble_fine_tuning" -- confirm the model cache is
# intentionally shared with another experiment.
MODELS_SAVE_PATH = os.path.join(os.path.curdir, "data", "ensamble_cluster_smart_dicts", "embedding_models")
os.makedirs(MODELS_SAVE_PATH, exist_ok=True)

In [22]:
# Dictionary Fine Tuning
# Per-extra-cluster penalty, expressed as a fraction of the best silhouette
# score, used when choosing the number of clusters.
LARGE_CLUSTER_WEIGHT = 0.02
# Number of Word2Vec models in the ensemble for each time period.
NUM_ENSAMBLE_RERUNS = 20
# Candidate cluster counts, and how many times the whole sweep is repeated.
NUM_CLUSTERS_TO_CHECK = range(2, 13)
NUM_CLUSTER_RERUNS = 20

# A candidate that is already in the dictionary has its score multiplied by
# (1 + PREVIOUS_TERM_BOOST).
PREVIOUS_TERM_BOOST = 3
# Percentage bonus applied per repeated appearance of a candidate word.
NUM_RECURRENCE_BOOST = 0.03

In [23]:
# Corpus CSV handed to TokenMap.
# NOTE(review): the file name suggests one speech sentence per row with
# speaker and date columns -- confirm against the dataset module.
DATA_PATH = os.path.join(
        os.path.curdir, "data", "cr_speech_sentences_with_speaker_and_date.csv"
    )
# Seed dictionary (2001-2020) from which every earlier period is derived.
FORMATTED_DICT_PATH = os.path.join(
    os.path.curdir,
    "data",
    "revised_normalized_smart_dicts", 
    "json_dicts", 
    "2001-2020_recursive_dict.json"
)
# Location of the persisted token map (the cell output reports it being loaded from disk).
token_map_load_path = os.path.join(os.path.curdir, "data", "revised_normalized_smart_dicts", "token_map")

# Regular expression built from the seed dictionary and passed to TokenMap.
dictionary_re = create_re_from_formatted_dictionary(FORMATTED_DICT_PATH)

# Token <-> id mapping used by every later cell.
token_map = TokenMap(DATA_PATH, token_map_load_path, dictionary_re=dictionary_re)

Loading token map from disk...


In [24]:
# Historical range covered by the 20-year windows (both ends inclusive).
start_year = 1873
end_year = 2000
window_length = 20

# One (first_year, last_year) window every `window_length` years, starting at
# `start_year`. A window that would run past `end_year` is clamped to it,
# which turns the final window into (1993, 2000).
dictionary_time_periods = [
    (window_start, min(window_start + window_length - 1, end_year))
    for window_start in range(start_year, end_year - 1, window_length)
]

# Periods are processed newest-first: each period's dictionary is derived from
# the dictionary of the period processed just before it.
dictionary_time_periods.reverse()

# The modern seed period goes first; its dictionary is the starting point.
dictionary_time_periods = [(2001, 2020)] + dictionary_time_periods

print(len(dictionary_time_periods))
print(dictionary_time_periods[-10:])

8
[(2001, 2020), (1993, 2000), (1973, 1992), (1953, 1972), (1933, 1952), (1913, 1932), (1893, 1912), (1873, 1892)]


In [25]:
# Load the starting dictionaries (the seed every earlier period is derived from).
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(FORMATTED_DICT_PATH, "r", encoding="utf-8") as f:
    curr_dicts = json.load(f)

In [26]:
def format_data_for_gensim(data_item):
    """Return the ``"sentence"`` field of a dataset item.

    Used as the mapping function applied to the dataset before it is handed
    to gensim for training.
    """
    sentence = data_item["sentence"]
    return sentence

In [27]:
def calculate_average_sim_score(word, dictionary_terms, model):
    """Mean similarity between ``word`` and the other dictionary terms.

    Args:
        word: Candidate token (in whatever key type ``model.wv`` uses).
        dictionary_terms: Iterable of dictionary tokens to compare against.
            ``word`` itself is skipped if it appears here.
        model: Object exposing ``model.wv.similarity(a, b)``; a ``KeyError``
            from that call (token missing from the vocabulary) is ignored.

    Returns:
        The average similarity over all comparable terms, or ``0.0`` when no
        term could be compared (previously this raised ``ZeroDivisionError``).
    """
    tot_sim_score = 0
    num_valid_terms = 0

    for term in dictionary_terms:
        if term == word:
            continue
        try:
            tot_sim_score += model.wv.similarity(word, term)
        except KeyError:
            # Term (or word) is not in this model's vocabulary.
            continue
        num_valid_terms += 1

    if num_valid_terms == 0:
        return 0.0
    return tot_sim_score / num_valid_terms

In [28]:
def calculate_median_sim_score(word, dictionary_terms, model):
    """Median similarity between ``word`` and the other dictionary terms.

    Args:
        word: Candidate token (in whatever key type ``model.wv`` uses).
        dictionary_terms: Collection of dictionary tokens to compare against.
            ``word`` itself is skipped if it appears here.
        model: Object exposing ``model.wv.similarity(a, b)``; a ``KeyError``
            from that call (token missing from the vocabulary) is ignored.

    Returns:
        The middle element of the sorted similarity scores (the upper of the
        two middle values when the count is even). If ``word`` is already a
        dictionary term the result is multiplied by
        ``1 + PREVIOUS_TERM_BOOST``. Returns ``0.0`` when no term could be
        compared.
    """
    all_sim_scores = []

    for term in dictionary_terms:
        if term == word:
            continue
        try:
            all_sim_scores.append(model.wv.similarity(word, term))
        except KeyError:
            # Term (or word) is not in this model's vocabulary; skip it.
            # (A stray pasted expression here used to raise NameError.)
            continue

    if not all_sim_scores:
        # Nothing comparable: avoid indexing into an empty list.
        return 0.0

    all_sim_scores.sort()
    median_score = all_sim_scores[len(all_sim_scores) // 2]

    if word in dictionary_terms:
        # Boost the score of words that are already in the dictionary
        median_score = median_score * (1 + PREVIOUS_TERM_BOOST)

    return median_score

In [29]:
def calculate_dictionary(model, dictionary_terms, previous_dictionary_size: int):
    """Propose a replacement dictionary from a model's nearest neighbours.

    For every seed term, the ``previous_dictionary_size`` most similar words
    are fetched from ``model.wv``. The first time a word shows up it is scored
    with ``calculate_median_sim_score``; every further appearance multiplies
    its score by ``1 + NUM_RECURRENCE_BOOST``. The highest-scoring words are
    kept, up to ``previous_dictionary_size`` of them, and the unknown token is
    then dropped if it made the cut (so the result can be one entry short).

    NOTE(review): the recurrence boost compounds multiplicatively, so with a
    large seed dictionary a frequently returned word can end up with a score
    far above 1 -- confirm this is the intended weighting.

    Args:
        model: Trained embedding model exposing ``model.wv.most_similar``.
        dictionary_terms: Seed terms, as keys of ``model.wv``.
        previous_dictionary_size: Neighbours requested per seed term and
            maximum size of the returned dictionary.

    Returns:
        ``(terms, scores)`` where ``scores`` maps each kept term to its score
        and ``terms`` lists the same keys in descending score order.
    """
    new_term_scores = {}

    for term in dictionary_terms:
        try:
            top_matching_words = model.wv.most_similar(term, topn=previous_dictionary_size)
        except KeyError:
            # The seed term is not in this model's vocabulary.
            continue

        for word, _sim_score in top_matching_words:
            if word not in new_term_scores:
                # Only needs to be calculated once per word.
                new_term_scores[word] = calculate_median_sim_score(word, dictionary_terms, model)
            else:
                # Boost the score of words that appear more times.
                new_term_scores[word] = new_term_scores[word] * (1 + NUM_RECURRENCE_BOOST)

    # Stable sort: ties keep the order in which the words were first seen.
    ranked_terms = sorted(new_term_scores.items(), key=lambda item: item[1], reverse=True)

    # Add words based on score until the dictionary is full.
    final_dictionary_scores = dict(ranked_terms[:max(previous_dictionary_size, 0)])

    # Remove the unknown token if it is in the dictionary; it is meaningless.
    final_dictionary_scores.pop(token_map.unkown_id, None)

    return list(final_dictionary_scores), final_dictionary_scores

In [30]:
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score

def calculate_cluster_centroids(embeddings):
    """Pick a K-Means clustering of ``embeddings`` by penalised silhouette score.

    The sweep over ``NUM_CLUSTERS_TO_CHECK`` is repeated ``NUM_CLUSTER_RERUNS``
    times. A clustering replaces the current best only if its silhouette score
    still beats the best score after subtracting a penalty of
    ``LARGE_CLUSTER_WEIGHT * best_score`` for every cluster it has beyond the
    current best's cluster count.

    Despite the name, centroids are not returned.

    Args:
        embeddings: Sequence or array of embedding vectors, one per item.

    Returns:
        ``(num_clusters, cluster_labels)`` for the selected clustering.

    Raises:
        ValueError: If no clustering qualified, e.g. too few embeddings or no
            positive silhouette score. (Previously this surfaced as a
            ``TypeError`` from ``set(None)``.)
    """
    num_samples = len(embeddings)

    optimal_num_clusters = 0
    best_silhouette_score = 0
    best_cluster_labels = None

    for _ in range(NUM_CLUSTER_RERUNS):
        for requested_clusters in NUM_CLUSTERS_TO_CHECK:
            if requested_clusters > num_samples:
                # KMeans refuses to fit more clusters than there are samples.
                continue

            kmeans = KMeans(n_clusters=requested_clusters, init="k-means++", max_iter=2000, tol=1e-5).fit(embeddings)
            cluster_labels = kmeans.labels_
            # Count the clusters that were actually used.
            num_clusters = len(set(cluster_labels))

            # silhouette_score needs between 2 and n_samples - 1 distinct labels.
            if not 1 < num_clusters < num_samples:
                continue

            silhouette_avg = silhouette_score(embeddings, cluster_labels)
            if silhouette_avg <= best_silhouette_score:
                continue

            # We want to discourage large numbers of clusters, so negatively
            # weight every cluster beyond the current best's count.
            extra_clusters = max(num_clusters - optimal_num_clusters, 0)
            penalty = extra_clusters * (best_silhouette_score * LARGE_CLUSTER_WEIGHT)
            if silhouette_avg - penalty > best_silhouette_score:
                best_silhouette_score = silhouette_avg
                optimal_num_clusters = num_clusters
                best_cluster_labels = cluster_labels

    if best_cluster_labels is None:
        raise ValueError(
            "No valid clustering found: need enough embeddings for at least two "
            "clusters with a positive silhouette score."
        )

    return len(set(best_cluster_labels)), best_cluster_labels


In [31]:
def calculate_cluster_distributions(cluster_labels, num_clusters):
    """Count how many items were assigned to each cluster.

    Every cluster id in ``range(num_clusters)`` is present in the result, with
    a count of zero if nothing was assigned to it. A label outside that range
    raises ``KeyError``.
    """
    counts = dict.fromkeys(range(num_clusters), 0)

    for assigned_cluster in cluster_labels:
        counts[assigned_cluster] = counts[assigned_cluster] + 1

    return counts

In [32]:
def convert_list_of_scores_to_final_scores(list_scores_dict):
    """Collapse per-model score lists into one ranked dictionary.

    Each term's final score is the mean of its scores multiplied by
    ``1 + len(scores) * NUM_RECURRENCE_BOOST``, so terms proposed by more
    ensemble members get a percentage boost over their average.

    Args:
        list_scores_dict: Mapping of token id -> list of scores, one score per
            ensemble member that proposed the term. Terms with an empty list
            are skipped.

    Returns:
        ``(terms, scores)``: two parallel lists sorted by final score in
        descending order. ``terms`` holds the tokens translated back from
        their ids via ``token_map``; ``scores`` holds the matching scores.
    """
    final_scores = {}

    for term, score_lists in list_scores_dict.items():
        num_recurrences = len(score_lists)
        if num_recurrences == 0:
            # No scores to average; avoids a division by zero.
            continue

        # TODO: Consider using the median here instead of the average
        final_scores[term] = (sum(score_lists) / num_recurrences) * (1 + (num_recurrences * NUM_RECURRENCE_BOOST))

    # Sort once (stable, so ties keep insertion order) and derive both lists
    # from the same ranking.
    ranked = sorted(final_scores.items(), key=lambda item: item[1], reverse=True)
    final_dictionary = [token_map.get_token_from_id(term_id) for term_id, _ in ranked]
    ranked_scores = [score for _, score in ranked]
    return final_dictionary, ranked_scores

In [33]:
def combine_ensamble_dicts(ensamble_dicts, desired_dict_sizes):
    """Merge the ensemble's proposals into one dictionary per moral foundation.

    Args:
        ensamble_dicts: Mapping of foundation name -> {token id: [scores]},
            as accumulated over all ensemble members.
        desired_dict_sizes: Mapping of foundation name -> number of terms to
            keep for that foundation.

    Returns:
        Mapping of foundation name -> list of the top-ranked terms (as tokens),
        truncated to the desired size. The size and average score of every
        dictionary are printed; the average is ``nan`` for an empty dictionary.
    """
    combined_dict = {}

    for moral_foundation, dict_terms in ensamble_dicts.items():
        # Convert list of scores to final scores
        sorted_terms, sorted_scores = convert_list_of_scores_to_final_scores(dict_terms)

        target_size = desired_dict_sizes[moral_foundation]
        final_dict_terms = sorted_terms[:target_size]
        final_dict_scores = sorted_scores[:target_size]

        # An empty dictionary used to raise ZeroDivisionError here.
        if final_dict_scores:
            average_score = sum(final_dict_scores) / len(final_dict_scores)
        else:
            average_score = float("nan")

        print(f"Final dictionary size for {moral_foundation}: {len(final_dict_terms)}")
        print(f"Average score for {moral_foundation}: {average_score}")

        combined_dict[moral_foundation] = final_dict_terms

    return combined_dict

In [34]:
# Walk the periods newest-first. For each period an ensemble of Word2Vec models
# proposes replacement terms for the current dictionaries; the merged result is
# written to disk and becomes the starting point for the next (older) period.
# NOTE: this cell rebinds `curr_dicts`, so re-run the cell that loads the
# starting dictionaries before re-running this one.
for i, time_period in enumerate(dictionary_time_periods):
    # {dictionary name: {token id: [score from each model that proposed it]}}
    term_counts_and_scores_by_dictionary = {}
    # Each dictionary keeps the size it had when this period started.
    desired_dict_sizes = {dict_name: len(dict_terms) for dict_name, dict_terms in curr_dicts.items()}

    print(f"The current time period is {time_period}.  {i+1}/{len(dictionary_time_periods)}")
    p_bar = tqdm(range(NUM_ENSAMBLE_RERUNS), desc="Calculating dictionary")
    for x in p_bar:
        model_save_path = os.path.join(MODELS_SAVE_PATH, f"{x}_{time_period[0]}_{time_period[1]}.model")

        if os.path.exists(model_save_path):
            # Reuse a previously trained ensemble member.
            model = gensim.models.Word2Vec.load(model_save_path)
        else:
            # Load the data
            dataset = CongressDataset(token_map=token_map, date_range=time_period)
            dataset.map(format_data_for_gensim)

            # Train and save the new model
            # NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x
            # renamed it to `vector_size` -- confirm the pinned version.
            model = gensim.models.Word2Vec(dataset, size=EMBEDDING_SIZE, min_count=WORD_MIN_COUNT, workers=12)
            model.save(model_save_path)

        for dict_name, dict_terms in curr_dicts.items():
            scores_by_term = term_counts_and_scores_by_dictionary.setdefault(dict_name, {})

            # Translate the existing terms to token ids (one lookup per term)
            # and keep only those present in this model's vocabulary.
            candidate_ids = (token_map.get_token_id_from_token(term) for term in dict_terms)
            translated_terms = [term_id for term_id in candidate_ids if term_id in model.wv]

            # Calculate the new dictionary terms
            _, new_term_scores = calculate_dictionary(model, translated_terms, len(dict_terms))

            for term, score in new_term_scores.items():
                scores_by_term.setdefault(term, []).append(score)

    curr_dicts = combine_ensamble_dicts(term_counts_and_scores_by_dictionary, desired_dict_sizes)

    # Save the new completed dictionary
    new_dict_name = f"{time_period[0]}-{time_period[1]}_recursive_dict.json"
    new_dict_path = os.path.join(DICTIONARY_SAVE_PATH, new_dict_name)
    with open(new_dict_path, "w", encoding="utf-8") as f:
        json.dump(curr_dicts, f)

The current time period is (2001, 2020).  1/8


Calculating dictionary: 100%|██████████| 20/20 [11:01<00:00, 33.05s/it]


Final dictionary size for Harm: 97
Average score for Harm: 0.6725509970279776
Final dictionary size for Authority: 159
Average score for Authority: 0.35238193153155456
Final dictionary size for Fairness: 85
Average score for Fairness: 0.6687338615622314
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 2.1073226558391864
Final dictionary size for Ingroup: 122
Average score for Ingroup: 0.3510243878094075
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 0.6107474650747837
The current time period is (1993, 2000).  2/8


Calculating dictionary: 100%|██████████| 20/20 [08:35<00:00, 25.80s/it]


Final dictionary size for Harm: 97
Average score for Harm: 1.366562112361466
Final dictionary size for Authority: 159
Average score for Authority: 1.1703353846955982
Final dictionary size for Fairness: 85
Average score for Fairness: 1.665326845509773
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 71.39386449251744
Final dictionary size for Ingroup: 122
Average score for Ingroup: 1.4486743085253013
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 2.154338559027901
The current time period is (1973, 1992).  3/8


Calculating dictionary: 100%|██████████| 20/20 [07:13<00:00, 21.69s/it]


Final dictionary size for Harm: 97
Average score for Harm: 2.053922802604986
Final dictionary size for Authority: 159
Average score for Authority: 1.9696715458679355
Final dictionary size for Fairness: 85
Average score for Fairness: 3.197324473690881
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 144.74328300572517
Final dictionary size for Ingroup: 122
Average score for Ingroup: 2.1552388369481674
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 2.299594514488236
The current time period is (1953, 1972).  4/8


Calculating dictionary: 100%|██████████| 20/20 [07:22<00:00, 22.10s/it]


Final dictionary size for Harm: 97
Average score for Harm: 2.1374943050079804
Final dictionary size for Authority: 159
Average score for Authority: 2.624858065160791
Final dictionary size for Fairness: 85
Average score for Fairness: 4.437114519310292
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 167.31718290592593
Final dictionary size for Ingroup: 122
Average score for Ingroup: 2.692563492733906
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 1.8640751595818938
The current time period is (1933, 1952).  5/8


Calculating dictionary: 100%|██████████| 20/20 [05:48<00:00, 17.43s/it]


Final dictionary size for Harm: 97
Average score for Harm: 2.5075894381692874
Final dictionary size for Authority: 159
Average score for Authority: 3.3274933114910756
Final dictionary size for Fairness: 85
Average score for Fairness: 4.7022972457260765
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 87.44002494684513
Final dictionary size for Ingroup: 122
Average score for Ingroup: 2.7069900304288392
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 2.0919637468833407
The current time period is (1913, 1932).  6/8


Calculating dictionary: 100%|██████████| 20/20 [07:00<00:00, 21.04s/it]


Final dictionary size for Harm: 97
Average score for Harm: 3.026545867613879
Final dictionary size for Authority: 159
Average score for Authority: 7.273428048182514
Final dictionary size for Fairness: 85
Average score for Fairness: 5.4240848391917975
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 150.26627637156614
Final dictionary size for Ingroup: 122
Average score for Ingroup: 5.5310901010173374
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 3.652501325398393
The current time period is (1893, 1912).  7/8


Calculating dictionary: 100%|██████████| 20/20 [05:10<00:00, 15.53s/it]


Final dictionary size for Harm: 97
Average score for Harm: 3.313316391971528
Final dictionary size for Authority: 159
Average score for Authority: 10.681123851893455
Final dictionary size for Fairness: 85
Average score for Fairness: 5.330837913587267
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 142.37346695918276
Final dictionary size for Ingroup: 122
Average score for Ingroup: 5.682691466273027
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 3.281960462689158
The current time period is (1873, 1892).  8/8


Calculating dictionary: 100%|██████████| 20/20 [05:35<00:00, 16.76s/it]

Final dictionary size for Harm: 97
Average score for Harm: 3.301530697500771
Final dictionary size for Authority: 159
Average score for Authority: 16.28558236446472
Final dictionary size for Fairness: 85
Average score for Fairness: 5.598496632137639
Final dictionary size for Institutional_Purity: 282
Average score for Institutional_Purity: 185.2805822868984
Final dictionary size for Ingroup: 122
Average score for Ingroup: 7.531977255233678
Final dictionary size for Sexual_Purity: 39
Average score for Sexual_Purity: 4.140269754010492



