In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import csv
import json
import pandas as pd
import openpyxl

In [24]:
def load_analysis_data_for_moral_foundation(path_to_analysis_csvs):
    """Collect per-period change statistics from every ``.xlsx`` workbook in a directory.

    Each workbook's file name, minus its extension, is used as the
    moral-foundation label. In the active sheet the first four rows are
    skipped as headers; every following non-blank row is read as
    ``(time period, dictionary size, count)``, where the time period is a
    string of the form ``"A-B"`` that is stored flipped as ``"B-A"``.
    ``num_changed`` is the dictionary size minus the third column
    (presumably the number of unchanged terms -- confirm against the
    workbook layout).

    Args:
        path_to_analysis_csvs: Directory that holds the ``.xlsx`` workbooks
            (despite the name, only ``.xlsx`` files are read).

    Returns:
        A dict of parallel lists with the keys ``time_period``,
        ``time_period_id``, ``dictionary_size``, ``num_changed``,
        ``prop_changed`` and ``moral_foundation``; one entry per data row.
        ``time_period_id`` restarts at 0 for every workbook.
        ``prop_changed`` is NaN for rows whose dictionary size is 0.
    """
    header_rows = 4

    # Sorted so the output order does not depend on the file system, and
    # skipping the "~$name.xlsx" lock files Excel creates while a workbook is open.
    workbook_names = sorted(
        name
        for name in os.listdir(path_to_analysis_csvs)
        if name.endswith(".xlsx") and not name.startswith("~$")
    )

    analysis_data = {
        "time_period": [],
        "time_period_id": [],
        "dictionary_size": [],
        "num_changed": [],
        "prop_changed": [],
        "moral_foundation": [],
    }

    for workbook_name in workbook_names:
        # splitext removes only the trailing extension, unlike str.replace.
        moral_foundation = os.path.splitext(workbook_name)[0]

        wb = openpyxl.load_workbook(os.path.join(path_to_analysis_csvs, workbook_name))
        try:
            rows = list(wb.active.iter_rows(values_only=True))
        finally:
            wb.close()

        period_id = 0
        # Slicing (rather than repeated next()) tolerates sheets shorter than the header.
        for row in rows[header_rows:]:
            # Sheets often carry trailing rows that are entirely empty.
            if all(cell is None for cell in row[:3]):
                continue

            dict_size = int(row[1])
            num_changed = dict_size - int(row[2])
            period_parts = row[0].split("-")

            analysis_data["time_period"].append(period_parts[1] + "-" + period_parts[0])
            analysis_data["time_period_id"].append(period_id)
            analysis_data["dictionary_size"].append(dict_size)
            analysis_data["num_changed"].append(num_changed)
            # The proportion is undefined for an empty dictionary.
            analysis_data["prop_changed"].append(
                num_changed / dict_size if dict_size else float("nan")
            )
            analysis_data["moral_foundation"].append(moral_foundation)
            period_id += 1

    return analysis_data

In [25]:
# Which dictionary build to analyse and where its analysis workbooks live.
DICT_ID = "no_wiki_continuous_v1"
DATA_PATH = os.path.join(os.curdir, "data", DICT_ID, "csv_analysis")
# The line plot is written into the same directory as the workbooks.
SAVE_PATH = os.path.join(DATA_PATH, "prop_changed.jpg")

# Load every workbook into one long-format frame (one row per foundation and period).
analysis_data = load_analysis_data_for_moral_foundation(DATA_PATH)
df = pd.DataFrame(data=analysis_data)


In [26]:
# Line plot: proportion of changed terms per time period, one line per moral foundation.
# Style and context are set before the figure is created so that every element
# (ticks, axis labels, title, legend) is built under the same settings; setting
# them afterwards only affects artists created later and makes the first run
# of this cell look different from a re-run.
sns.set_style("darkgrid")
sns.set_context("paper")

fig, ax = plt.subplots(figsize=(20, 10))

sns.lineplot(data=df, x="time_period", y="prop_changed", hue="moral_foundation", ax=ax)
ax.set_title("Proportion of Terms Changed in Each Dictionary Over Time")
ax.set_xlabel("Time Period")
ax.set_ylabel("Proportion of Terms Changed")
ax.legend(loc="upper right", title="Moral Foundations")

# Save to disk and close the figure explicitly so it is not rendered inline.
fig.savefig(SAVE_PATH, dpi=600)
plt.close(fig)

In [27]:
# Inputs for the term-origin bar chart: the hand-curated seed dictionary and
# the directory holding the dynamically generated dictionaries.
_DICT_ROOT = os.path.join(os.curdir, "data", DICT_ID)

ORIG_DICT = os.path.join(_DICT_ROOT, "hand_curated", "formatted_hand_curated_dict.json")
DYNAMIC_DICT_PATHS = os.path.join(_DICT_ROOT, "json_dicts")

In [28]:
# Build the (first_year, last_year) span of every generated dictionary, newest first.
start_year = 1873
end_year = 2000
PERIOD_LENGTH = 20            # years covered by each historical dictionary
MODERN_PERIOD = (2001, 2020)  # most recent span, lying after end_year

# Periods start every PERIOD_LENGTH years from start_year; the final one is
# clamped to end_year instead of being patched by index afterwards.
dictionary_time_periods = [
    (period_start, min(period_start + PERIOD_LENGTH - 1, end_year))
    for period_start in range(start_year, end_year - 1, PERIOD_LENGTH)
]
dictionary_time_periods.sort(reverse=True)
dictionary_time_periods.insert(0, MODERN_PERIOD)

print(len(dictionary_time_periods))
print(dictionary_time_periods[-10:])

8
[(2001, 2020), (1993, 2000), (1973, 1992), (1953, 1972), (1933, 1952), (1913, 1932), (1893, 1912), (1873, 1892)]


In [29]:
# Label every (moral foundation, word) pair with its origin.
# Words in the hand-curated dictionary are labelled "Hand Curated"; any other
# word is labelled with the first generated dictionary it is found in, where the
# dictionaries are visited in descending order of their file-name period prefix.
PERIOD_PREFIX_LENGTH = 9  # len("YYYY-YYYY"), taken from the start of each file name

with open(ORIG_DICT, "r") as f:
    orig_dict = json.load(f)

words_mapped_to_time_periods = {
    moral_foundation: {word: "Hand Curated" for word in words}
    for moral_foundation, words in orig_dict.items()
}

dynamic_dict_paths = sorted(
    (
        (os.path.join(DYNAMIC_DICT_PATHS, f), f[:PERIOD_PREFIX_LENGTH])
        for f in os.listdir(DYNAMIC_DICT_PATHS)
        if f.endswith(".json")
    ),
    key=lambda path_and_period: path_and_period[1],
    reverse=True,
)

for dynamic_dict_path, time_period in dynamic_dict_paths:
    # File names carry "A-B"; the labels used downstream are "B-A".
    period_parts = time_period.split("-")
    formatted_time_period = period_parts[1] + "-" + period_parts[0]

    with open(dynamic_dict_path, "r") as f:
        dynamic_dict = json.load(f)

    for moral_foundation, words in dynamic_dict.items():
        # setdefault avoids a KeyError for a foundation that is missing from
        # the hand-curated dictionary.
        known_words = words_mapped_to_time_periods.setdefault(moral_foundation, {})
        for word in words:
            # Keep the label assigned by the first dictionary that contained the word.
            known_words.setdefault(word, formatted_time_period)

In [30]:
# Zero-initialised counters: for each generated dictionary (outer key) the number
# of its terms per origin (inner key) -- every period label plus "Hand Curated".
period_labels = [
    str(last_year) + "-" + str(first_year)
    for first_year, last_year in dictionary_time_periods
]

total_counts_by_time_period = {}
for generated_label in period_labels:
    origin_counts = dict.fromkeys(period_labels, 0)
    origin_counts["Hand Curated"] = 0
    total_counts_by_time_period[generated_label] = origin_counts

In [31]:
# Tally, for every generated dictionary, how many of its terms come from each origin.
# The counters are shared with the cell that created them, so they are zeroed
# here first; otherwise re-running this cell would double-count.
for origin_counts in total_counts_by_time_period.values():
    for origin in origin_counts:
        origin_counts[origin] = 0

for dynamic_dict_path, time_period in dynamic_dict_paths:
    # File names carry "A-B"; the counter keys are "B-A".
    period_parts = time_period.split("-")
    formatted_time_period = period_parts[1] + "-" + period_parts[0]

    with open(dynamic_dict_path, "r") as f:
        dynamic_dict = json.load(f)

    period_counts = total_counts_by_time_period[formatted_time_period]
    for moral_foundation, words in dynamic_dict.items():
        origins_for_foundation = words_mapped_to_time_periods[moral_foundation]
        for word in words:
            word_origin = origins_for_foundation[word]
            period_counts[word_origin] += 1

In [32]:
# Flatten the nested counters into long format: one row per
# (generated dictionary, term origin) pair.
flattened_counts = [
    (generated_period, origin, n_terms)
    for generated_period, origin_counts in total_counts_by_time_period.items()
    for origin, n_terms in origin_counts.items()
]

df_compatible_data = {
    "time_period": [record[0] for record in flattened_counts],
    "word_origin": [record[1] for record in flattened_counts],
    "count": [record[2] for record in flattened_counts],
}

tot_counts_df = pd.DataFrame(data=df_compatible_data)

In [33]:
# Bar plot: for each generated dictionary, the number of terms per origin.
# Style and context are set before the figure is created so that every element
# is built under the same settings; setting them afterwards only affects artists
# created later and makes the first run of this cell look different from a re-run.
sns.set_style("darkgrid")
sns.set_context("paper")

fig, ax = plt.subplots(figsize=(20, 10))

# Legend order: the hand-curated seed first, then periods from newest to oldest.
hue_order = [
    "Hand Curated",
    "2020-2001",
    "2000-1993",
    "1992-1973",
    "1972-1953",
    "1952-1933",
    "1932-1913",
    "1912-1893",
    "1892-1873",
]
sns.barplot(
    data=tot_counts_df,
    x="time_period",
    y="count",
    hue="word_origin",
    hue_order=hue_order,
    ax=ax,
)
ax.set_title("Origin of Dynamically Generated Dictionary Terms")
ax.set_xlabel("Dynamically Generated Dictionary")
ax.set_ylabel("Number of Dictionary Terms")
ax.legend(loc="upper right", title="Term Origin")

# A dedicated name is used instead of reassigning SAVE_PATH, so re-running the
# earlier line-plot cell cannot overwrite this figure.
TERM_ORIGIN_SAVE_PATH = os.path.join(
    os.path.curdir, "data", DICT_ID, "csv_analysis", "overall_term_origin.jpg"
)
fig.savefig(TERM_ORIGIN_SAVE_PATH, dpi=600)
plt.close(fig)