In [6]:
import os
import json

import xlsxwriter

In [7]:
# Name of the dictionary run to analyse; every path below is derived from it.
DICT_ID = "no_wiki_continuous_v1"

# Shared "data/<DICT_ID>" prefix for all inputs and outputs of this run.
_RUN_DIR = os.path.join("data", DICT_ID)

# Directory holding the per-time-period JSON dictionaries.
PATH_TO_JSON_DICTS = os.path.join(_RUN_DIR, "json_dicts")
# JSON file with the hand-curated dictionary used as the comparison baseline.
PATH_TO_ORIGINAL_DICT = os.path.join(
    os.path.curdir, _RUN_DIR, "hand_curated", "formatted_hand_curated_dict.json"
)
# Directory the per-foundation analysis workbooks are written to.
ANALYSIS_SAVE_PATH = os.path.join(_RUN_DIR, "csv_analysis")

# Font colours used to mark each word category in the generated workbooks.
NEW_WORD_COLOR = "blue"
ADDED_WORD_COLOR = "green"
REMOVED_WORD_COLOR = "red"

In [8]:
def load_json_dicts(path_to_json_dicts):
    """Load every per-period JSON dictionary in a directory, grouped by top-level key.

    Only files whose name ends in ``.json`` are read. The first nine characters
    of each file name must encode the time period as ``<start>-<end>`` (for
    example ``1990-1999``); both parts are converted to ``int``.

    Args:
        path_to_json_dicts: Directory containing the ``.json`` files. Each file
            must hold a JSON object at its top level.

    Returns:
        A dict mapping each top-level key found in the files to a dict of
        ``{(start, end): value}``, where ``value`` is whatever that file stores
        under the key. A later file with the same key and period overwrites an
        earlier one.

    Raises:
        ValueError: If either half of the period prefix is not an integer.
        IndexError: If the nine-character prefix contains no ``-``.
    """
    all_dictionaries = {}

    # Sorted so the key order of the result does not depend on the arbitrary
    # order in which os.listdir returns directory entries.
    json_file_names = sorted(
        name for name in os.listdir(path_to_json_dicts) if name.endswith(".json")
    )

    for file_name in json_file_names:
        # Extract time period from the fixed-width "<start>-<end>" file name prefix
        period_parts = file_name[:9].split("-")
        time_period = (int(period_parts[0]), int(period_parts[1]))

        # Read as UTF-8 explicitly: the default is the platform's locale
        # encoding, which can mis-decode non-ASCII terms.
        file_path = os.path.join(path_to_json_dicts, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            json_dict = json.load(f)

        # Regroup from "period -> key -> value" to "key -> period -> value"
        for key, value in json_dict.items():
            all_dictionaries.setdefault(key, {})[time_period] = value

    return all_dictionaries

In [9]:
def find_new_added_and_removed_words_from_dict_type(time_period_dictionaries: dict, original_dict: list):
    """Classify how each period's dictionary differs from the one processed before it.

    Periods are visited in descending order of their start year (the first
    element of each ``(start, end)`` key). The first visited period is compared
    against ``original_dict``; each later period is compared against the period
    visited just before it.

    For every analysed period the result holds four sets:

    * ``new_words``     - terms absent from the original dict and from every
      period visited so far;
    * ``words_added``   - terms seen at some earlier point but missing from the
      immediately preceding dictionary;
    * ``same_words``    - terms also present in the immediately preceding
      dictionary;
    * ``words_removed`` - terms of the preceding dictionary that are missing
      from the current one.

    If no word has been seen yet (e.g. ``original_dict`` is empty), the period
    only becomes the comparison baseline and gets no entry in the result.

    Args:
        time_period_dictionaries: Maps ``(start, end)`` tuples to the iterable
            of terms for that period.
        original_dict: Terms of the baseline dictionary.

    Returns:
        A dict mapping each analysed period to a dict with the keys
        ``"words_added"``, ``"words_removed"``, ``"new_words"`` and
        ``"same_words"``, in visiting order.
    """
    baseline = set(original_dict)
    seen_so_far = set(baseline)

    periods_in_visit_order = sorted(
        time_period_dictionaries.items(),
        key=lambda entry: entry[0][0],
        reverse=True,
    )

    report = {}
    for period, terms in periods_in_visit_order:
        current = set(terms)

        if not seen_so_far:
            # Nothing to compare against yet: adopt this dictionary as the
            # baseline without analysing it.
            seen_so_far = seen_so_far | current
            baseline = current
            continue

        # The baseline is always a subset of seen_so_far, so the three
        # categories below partition `current`.
        report[period] = {
            "words_added": (current & seen_so_far) - baseline,
            "words_removed": baseline - current,
            "new_words": current - seen_so_far,
            "same_words": current & baseline,
        }

        # The current dictionary becomes the baseline for the next period.
        baseline = current
        seen_so_far = seen_so_far | current

    return report



In [10]:
# Load the per-period dictionaries, grouped by moral foundation
all_dictionaries = load_json_dicts(PATH_TO_JSON_DICTS)

os.makedirs(ANALYSIS_SAVE_PATH, exist_ok=True)

# Read as UTF-8 explicitly instead of relying on the platform's locale encoding
with open(PATH_TO_ORIGINAL_DICT, "r", encoding="utf-8") as f:
    original_dict = json.load(f)

# One workbook per moral foundation, one row per analysed time period
for moral_foundation, time_period_dicts in all_dictionaries.items():
    # Find new words and words removed, ordered by descending period start year
    analysed_results = find_new_added_and_removed_words_from_dict_type(time_period_dicts, original_dict[moral_foundation])
    analysed_results = dict(sorted(analysed_results.items(), key=lambda item: item[0][0], reverse=True))

    # Save results to excel file
    workbook_path = os.path.join(ANALYSIS_SAVE_PATH, f"{moral_foundation}.xlsx")
    if os.path.exists(workbook_path):
        # Remove any workbook left by a previous run so the file written below is fresh
        os.remove(workbook_path)

    # The context manager closes the workbook when the block exits, including
    # when one of the writes below raises.
    with xlsxwriter.Workbook(workbook_path) as workbook:
        print(f"Saving {moral_foundation} to {workbook_path}")
        new_words_format = workbook.add_format({'bold': True, 'font_color': NEW_WORD_COLOR})
        words_added_format = workbook.add_format({'bold': True, 'font_color': ADDED_WORD_COLOR})
        words_removed_format = workbook.add_format({'bold': True, 'font_color': REMOVED_WORD_COLOR})
        worksheet = workbook.add_worksheet()

        # Write the colour legend (rows 0-2) and the column headers (row 3)
        worksheet.write(0, 0, f"New Words: {NEW_WORD_COLOR}", new_words_format)
        worksheet.write(1, 0, f"Words Added From Previous Dictionary: {ADDED_WORD_COLOR}", words_added_format)
        worksheet.write(2, 0, f"Words Removed From Previous Dictionary: {REMOVED_WORD_COLOR}", words_removed_format)
        column_headers = (
            "Time Period",
            "Dictionary Size",
            "Num Same",
            "Num New",
            "Num Added",
            "Num Removed",
            "Color Coded Words",
        )
        for header_col, header in enumerate(column_headers):
            worksheet.write(3, header_col, header)

        # Data rows start directly below the header row
        for curr_row, (time_period, analysis_results) in enumerate(analysed_results.items(), start=4):
            new_words = sorted(analysis_results["new_words"])
            words_added = sorted(analysis_results["words_added"])
            words_removed = sorted(analysis_results["words_removed"])
            same_words = sorted(analysis_results["same_words"])
            # Removed words are not part of the current dictionary, so they are not counted
            dict_size = len(new_words) + len(words_added) + len(same_words)

            # Write time period and summary counts, in header order
            summary_cells = (
                f"{time_period[0]}-{time_period[1]}",
                dict_size,
                len(same_words),
                len(new_words),
                len(words_added),
                len(words_removed),
            )
            for curr_col, cell_value in enumerate(summary_cells):
                worksheet.write(curr_row, curr_col, cell_value)
            curr_col = len(summary_cells)

            # Add color coded words: new, then added, then removed, one word per cell.
            # NOTE(review): a very long word list could run past the worksheet's
            # column limit; presumably those writes are dropped silently - confirm.
            colour_coded_groups = (
                (new_words, new_words_format),
                (words_added, words_added_format),
                (words_removed, words_removed_format),
            )
            for words, cell_format in colour_coded_groups:
                for word in words:
                    worksheet.write(curr_row, curr_col, word, cell_format)
                    curr_col += 1

        

Saving Harm to data/no_wiki_continuous_v1/csv_analysis/Harm.xlsx
Saving Institutional_Purity to data/no_wiki_continuous_v1/csv_analysis/Institutional_Purity.xlsx
Saving Authority to data/no_wiki_continuous_v1/csv_analysis/Authority.xlsx
Saving Fairness to data/no_wiki_continuous_v1/csv_analysis/Fairness.xlsx
Saving Ingroup to data/no_wiki_continuous_v1/csv_analysis/Ingroup.xlsx
Saving Sexual_Purity to data/no_wiki_continuous_v1/csv_analysis/Sexual_Purity.xlsx
