# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Additional Analysis: Different Calculation of the Weighted Frequencies - Data Creation

### Description

In the replication package, weighted frequencies are calculated on the full preprocessed corpus (35,009 unique words; 4,500,778 tokens), while the embedding corpus drops words occurring fewer than 10 times (9,453 unique words; 4,286,666 tokens). This script examines whether calculating the weighted frequencies after removing these low-frequency words makes any difference. The preprocessed files from the normal script are reused, because the different calculation of the weighted frequencies is based on those files and hence nothing changes in the steps before.

___

## Setup, Installation of required Packages and Libraries & Folder Structure

In [30]:
# == Import libraries for data processing and NLP ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path
from rapidfuzz import process, fuzz

# === Initialize NLP Tools ===

# NLTK objects: an English Snowball stemmer and a PerceptronTagger instance.
stemmer = SnowballStemmer("english")
tagger = nltk.perceptron.PerceptronTagger()

# spaCy pipeline, loaded with the 'ner' and 'parser' components disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Translation table for str.translate that deletes every character
# contained in string.punctuation.
translator = str.maketrans("", "", punctuation)

In [39]:
# === Set Working Directory ===

def find_project_root(start, marker=Path("data") / "preprocessed", fallback_levels=2):
    """Locate the project root by walking upwards from *start*.

    The root is the closest directory (``start`` itself or one of its
    ancestors) that contains the sub-directory *marker*. Searching for the
    data folder instead of counting a fixed number of parent levels keeps the
    result stable when the notebook is started from a different folder or
    when the working directory has been changed with ``os.chdir``.

    Parameters
    ----------
    start : str or Path
        Directory at which the upward search begins.
    marker : str or Path
        Relative path that must exist as a directory inside the project root.
    fallback_levels : int
        Index into ``start.parents`` that is used when no ancestor contains
        *marker* (``2`` reproduces the former ``Path.cwd().parents[2]``).
        The index is clamped to the outermost ancestor so that shallow
        directories do not raise ``IndexError``.

    Returns
    -------
    Path
        The detected project root, or the fallback ancestor.
    """
    start = Path(start).absolute()

    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate

    # Nothing found: fall back to the fixed-depth rule used previously.
    ancestors = start.parents
    if len(ancestors) == 0:
        return start
    return ancestors[min(fallback_levels, len(ancestors) - 1)]


# --- Set base path to project root ---
base_path = find_project_root(Path.cwd())  # project root
print(f"Project root set to: {base_path}")

# --- Paths ---
data_c = base_path / "data"

# === Define Folder Paths ===

data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_preprocessed = data_c / "preprocessed"

Project root set to: C:\Users\sarah\Downloads


In [41]:
# Show the directory the notebook process is currently running in.
current_dir = os.getcwd()
print("Current working directory:", current_dir)

Current working directory: C:\Users\sarah\Downloads\TESTRUN\data


In [43]:
# Full paths of the preprocessed speech chunks, numbered 1 through 4.
chunk_names = (f"preprocessed_speeches_indexed{i}.pkl" for i in range(1, 5))
preprocessed_files = [os.path.join(data_preprocessed, name) for name in chunk_names]

# Load every chunk with joblib and append its items to one flat list.
preprocessed_data = []
for fpath in preprocessed_files:
    data = joblib.load(fpath)
    preprocessed_data += data

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\sarah\\Downloads\\data\\preprocessed\\preprocessed_speeches_indexed1.pkl'

___

## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [None]:
#== Count token frequencies ==

def count_frequencies(filenames):
    """Count token occurrences across several joblib-serialised files.

    Each file is loaded with ``joblib.load`` and iterated row by row. The
    element at index 1 of a row is treated as the token list; rows whose
    index-1 element is not a ``list`` are ignored.

    Parameters
    ----------
    filenames : iterable
        Paths of the files to load (wrapped in ``tqdm`` for a progress bar).

    Returns
    -------
    collections.Counter
        Mapping of token to its total number of occurrences over all files.
    """
    counts = Counter()
    for path in tqdm(filenames):
        for row in joblib.load(path):
            tokens = row[1]
            if isinstance(tokens, list):
                counts.update(tokens)
    return counts

# Count all tokens, then drop every word that occurs fewer than 10 times
removed_lowfreq_words_word_counts = count_frequencies(preprocessed_files)

rare_words = [
    word
    for word, count in removed_lowfreq_words_word_counts.items()
    if count < 10
]
for word in rare_words:
    del removed_lowfreq_words_word_counts[word]

# Most frequent words first
print("\n[Stemmed] Top 50 most common words:")
top_words = removed_lowfreq_words_word_counts.most_common(50)
for word, count in top_words:
    print(f"{word}: {count}")

# Tail of the descending ranking, i.e. the 50 rarest words that were kept
print("\n[Stemmed] Top 50 least common words:")
ranked_words = removed_lowfreq_words_word_counts.most_common()
for word, count in ranked_words[-50:]:
    print(f"{word}: {count}")

# Save the filtered word counts
save_path = os.path.join(data_freq, 'removed_lowfreq_words_word_counts.pkl')
joblib.dump(removed_lowfreq_words_word_counts, save_path)

In [None]:
# Vocabulary size: number of distinct words in the filtered frequency table.
num_unique_words = len(removed_lowfreq_words_word_counts)
print(f"Number of unique words: {num_unique_words}")

In [None]:
# Make the data folder the process-wide working directory and print it to confirm.
# NOTE(review): the setup cell derives the project root starting from Path.cwd(),
# so re-running that cell after this chdir starts from a different directory —
# confirm this is intended.
os.chdir(data_c)
print(os.getcwd())

### Count the frequency of the dictionary words

In [None]:
def _dictionary_counts(dictionary, word_counts):
    """Return ``[word, count]`` pairs for dictionary words found in *word_counts*.

    Words of *dictionary* that are not a key of *word_counts* are skipped.
    The pairs are sorted by count, highest first; words with equal counts
    keep the order in which *dictionary* yields them.
    """
    hits = [[word, word_counts[word]] for word in dictionary if word in word_counts]
    return sorted(hits, key=lambda pair: pair[1], reverse=True)


affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

# == Count dictionary words

print("Contents of affect dictionary:")
print(affect)
print("Number of words in affect dictionary:", len(affect))

print("\nContents of cognition dictionary:")
print(cognition)
print("Number of words in cognition dictionary:", len(cognition))

# Dictionary words that survived the low-frequency filter, most frequent first
a_list = _dictionary_counts(affect, removed_lowfreq_words_word_counts)
c_list = _dictionary_counts(cognition, removed_lowfreq_words_word_counts)

# Render each pair as "word (count)," for the plain-text overview files
a = [[word, f"({count}),"] for word, count in a_list]
c = [[word, f"({count}),"] for word, count in c_list]

a1 = ' '.join(str(part) for pair in a for part in pair)
c1 = ' '.join(str(part) for pair in c for part in pair)

affect_out_path = os.path.join(data_freq, "removed_lowfreq_words_affect_words.txt")
cog_out_path = os.path.join(data_freq, "removed_lowfreq_words_cog_words.txt")

# Explicit UTF-8 so the output does not depend on the platform default
# encoding and cannot fail on characters outside the local code page.
with open(affect_out_path, "w", encoding="utf-8") as output:
    output.write(a1)

with open(cog_out_path, "w", encoding="utf-8") as output:
    output.write(c1)

# number of affect/cognitive words that appear in word_counts
num_affect_words = len(a_list)
num_cog_words = len(c_list)

print(f"Unique affect words in text: {num_affect_words}")
print(f"Unique cognition words in text: {num_cog_words}")

In [None]:
# == Calculate weighted frequencies for all words

# l: total number of tokens left after the low-frequency filter
l = sum(removed_lowfreq_words_word_counts.values())

# a: smoothing parameter of the down-weighting a / (a + v/l).
# For frequent words (large v/l) the weight approaches 0; for rare words
# (small v/l) it gets closer to 1.
a = 0.001

removed_lowfreq_words_word_counts_weighted = {}
for word, count in removed_lowfreq_words_word_counts.items():
    relative_freq = count / l
    removed_lowfreq_words_word_counts_weighted[word] = a / (a + relative_freq)

weighted_path = os.path.join(data_freq, 'removed_lowfreq_words_word_counts_weighted.pkl')
joblib.dump(removed_lowfreq_words_word_counts_weighted, weighted_path)

# Rank all words by weight (highest first) and keep the first 50 for display
ranked_by_weight = sorted(
    removed_lowfreq_words_word_counts_weighted.items(),
    key=lambda item: item[1],
    reverse=True,
)
top_50_weighted = ranked_by_weight[:50]

print("Top 50 words by weighted frequency:")
for word, weight in top_50_weighted:
    print(f"{word}: {weight}")