# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Additional Analysis: Different Calculation of the Weighted Frequencies
### Author: Sarah Franzen

### Description

In the replication package, weighted frequencies are calculated on the full preprocessed corpus (35,009 unique words; 4,500,778 tokens), while the embedding corpus drops words occurring fewer than 10 times (9,453 unique words; 4,286,666 tokens). This script examines whether calculating the weighted frequencies after removing these low-frequency words makes any difference.

___

## Setup, Installation of required Packages and Libraries & Folder Structure

In [85]:
# == Import libraries for data processing and NLP ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path
from rapidfuzz import process, fuzz

# === Initialize NLP Tools ===
# NOTE(review): none of these four objects is referenced in the cells shown
# in this notebook; they may be leftovers from the preprocessing script.

# Translation table that deletes every character listed in string.punctuation.
translator = str.maketrans('', '', punctuation)
# spaCy English pipeline, loaded with the 'ner' and 'parser' components disabled.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
# Snowball stemmer configured for English.
stemmer = SnowballStemmer("english")
# NLTK perceptron part-of-speech tagger.
tagger = nltk.perceptron.PerceptronTagger()

In [86]:
# === Set Working Directory and create folder structure ===

# Prompt user to enter working directory path
wd = Path(input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip())

# Alternative to the prompt: hard-code the path by uncommenting the next line.
#wd = Path(r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit")

# Change to the entered working directory
try:
    os.chdir(wd)
except (FileNotFoundError, NotADirectoryError):
    # Covers both a missing path and a path that points at a file.
    print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    # SystemExit instead of exit(): exit() is an interactive helper and is
    # not guaranteed to exist or to stop execution in every environment.
    raise SystemExit(1) from None
else:
    # Pin wd to the absolute directory now in effect. If a relative path (or
    # nothing) was entered, paths built from wd after the chdir would
    # otherwise be resolved against the new working directory a second time.
    wd = Path.cwd()
    print(f"Working directory set to: {os.getcwd()}")

Please enter your working directory path (e.g., C:\Users\sarah\OneDrive\Dokumente\Masterarbeit):  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


Working directory set to: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


In [87]:
# === Define Folder Paths ===

# Every folder used by this notebook sits below <working directory>/data.
# If an error occurs later on, check that these folders really exist there.

data_c = wd.joinpath("data")
data_temp = data_c.joinpath("temp")
data_freq = data_c.joinpath("freq")
data_dict = data_c.joinpath("dictionaries")
data_preprocessed = data_c.joinpath("preprocessed")

In [88]:
# Sanity check: show the directory the notebook is currently running in.
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


In [89]:
# Merged corpus table; the cleaned speeches are joined onto it by "filename"
# in the final cell.
merged_corpus_path = os.path.join(data_c, "un_corpus_merged.pkl")
df_merged = joblib.load(merged_corpus_path)

# The preprocessed speeches are stored in numbered chunk files.
# Adjust the range if you have more/less files.
preprocessed_files = []
for i in range(1, 5):
    chunk_name = f"preprocessed_speeches_indexed{i}.pkl"
    preprocessed_files.append(os.path.join(data_preprocessed, chunk_name))

# Concatenate the rows of all chunk files into one list, in file order.
preprocessed_data = list(
    chain.from_iterable(joblib.load(fpath) for fpath in preprocessed_files)
)

___

## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [93]:
#== Count token frequencies ==

def count_frequencies(filenames):
    """Tally token occurrences across several preprocessed speech files.

    Each file is loaded with ``joblib`` and iterated row by row. The second
    element of a row (``row[1]``) is taken as that row's tokens; rows whose
    second element is not a ``list`` are skipped.

    Args:
        filenames: Paths of the files to load. A ``tqdm`` progress bar is
            shown while iterating over them.

    Returns:
        collections.Counter: maps each token to its total number of
        occurrences over all files.
    """
    total_freq = Counter()
    for fname in tqdm(filenames):
        for row in joblib.load(fname):
            token_list = row[1]
            if isinstance(token_list, list):
                total_freq.update(token_list)
    return total_freq

# Count all tokens, then keep only words that occur at least 10 times
frequent_items = [
    (word, n)
    for word, n in count_frequencies(preprocessed_files).items()
    if n >= 10
]
removed_lowfreq_words_word_counts = Counter(dict(frequent_items))

# Print both ends of the frequency ranking: the 50 most and 50 least common words
for heading, ranked_words in (
    ("\n[Stemmed] Top 50 most common words:", removed_lowfreq_words_word_counts.most_common(50)),
    ("\n[Stemmed] Top 50 least common words:", removed_lowfreq_words_word_counts.most_common()[-50:]),
):
    print(heading)
    for word, count in ranked_words:
        print(f"{word}: {count}")

# Save the filtered word counts. joblib.dump stays the last expression of the
# cell, so the list of written paths is displayed as the cell's output.
save_path = os.path.join(data_freq, 'removed_lowfreq_words_word_counts.pkl')
joblib.dump(removed_lowfreq_words_word_counts, save_path)

100%|██████████| 4/4 [00:16<00:00,  4.09s/it]


[Stemmed] Top 50 most common words:
econom: 73475
human: 57178
problem: 56809
region: 48963
achiev: 39487
global: 36179
africa: 34787
nuclear: 32246
solut: 32199
social: 30684
charter: 28158
african: 27745
weapon: 26744
contribut: 26390
respons: 26218
negoti: 25658
implement: 25093
cannot: 22646
ensur: 22450
area: 22357
disarma: 21892
increas: 21585
strengthen: 20530
promot: 20382
role: 19597
non: 19390
decis: 18832
propos: 18690
climat: 17991
threat: 17597
goal: 17426
crisi: 17155
terror: 16682
stabil: 16576
struggl: 14857
aggress: 14819
toward: 14590
palestinian: 14093
soviet: 13992
financi: 13807
poverti: 13372
europ: 13173
democraci: 13087
share: 12802
purpos: 12612
suffer: 12543
dialogu: 12385
european: 12324
popul: 12253
regim: 12026

[Stemmed] Top 50 least common words:
gridlock: 10
tiraspol: 10
niyazov: 10
kinkel: 10
herat: 10
turquois: 10
softwar: 10
overexploit: 10
débi: 10
mandab: 10
bakili: 10
rss: 10
midrand: 10
sukhumi: 10
olara: 10
highhanded: 10
mohéli: 10
habibi: 10
c




['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\removed_lowfreq_words_word_counts.pkl']

In [94]:
# Vocabulary size after the low-frequency filter: one Counter key per word.
num_unique_words = len(removed_lowfreq_words_word_counts)
print(f"Number of unique words: {num_unique_words}")

Number of unique words: 9473


In [95]:
# Switch the working directory to the data folder and print it as confirmation.
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [97]:
# Load the affect and cognition word lists from the dictionaries folder.
affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

# == Count dictionary words

print("Contents of affect dictionary:")
print(affect)
print("Number of words in affect dictionary:", len(affect))

print("\nContents of cognition dictionary:")
print(cognition)
print("Number of words in cognition dictionary:", len(cognition))

# [word, corpus count] for every dictionary word that is still present after
# the low-frequency filter; dictionary words missing from the counts are dropped.
a_list = [[i, removed_lowfreq_words_word_counts[i]] for i in affect if i in removed_lowfreq_words_word_counts]
c_list = [[i, removed_lowfreq_words_word_counts[i]] for i in cognition if i in removed_lowfreq_words_word_counts]

# Most frequent dictionary words first.
a_list = sorted(a_list, key=lambda x: x[1], reverse=True)
c_list = sorted(c_list, key=lambda x: x[1], reverse=True)

# Render each list as one line of text: "word (count), word (count), ..."
a1 = ' '.join(f"{word} ({count})," for word, count in a_list)
c1 = ' '.join(f"{word} ({count})," for word, count in c_list)

affect_out_path = os.path.join(data_freq, "removed_lowfreq_words_affect_words.txt")
cog_out_path = os.path.join(data_freq, "removed_lowfreq_words_cog_words.txt")

# Explicit UTF-8: without it the platform's locale encoding is used, which
# makes the files machine-dependent and can fail on non-ASCII words.
with open(affect_out_path, "w", encoding="utf-8") as output:
    output.write(a1)

with open(cog_out_path, "w", encoding="utf-8") as output:
    output.write(c1)

# number of affect/cognitive words that appear in word_counts
num_affect_words = len(a_list)
num_cog_words = len(c_list)

print(f"Unique affect words in text: {num_affect_words}")
print(f"Unique cognition words in text: {num_cog_words}")

# == Calculate weighted frequencies for all words

# Total number of tokens that remain after the low-frequency filter.
l = sum(removed_lowfreq_words_word_counts.values())

# Smoothing parameter of the down-weighting  weight = a / (a + v / l),
# where v / l is a word's relative frequency. For frequent words (large v / l)
# the weight approaches 0; for rare words (small v / l) it approaches 1.
a = 0.001
removed_lowfreq_words_word_counts_weighted = {k: a / (a + (v / l)) for k, v in removed_lowfreq_words_word_counts.items()}

joblib.dump(removed_lowfreq_words_word_counts_weighted, os.path.join(data_freq, 'removed_lowfreq_words_word_counts_weighted.pkl'))

# To print top 50 by weighted values, sort the dictionary by value descending.
# The highest weights belong to the rarest words.
top_50_weighted = sorted(removed_lowfreq_words_word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:50]

print("Top 50 words by weighted frequency:")
for word, weight in top_50_weighted:
    print(f"{word}: {weight}")

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

---

## Final Cleaning

In [103]:
# The load below uses a bare file name, so it depends on this chdir.
os.chdir(data_freq)

# Reload the filtered word counts that an earlier cell saved to data_freq
# under this file name; select() reads this variable.
removed_lowfreq_words_counts = joblib.load('removed_lowfreq_words_word_counts.pkl')  

def select(lista, counts=None, min_count=10):
    """Drop low-frequency tokens from every row of *lista*, in place.

    Args:
        lista: Mutable sequence of rows. Each row is indexable; element 0 is
            carried over unchanged and element 1 is the row's token list.
        counts: Mapping from token to corpus frequency. When omitted, the
            module-level ``removed_lowfreq_words_counts`` is used.
        min_count: Minimum corpus frequency a token needs in order to be
            kept. Tokens missing from *counts* count as frequency 0.

    Returns:
        The same *lista* object. Every row whose element 1 is a ``list`` is
        replaced by ``[row[0], kept_tokens]``. Rows whose element 1 is not a
        list are left untouched instead of raising, matching how
        ``count_frequencies`` skips such rows.
    """
    if counts is None:
        # Looked up at call time so the notebook-level variable keeps working.
        counts = removed_lowfreq_words_counts
    for idx, row in enumerate(lista):
        tokens = row[1]
        if not isinstance(tokens, list):
            continue
        kept = [w for w in tokens if counts.get(w, 0) >= min_count]
        lista[idx] = [row[0], kept]
    return lista

# Filter every preprocessed chunk with select() and store the result next to
# the original file under a prefixed "..._final.pkl" name.
for data_path in preprocessed_files:
    # e.g. 'preprocessed_speeches_indexed1.pkl'
    fname = os.path.basename(data_path)

    # -> 'removed_lowfreq_words_preprocessed_speeches_indexed1_final.pkl'
    cleaned_name = "removed_lowfreq_words_" + fname.replace('.pkl', '_final.pkl')
    cleaned_path = os.path.join(data_preprocessed, cleaned_name)

    data = select(joblib.load(data_path))
    joblib.dump(data, cleaned_path)

In [109]:
os.chdir(data_preprocessed)

# Derive the cleaned file names from preprocessed_files using the naming rule
# of the cell that wrote them ('removed_lowfreq_words_' prefix, '_final'
# before '.pkl'). If the number of chunk files is adjusted, this list follows
# automatically instead of silently staying at four hard-coded names.
final_files = []
for source_path in preprocessed_files:
    final_name = "removed_lowfreq_words_" + os.path.basename(source_path).replace('.pkl', '_final.pkl')
    final_files.append(os.path.join(data_preprocessed, final_name))

# Concatenate the [filename, tokens] rows of all cleaned chunk files.
final_data = []
for fname in final_files:
    final_data.extend(joblib.load(fname))

# Left join: every row of df_merged is kept; speeches without a cleaned
# counterpart get a missing value in the new column.
df_final = pd.DataFrame(final_data, columns=["filename", "speech_final_removed_lowfreq_words"])
df_merged_removed_lowfreq_words = df_merged.merge(df_final, on="filename", how="left")


# Token count per speech; anything that is not a list (e.g. a missing value
# from the left join) counts as length 0.
df_merged_removed_lowfreq_words["speech_length_final_removed_lowfreq_words"] = df_merged_removed_lowfreq_words["speech_final_removed_lowfreq_words"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

print(df_merged_removed_lowfreq_words[["filename", "speech_length_final_removed_lowfreq_words"]].head())

# Vocabulary over all cleaned speeches (rows with a missing speech are skipped).
all_tokens_final = [token for speech in df_merged_removed_lowfreq_words["speech_final_removed_lowfreq_words"].dropna() for token in speech]
unique_tokens_final = set(all_tokens_final)
print("Total unique tokens across all final speeches:", len(unique_tokens_final))

# The mean includes the speeches that were assigned length 0 above.
print("Average tokens per final speech:", df_merged_removed_lowfreq_words["speech_length_final_removed_lowfreq_words"].mean())

# Save final merged DataFrame
joblib.dump(df_merged_removed_lowfreq_words, os.path.join(data_c, "un_corpus_merged_removed_lowfreq_words.pkl"))

          filename  speech_length_final_removed_lowfreq_words
0  ARG_01_1946.txt                                        339
1  AUS_01_1946.txt                                        360
2  BEL_01_1946.txt                                        269
3  BLR_01_1946.txt                                        380
4  BOL_01_1946.txt                                        142
Total unique tokens across all final speeches: 9473
Average tokens per final speech: 405.87783053323597


['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\un_corpus_merged_removed_lowfreq_words.pkl']

In [None]:
# Save as CSV: semicolon-separated, UTF-8 encoded, without the index column
csv_path = os.path.join(data_c, "un_corpus_merged_removed_lowfreq_words.csv")
df_merged_removed_lowfreq_words.to_csv(csv_path, sep=';', index=False, encoding='utf-8')