# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Additional Analysis: Different Calculation of Weighted Frequencies - Model Training, Centroid Calculation & Speech Scoring
### Author: Sarah Franzen

### Description

In the replication package, weighted frequencies are calculated on the full preprocessed corpus (35,009 unique words; 4,500,778 tokens), while the embedding corpus drops words occurring fewer than 10 times (9,453 unique words; 4,286,666 tokens). This script examines whether calculating the weighted frequencies after removing these low-frequency words makes any difference. The sentence split from the main script is reused, because it is not affected by how the weighted frequencies are calculated.

In [4]:
# == Import libraries for data processing and NLP ==

import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)
tagger = nltk.perceptron.PerceptronTagger()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stemmer = SnowballStemmer("english")

# === Set Working Directory ===

# Prompt user to enter working directory path
wd = Path(input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip())

# Change to the entered working directory
try:
    os.chdir(wd)
    print(f"Working directory set to: {os.getcwd()}")
except FileNotFoundError:
    print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    # Raise SystemExit directly: the `exit` helper is meant for the interactive
    # interpreter and should not be relied on inside programs.
    raise SystemExit(1)

# Make the working directory absolute. Later cells call os.chdir() again, so a
# relative `wd` (e.g. ".") would make every path derived from it point to the
# wrong place after the first directory change.
wd = Path.cwd()

# === Define Folder Paths ===

# Folders were already created in the script 0_data_creation
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
data_results = data_c / "results"
data_models = data_c / "models"

# Load resources
# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only load files that were produced by the earlier scripts of this project.
stopwords = joblib.load(data_c / "stopwords.pkl")
word_counts = joblib.load(data_freq / "removed_lowfreq_words_word_counts.pkl")
word_counts_weighted = joblib.load(data_freq / "removed_lowfreq_words_word_counts_weighted.pkl")
affect_dic = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition_dic = joblib.load(data_dict / 'dictionary_cognition.pkl')

Please enter your working directory path (e.g., C:\Users\sarah\OneDrive\Dokumente\Masterarbeit):  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


Working directory set to: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


___

### Sentence Split

In [7]:
# Paths of the four indexed sentence chunks stored in the temp folder
sentences_files = [
    os.path.join(data_temp, f'sentences_indexed{chunk}.pkl')
    for chunk in range(1, 5)
]

---

### Load Trained Word2Vec Model

In [10]:
# Load the Word2Vec model stored in the models folder and expose its keyed vectors
w2v_model_file = str(data_models / "w2v-vectors_8_300.pkl")
w2v = Word2Vec.load(w2v_model_file)
word_vectors = w2v.wv

---

### Calculate Centroids

In [13]:
# == Calculation ==
def findcentroid(text, model):
    """Return the frequency-weighted centroid of the words in ``text``.

    Every word that is present both in ``model.wv`` and in the module-level
    ``word_counts_weighted`` table contributes its embedding multiplied by
    its weight; all other words are ignored. The centroid is the
    element-wise mean of these weighted vectors.

    Parameters
    ----------
    text : iterable of str
        Words (e.g. a dictionary word list) to average over.
    model : object
        Model exposing ``.wv``, which must support ``word in model.wv`` and
        ``model.wv[word]``.

    Returns
    -------
    numpy.ndarray
        Mean of the weighted word vectors.

    Raises
    ------
    ValueError
        If no word of ``text`` is found in both the model vocabulary and the
        weight table. Without this check ``np.mean`` of an empty list would
        silently yield a NaN scalar that is then used as the centroid.
    """
    vectors = model.wv
    vecs = [
        vectors[w] * word_counts_weighted[w]
        for w in text
        if w in vectors and w in word_counts_weighted
    ]
    if not vecs:
        raise ValueError(
            "Cannot compute centroid: none of the given words is present in "
            "both the embedding vocabulary and the weighted word counts."
        )
    return np.mean(vecs, axis=0)


# Centroids of the affect and cognition dictionaries in the embedding space
affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

# Store both centroids; the target paths are relative to the data folder
os.chdir(data_c)
for centroid, target in (
    (affect_centroid, 'centroids/removed_lowfreq_words_affect_centroid.pkl'),
    (cog_centroid, 'centroids/removed_lowfreq_words_cog_centroid.pkl'),
):
    joblib.dump(centroid, target)

# == Overview Vectors ==
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shapes of both centroids
affect_shape = affect_centroid.shape
cog_shape = cog_centroid.shape
print("\nShape of affect centroid:", affect_shape)
print("Shape of cognition centroid:", cog_shape)

Affect centroid vector:
 [ 0.12585203 -0.02392351 -0.2271096  -0.11951966  0.02829143  0.00029546
  0.11683416 -0.10151469 -0.07923671  0.02477175 -0.13887407  0.11289882
 -0.07312468 -0.07389244  0.0187364   0.04294862 -0.11665519  0.0233175
 -0.0275991   0.1069124  -0.13994303 -0.0801814  -0.06116669  0.05079933
  0.11222421  0.18913731 -0.04710755 -0.03058767 -0.1001954   0.25786367
 -0.01796607 -0.13737877 -0.02373359  0.07157335  0.10226793 -0.01060348
  0.04115681  0.07488894 -0.12771599  0.12911698 -0.04538443 -0.18144502
 -0.12025422 -0.05127291 -0.014183   -0.07427991  0.22335163  0.17401156
 -0.00284594 -0.1044222  -0.00801853 -0.00475153 -0.15208797 -0.00408498
  0.01342905 -0.10713586  0.10932075 -0.08576442  0.0017823   0.18433638
 -0.06518261  0.09092978  0.07002422 -0.10747878  0.07531368 -0.08674558
  0.19473568 -0.24239816 -0.03207047 -0.20926622 -0.09958109  0.04387255
 -0.13151264 -0.02240443  0.02450578  0.03582343  0.17439759  0.05656255
 -0.05422827 -0.01641588  0

---

### Emotionality Scoring

In [16]:
os.chdir(data_preprocessed)

# Load preprocessed speech data

# Paths of the four preprocessed speech chunks. All four follow the same
# naming pattern 'preprocessed_speeches_indexed<N>_final.pkl', which is also
# the pattern main() uses when it builds its own file list.
preprocessed_final_files = [
    os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{i}_final.pkl')
    for i in range(1, 5)
]

In [17]:
# Function to compute weighted document vectors and derive affective/cognitive distances and scores

def documentvecweight(lista):
    """Score documents by their distance to the affect and cognition centroids.

    Relies on the module-level ``w2v``, ``word_counts_weighted``,
    ``affect_centroid`` and ``cog_centroid``.

    Parameters
    ----------
    lista : iterable
        Items are indexed as ``item[0]`` (identifier copied to the output)
        and ``item[1]`` (sequence of tokens). Items whose token sequence is
        empty are dropped and produce no output row.

    Returns
    -------
    list of list
        One ``[identifier, a, c, score]`` row per kept item, where ``a`` and
        ``c`` are the cosine distances of the weighted mean document vector to
        the affect and cognition centroid and ``score = (2 - a) / (2 - c)``.
        If none of the tokens can be used, ``a``, ``c`` and ``score`` are NaN.
    """
    vectors = w2v.wv
    out = []
    for s in lista:
        tokens = s[1]
        if len(tokens) == 0:
            continue
        # Weighted word vectors. A token is only used when it has both an
        # embedding and a weight (same rule as in findcentroid); a token with
        # an embedding but no weight would otherwise raise a KeyError.
        vecs = [
            vectors[w] * word_counts_weighted[w]
            for w in tokens
            if w in vectors and w in word_counts_weighted
        ]
        if len(vecs) == 0:
            a = np.nan
            c = np.nan
            score = np.nan
        else:
            # Mean vector of the document
            v = np.mean(vecs, axis=0)
            # Cosine distance to affective centroid
            a = cosine(v, affect_centroid)
            # Cosine distance to cognitive centroid
            c = cosine(v, cog_centroid)
            # Cosine distance is 1 - similarity, so this is the ratio
            # (1 + affect similarity) / (1 + cognition similarity)
            score = (1 + 1 - a) / (1 + 1 - c)
        out.append([s[0], a, c, score])
    return out


def main_function(file_path, idx):
    """Score one preprocessed speech file and store the result.

    Loads ``file_path`` with joblib, passes the content to
    ``documentvecweight`` and dumps the returned rows to
    ``removed_lowfreq_words_distances_main_<idx>.pkl`` in the results folder.
    """
    scored_rows = documentvecweight(joblib.load(file_path))
    target = os.path.join(data_results, f'removed_lowfreq_words_distances_main_{idx}.pkl')
    joblib.dump(scored_rows, target)

# Main loop: process all preprocessed speech files
def main():
    """Run ``main_function`` on the four preprocessed speech files (index 1 to 4)."""
    for part in range(1, 5):
        speech_file = os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{part}_final.pkl')
        main_function(speech_file, part)

if __name__ == "__main__":
    main()

# Collect the four per-chunk distance files and combine them into one DataFrame
DATA_temp = [
    os.path.join(data_results, f'removed_lowfreq_words_distances_main_{part}.pkl')
    for part in range(1, 5)
]

tot = []
for dataname in DATA_temp:
    d = joblib.load(dataname)
    tot.extend(d)

distance_columns = ['filename', 'affect_d', 'cognition_d', 'score']
tot_df = pd.DataFrame(tot, columns=distance_columns)
joblib.dump(tot_df, os.path.join(data_results, 'removed_lowfreq_words_distances_10epochs.pkl'))

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\results\\removed_lowfreq_words_distances_10epochs.pkl']

In [18]:
# Plain-text preview of the first five rows of the combined distance table
print(tot_df.head(n=5))

          filename  affect_d  cognition_d     score
0  ARG_01_1946.txt  0.944931     0.618719  0.763834
1  AUS_01_1946.txt  1.270542     0.607406  0.523813
2  BEL_01_1946.txt  1.179123     0.684148  0.623836
3  BLR_01_1946.txt  0.812188     0.798809  0.988862
4  BOL_01_1946.txt  0.511304     0.567441  1.039186


In [19]:
# Read the cleaned corpus (semicolon-separated CSV in the data folder)
un_corpus_cleaned_final = pd.read_csv(os.path.join(data_c, "un_corpus_cleaned_final.csv"), sep=';', encoding='utf-8')

# Attach distances and score to each speech by filename. The left join keeps
# every row of the cleaned corpus, also those without a matching score row.
un_corpus_scored = un_corpus_cleaned_final.merge(tot_df, on="filename", how="left")

# Store the scored corpus as pickle and as semicolon-separated CSV
joblib.dump(un_corpus_scored, os.path.join(data_results, "removed_lowfreq_words_un_corpus_scored.pkl"))

un_corpus_scored.to_csv(
    os.path.join(data_results, "removed_lowfreq_words_un_corpus_scored.csv"),
    sep=';',
    index=False,
    encoding='utf-8'
)

In [20]:
# Plain-text overview of the scored corpus
print(str(un_corpus_scored))

              filename                                             speech  \
0      ARG_01_1946.txt  At the resumption of the first session of the ...   
1      AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2      BEL_01_1946.txt  The\tprincipal organs of the United Nations ha...   
3      BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4      BOL_01_1946.txt  Coming to this platform where so many distingu...   
...                ...                                                ...   
10947  WSM_79_2024.txt  Excellencies,  \nI extend my congratulations t...   
10948  YEM_79_2024.txt  Your Majesties, Excellencies, and Highnesses, ...   
10949  ZAF_79_2024.txt  President of the 79th Session of the UN Genera...   
10950  ZMB_79_2024.txt  \n  YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...   
10951  ZWE_79_2024.txt  Your Excellency, Mr. Philemon Yang, President ...   

      country_code  year  country_name  speech_length_words  \
0           

In [21]:
# Number of speeches for which no affect distance is available
affect_missing = un_corpus_scored['affect_d'].isna()
nan_count = affect_missing.sum()

print("Count where affect_d is NaN:", nan_count)

Count where affect_d is NaN: 0
