# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 1: Model Traning, Calculation Centroids & Speech Scoring
### Author: Sarah Franzen

In [8]:
# == Import libraries for data processing and NLP ==

import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation) 
tagger = nltk.perceptron.PerceptronTagger()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stemmer = SnowballStemmer("english")

# === Set Working Directory ===

# Prompt user to enter working directory path
wd = Path(input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip())

# Change to the entered working directory
try:
    os.chdir(wd)
    print(f"Working directory set to: {os.getcwd()}")
except FileNotFoundError:
    print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    exit(1)

# === Define Folder Paths ===

# Folders were already created in the script 0_data_creation
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
data_results = data_c / "results"
data_models = data_c / "models" 

# Upload ressources
stopwords = joblib.load(data_c / "ind_stopwords.pkl")         
word_counts = joblib.load(data_freq / "ind_stopwords_word_counts.pkl")
word_counts_weighted = joblib.load(data_freq / "ind_stopwords_word_counts_weighted.pkl")
affect_dic = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition_dic = joblib.load(data_dict / 'dictionary_cognition.pkl') 

Please enter your working directory path (e.g., C:\Users\sarah\OneDrive\Dokumente\Masterarbeit):  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


Working directory set to: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


___

### Sentence Split

In [10]:
sentences_files = [
    os.path.join(data_temp, 'sentences_indexed1.pkl'),
    os.path.join(data_temp, 'sentences_indexed2.pkl'),
    os.path.join(data_temp, 'sentences_indexed3.pkl'),
    os.path.join(data_temp, 'sentences_indexed4.pkl')
]

---

### Train Word2Vec

In [14]:
w2v = Word2Vec.load(str(data_models / "w2v-vectors_8_300.pkl"))
word_vectors = w2v.wv

---

### Calculate Centroids

In [18]:
# == Calculation ==
def findcentroid(text, model):
    vecs = [model.wv[w] * word_counts_weighted[w] 
        for w in text 
        if w in model.wv and w in word_counts_weighted]
    vecs = [v for v in vecs if len(v) > 0]
    centroid = np.mean(vecs, axis=0)
    #centroid = centroid.reshape(1, -1)
    return centroid


affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

os.chdir(data_c)
joblib.dump(affect_centroid, 'centroids/ind_stopwords_affect_centroid.pkl')
joblib.dump(cog_centroid, 'centroids/ind_stopwords_cog_centroid.pkl')

# == Overview Vectors ==
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shape and stats
print("\nShape of affect centroid:", affect_centroid.shape)
print("Shape of cognition centroid:", cog_centroid.shape)

Affect centroid vector:
 [ 4.42340337e-02 -2.09172890e-01 -3.85503657e-02 -2.00820249e-02
 -1.18636310e-01 -2.92804204e-02 -1.19028106e-01 -2.18689188e-01
  1.00538477e-01  2.41437927e-01  8.94445255e-02  2.75274385e-02
  5.16583994e-02 -7.13846907e-02 -1.07583471e-01  1.75162271e-01
  9.50213671e-02 -1.02674469e-01  1.62559412e-02  5.90824373e-02
  1.47709688e-02  8.19741115e-02 -6.17422722e-02 -8.37609619e-02
  1.19066581e-01 -7.59860575e-02  1.16937488e-01 -5.83100729e-02
 -2.70521790e-02  1.37679860e-01 -8.18770677e-02  4.58782688e-02
 -1.11739866e-01 -1.24202237e-01 -7.94388577e-02  1.70388855e-02
 -1.81265399e-01 -1.20354488e-01 -6.97968155e-02  1.19308390e-01
 -1.02443174e-01 -1.38570324e-01 -1.30177122e-02 -8.08719620e-02
  2.49411389e-01 -2.11648405e-01 -3.04324254e-02 -1.22231714e-01
 -5.90274436e-03 -1.69320688e-01  3.07866558e-02  7.22128227e-02
  1.08522289e-02 -9.07894298e-02 -4.27358635e-02  4.13486399e-02
 -5.56628928e-02  1.20879196e-01 -3.89266834e-02  1.09691858e-01


---

### Emotionality Scoring

In [20]:
os.chdir(data_preprocessed)

# Load preprocessed speech data 

preprocessed_final_files = [
   joblib.load(os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed1_final.pkl')),
   joblib.load(os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed2_final.pkl')),
   joblib.load(os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed3_final.pkl')),
   joblib.load(os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed4_final.pkl'))
]

In [21]:
# Function to compute weighted document vectors and derive affective/cognitive distances and scores

def documentvecweight(lista):
    out = []
    lista = [i for i in lista if len(i[1]) > 0]
    for s in lista:
        # Compute weighted word vectors for each token present in the Word2Vec model
        vecs = [w2v.wv[w] * word_counts_weighted[w] for w in s[1] if w in w2v.wv]
        if len(vecs) == 0:
            a = np.nan
            c = np.nan
            score = np.nan
        else:
            # Compute mean vector for each speech
            v = np.mean(vecs, axis=0)
             # Cosine distance to affective centroid
            a = cosine(v, affect_centroid)
            # Cosine distance to cognitive centroid
            c = cosine(v, cog_centroid)
            score = (1 + 1 - a) / (1 + 1 - c)
        out.append([s[0], a, c, score])
    return out


def main_function(file_path, idx):
    dataset = joblib.load(file_path)
    data = documentvecweight(dataset)
    lab = os.path.join(data_results, f'ind_stopwords_distances_main_{idx}.pkl')
    joblib.dump(data, lab)

# Main loop: process all preprocessed speech files
def main():
    files = [
        os.path.join(data_preprocessed, f'ind_stopwords_preprocessed_speeches_indexed{i+1}_final.pkl') #Changed!
        for i in range(4)
    ]
    for i, f in enumerate(files, start=1):
        main_function(f, i)

if __name__ == "__main__":
    main()

# Merge all distance files into one df
DATA_temp = [os.path.join(data_results, f'ind_stopwords_distances_main_{i+1}.pkl') for i in range(4)]

tot = []
for dataname in DATA_temp:
    d = joblib.load(dataname)
    tot += d

tot_df = pd.DataFrame(tot, columns=['filename', 'affect_d', 'cognition_d', 'score'])
joblib.dump(tot_df, os.path.join(data_results, 'ind_stopwords_distances_10epochs.pkl'))

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\results\\ind_stopwords_distances_10epochs.pkl']

In [22]:
print(tot_df.head())

          filename  affect_d  cognition_d     score
0  ARG_01_1946.txt  1.014508     0.598817  0.703328
1  AUS_01_1946.txt  1.290420     0.570782  0.496481
2  BEL_01_1946.txt  1.223814     0.655980  0.577511
3  BLR_01_1946.txt  0.869554     0.817950  0.956343
4  BOL_01_1946.txt  0.581391     0.593964  1.008942


In [23]:
# Load df_merged and merge with tot_df by filename 
un_corpus_merged = pd.read_csv(os.path.join(data_c, "un_corpus_merged.csv"), sep=';', encoding='utf-8') 
un_corpus_scored = un_corpus_merged.merge(tot_df, on="filename", how="left")

joblib.dump(un_corpus_scored, os.path.join(data_results, "ind_stopwords_un_corpus_scored.pkl"))

un_corpus_scored.to_csv(
    os.path.join(data_results, "ind_stopwords_un_corpus_scored.csv"),
    sep=';', 
    index=False, 
    encoding='utf-8'
)

In [24]:
print(un_corpus_scored)

              filename                                             speech  \
0      ARG_01_1946.txt  At the resumption of the first session of the ...   
1      AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2      BEL_01_1946.txt  The\tprincipal organs of the United Nations ha...   
3      BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4      BOL_01_1946.txt  Coming to this platform where so many distingu...   
...                ...                                                ...   
10947  WSM_79_2024.txt  Excellencies,  \nI extend my congratulations t...   
10948  YEM_79_2024.txt  Your Majesties, Excellencies, and Highnesses, ...   
10949  ZAF_79_2024.txt  President of the 79th Session of the UN Genera...   
10950  ZMB_79_2024.txt  \n  YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...   
10951  ZWE_79_2024.txt  Your Excellency, Mr. Philemon Yang, President ...   

      country_code  year  country_name  speech_length_words  \
0           

In [25]:
# Count where affect_d is NaN
nan_count = un_corpus_scored['affect_d'].isna().sum()

print("Count where affect_d is NaN:", nan_count)

Count where affect_d is NaN: 0
