# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 1: Model Training, Centroid Calculation & Speech Scoring
### Author: Sarah Franzen

In [1]:
# == Import libraries for data processing and NLP ==

import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)  # translation table that deletes ASCII punctuation
tagger = nltk.perceptron.PerceptronTagger()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stemmer = SnowballStemmer("english")

# === Set Working Directory ===

# Prompt user to enter working directory path.
# The path is made absolute *before* changing directory: every folder path below is
# derived from `wd`, and later cells call os.chdir() again, so a relative `wd` would
# point somewhere else as soon as the current directory changes.
wd = Path(
    input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip()
).resolve()

# Change to the entered working directory
try:
    os.chdir(wd)
except (FileNotFoundError, NotADirectoryError) as err:
    # Raise instead of calling exit(): an exception stops the cell (and "Run All")
    # with a readable message, without asking the notebook kernel to shut down.
    raise FileNotFoundError(
        f"The directory you entered does not exist: {wd}. Please re-run this cell and enter a valid path."
    ) from err
print(f"Working directory set to: {os.getcwd()}")

# === Define Folder Paths ===

# Folders were already created in the script 0_data_creation
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
data_results = data_c / "results"
data_models = data_c / "models"

# Load resources
# NOTE: joblib.load unpickles its input, which can execute arbitrary code.
# Only load files that were produced by this project's own scripts.
stopwords = joblib.load(data_c / "stopwords.pkl")
word_counts = joblib.load(data_freq / "removed_lowfreq_words_word_counts.pkl")
word_counts_weighted = joblib.load(data_freq / "removed_lowfreq_words_word_counts_weighted.pkl")
affect_dic = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition_dic = joblib.load(data_dict / 'dictionary_cognition.pkl')

Please enter your working directory path (e.g., C:\Users\sarah\OneDrive\Dokumente\Masterarbeit):  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


Working directory set to: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


___

### Sentence Split

In [4]:
# Full paths of the four indexed sentence chunks stored in the temp folder
sentences_files = [
    os.path.join(data_temp, chunk_name)
    for chunk_name in (
        'sentences_indexed1.pkl',
        'sentences_indexed2.pkl',
        'sentences_indexed3.pkl',
        'sentences_indexed4.pkl',
    )
]

---

### Train Word2Vec

In [8]:
# Load the saved Word2Vec model from the models folder and keep a handle on its word vectors
w2v_model_file = data_models / "w2v-vectors_8_300.pkl"
w2v = Word2Vec.load(str(w2v_model_file))
word_vectors = w2v.wv

---

### Calculate Centroids

In [12]:
# == Calculation ==
def findcentroid(text, model):
    """Return the frequency-weighted centroid of the words in ``text``.

    Every word that is present both in ``model.wv`` and in the module-level
    ``word_counts_weighted`` mapping contributes its embedding multiplied by
    its weight. The centroid is the plain arithmetic mean of those scaled
    vectors (divided by the number of contributing words, not by the sum of
    the weights).

    Parameters
    ----------
    text : iterable of str
        Dictionary words to average.
    model : object
        Anything exposing ``.wv`` with membership test and item lookup
        (the notebook passes the loaded Word2Vec model).

    Returns
    -------
    numpy.ndarray
        One vector with the same length as the word embeddings.

    Raises
    ------
    ValueError
        If no word of ``text`` has both an embedding and a weight. Without
        this check ``np.mean`` of an empty list yields a scalar NaN that
        would be stored as the centroid and propagate into every score.
    """
    vecs = [
        model.wv[w] * word_counts_weighted[w]
        for w in text
        if w in model.wv and w in word_counts_weighted
    ]
    if not vecs:
        raise ValueError(
            "Cannot compute centroid: none of the given words has both an embedding and a weight."
        )
    return np.mean(vecs, axis=0)


# Centroids of the affect and cognition dictionaries in the embedding space
affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

# The change of directory is kept because other cells may rely on it
os.chdir(data_c)

# The setup cell's folder list does not include a "centroids" folder, so create it
# if it is missing; joblib.dump would otherwise fail with FileNotFoundError.
# Path.cwd() is the data folder at this point, i.e. the same location the
# relative 'centroids/...' paths pointed to.
centroid_dir = Path.cwd() / 'centroids'
centroid_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(affect_centroid, centroid_dir / 'removed_lowfreq_words_affect_centroid.pkl')
joblib.dump(cog_centroid, centroid_dir / 'removed_lowfreq_words_cog_centroid.pkl')

# == Overview Vectors ==
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shape and stats
print("\nShape of affect centroid:", affect_centroid.shape)
print("Shape of cognition centroid:", cog_centroid.shape)

Affect centroid vector:
 [ 0.04098465 -0.20014092 -0.03026764 -0.00716365 -0.10430026 -0.02335468
 -0.1111676  -0.20361476  0.08356902  0.22238363  0.07749434  0.02523689
  0.037342   -0.0744079  -0.11382581  0.16497582  0.08343158 -0.09785923
  0.01826118  0.0551446   0.01376214  0.07867698 -0.06524096 -0.07510424
  0.11823087 -0.07123741  0.10032862 -0.0591263  -0.02780137  0.12810089
 -0.07902613  0.0348121  -0.09730709 -0.11689087 -0.07616834  0.01455656
 -0.16822559 -0.11179098 -0.05449464  0.11942889 -0.10697855 -0.11918097
 -0.01142666 -0.07971946  0.23309235 -0.19611713 -0.03424549 -0.11344792
 -0.01163713 -0.14985465  0.03287108  0.07040888  0.00893495 -0.08395618
 -0.04318048  0.0518188  -0.04054963  0.11539264 -0.05022069  0.09967262
  0.07365515 -0.18962361 -0.18084814 -0.07160484 -0.12425017 -0.11308578
 -0.0224801   0.14499588 -0.08023033  0.05781999 -0.05727304  0.17754196
 -0.01714146 -0.01159203  0.09880207 -0.0307006   0.05839084 -0.16276145
  0.10359225  0.02003118  

---

### Emotionality Scoring

In [23]:
os.chdir(data_preprocessed)

# Full paths of the four preprocessed speech chunks

preprocessed_final_files = [
    os.path.join(data_preprocessed, chunk_name)
    for chunk_name in (
        'removed_lowfreq_words_preprocessed_speeches_indexed1_final.pkl',
        'removed_lowfreq_words_preprocessed_speeches_indexed2_final.pkl',
        'removed_lowfreq_words_preprocessed_speeches_indexed3_final.pkl',
        'removed_lowfreq_words_preprocessed_speeches_indexed4_final.pkl',
    )
]

In [25]:
# Function to compute weighted document vectors and derive affective/cognitive distances and scores

def documentvecweight(lista):
    """Score each speech by its distance to the affect and cognition centroids.

    Parameters
    ----------
    lista : iterable
        Items indexable as ``item[0]`` (speech identifier) and ``item[1]``
        (sequence of tokens). Items with an empty token sequence are dropped.

    Returns
    -------
    list of list
        One ``[identifier, a, c, score]`` row per kept item, where ``a`` and
        ``c`` are the cosine distances of the speech vector to
        ``affect_centroid`` and ``cog_centroid`` and
        ``score = (2 - a) / (2 - c)``. All three values are NaN when none of
        the tokens can be embedded.

    Notes
    -----
    Reads the module-level ``w2v``, ``word_counts_weighted``,
    ``affect_centroid`` and ``cog_centroid``.
    """
    out = []
    for s in lista:
        doc_id, tokens = s[0], s[1]
        if len(tokens) == 0:
            continue
        # A token needs both an embedding and a weight. Checking only the
        # embedding raised KeyError for words missing from word_counts_weighted;
        # findcentroid applies the same two guards.
        vecs = [
            w2v.wv[w] * word_counts_weighted[w]
            for w in tokens
            if w in w2v.wv and w in word_counts_weighted
        ]
        if len(vecs) == 0:
            a = np.nan
            c = np.nan
            score = np.nan
        else:
            # Mean of the weighted word vectors represents the speech
            v = np.mean(vecs, axis=0)
            # Cosine distance to affective centroid
            a = cosine(v, affect_centroid)
            # Cosine distance to cognitive centroid
            c = cosine(v, cog_centroid)
            # cosine distance = 1 - cosine similarity, so this equals
            # (1 + sim_affect) / (1 + sim_cognition)
            score = (1 + 1 - a) / (1 + 1 - c)
        out.append([doc_id, a, c, score])
    return out


def main_function(file_path, idx):
    """Score one preprocessed speech file and store the rows in the results folder.

    Loads ``file_path`` with joblib, passes the content to
    ``documentvecweight`` and dumps the returned rows to
    ``removed_lowfreq_words_distances_main_<idx>.pkl`` inside ``data_results``.
    """
    scored_rows = documentvecweight(joblib.load(file_path))
    target = os.path.join(data_results, f'removed_lowfreq_words_distances_main_{idx}.pkl')
    joblib.dump(scored_rows, target)

# Main loop: process all preprocessed speech files
def main():
    """Run ``main_function`` on each of the four preprocessed speech files, numbered 1 to 4."""
    for i in range(4):
        speech_file = os.path.join(
            data_preprocessed,
            f'removed_lowfreq_words_preprocessed_speeches_indexed{i+1}_final.pkl',
        )
        main_function(speech_file, i + 1)

if __name__ == "__main__":
    main()

# Collect the four per-chunk distance files and stack their rows into one df
DATA_temp = [
    os.path.join(data_results, f'removed_lowfreq_words_distances_main_{i+1}.pkl')
    for i in range(4)
]

tot = []
for dataname in DATA_temp:
    tot.extend(joblib.load(dataname))

tot_df = pd.DataFrame(tot, columns=['filename', 'affect_d', 'cognition_d', 'score'])
joblib.dump(tot_df, os.path.join(data_results, 'removed_lowfreq_words_distances_10epochs.pkl'))

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\results\\removed_lowfreq_words_distances_10epochs.pkl']

In [26]:
# Preview the first rows of the merged distance table
print(tot_df.head())

          filename  affect_d  cognition_d     score
0  ARG_01_1946.txt  0.948729     0.628930  0.766752
1  AUS_01_1946.txt  1.261936     0.597479  0.526241
2  BEL_01_1946.txt  1.164039     0.683792  0.635129
3  BLR_01_1946.txt  0.809221     0.828964  1.016859
4  BOL_01_1946.txt  0.506093     0.577558  1.050241


In [33]:
# Read the merged corpus table and attach the per-speech distances and scores via the filename key
corpus_csv = os.path.join(data_c, "un_corpus_merged_removed_lowfreq_words.csv")
un_corpus_merged = pd.read_csv(corpus_csv, sep=';', encoding='utf-8')
un_corpus_scored = pd.merge(un_corpus_merged, tot_df, on="filename", how="left")

# Store the scored corpus both as a pickle and as a semicolon-separated CSV
scored_pkl = os.path.join(data_results, "removed_lowfreq_words_un_corpus_scored.pkl")
scored_csv = os.path.join(data_results, "removed_lowfreq_words_un_corpus_scored.csv")

joblib.dump(un_corpus_scored, scored_pkl)

un_corpus_scored.to_csv(scored_csv, sep=';', index=False, encoding='utf-8')

In [None]:
# Show the merged corpus with the distance and score columns attached
print(un_corpus_scored)

In [None]:
# Number of rows whose affect_d value is NaN
affect_is_missing = un_corpus_scored['affect_d'].isna()
nan_count = affect_is_missing.sum()

print("Count where affect_d is NaN:", nan_count)