# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 1: Model Training, Centroid Calculation & Speech Scoring
### Author: Sarah Franzen

In [41]:
# == Import libraries for data processing and NLP ==

import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# === Initialize NLP Tools ===

# Translation table that deletes every character listed in string.punctuation
translator = str.maketrans('', '', punctuation)
# Part-of-speech tagger; used by extract_sentences to filter tokens by tag
tagger = nltk.perceptron.PerceptronTagger()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# English Snowball stemmer; used by extract_sentences
stemmer = SnowballStemmer("english")

# === Set Working Directory ===

# Prompt user to enter working directory path
wd_input = input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip()

# Change to the entered working directory
try:
    os.chdir(wd_input)
except (FileNotFoundError, NotADirectoryError):
    print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    # Re-raise instead of calling exit(): the following cells must not run
    # against an invalid working directory.
    raise

# Keep the working directory as a Path object: every folder below is built
# with the "/" operator, which is not defined for plain strings.
wd = Path.cwd()
print(f"Working directory set to: {wd}")

# === Define Folder Paths ===

# Folders were already created in the script 0_data_creation
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
data_results = data_c / "results"
data_models = data_c / "models"

# Output folders directly below the working directory. Later cells write the
# trained model to wd_models and the distance/score files to wd_results.
wd_results = wd / "results"
wd_models = wd / "models"
for out_dir in (wd_results, wd_models):
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Folder checked/created: {out_dir}")

# Load resources
stopwords = joblib.load(data_c / "stopwords.pkl")
word_counts = joblib.load(data_freq / "word_counts.pkl")
word_counts_weighted = joblib.load(data_freq / "word_counts_weighted.pkl")
affect_dic = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition_dic = joblib.load(data_dict / 'dictionary_cognition.pkl')

Folder checked/created: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\results
Folder checked/created: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\models


In [42]:
os.chdir(data_temp)

# Paths (as strings) of the four cleaned speech files stored in data_temp
cleaned_files = [
    str(data_temp / f'clean_speeches_indexed{part}.pkl')
    for part in range(1, 5)
]

___

### Sentence Split

In [45]:
os.chdir(data_sent)

# Function to split cleaned speeches (clean_speeches) into sentences, tokenize, clean, tag, stem, filter, and save them.

def extract_sentences(dataname):
    """Turn one file of cleaned speeches into shuffled, filtered token lists.

    Parameters
    ----------
    dataname : str or path-like
        Path of a joblib file holding a sequence of records whose second
        element (index 1) is the speech text.

    Side effects
    ------------
    Writes the processed sentences next to the input file; the output file
    name is the input file name with 'clean_speeches_' replaced by
    'sentences_'. Prints progress information and a five-sentence preview.

    Processing steps, in order: sentence split (NLTK), drop one-word
    sentences, tokenise (gensim simple_preprocess), drop pure-digit tokens
    and tokens of length <= 2, keep tokens whose POS tag starts with N, V or
    J (presumably noun/verb/adjective tags), stem, drop stopwords, drop stems
    counted fewer than 10 times in word_counts, drop sentences left with
    fewer than two tokens, shuffle.
    """
    records = joblib.load(dataname)
    texts = [record[1] for record in records]  # keep only text, no id

    sentences = []
    for doc in texts:
        sentences.extend(sent_tokenize(doc))  # use nltk's sent_tokenize here

    sentences = [item for item in sentences if len(item.split()) > 1]
    sentences = [gensim.utils.simple_preprocess(item) for item in sentences]

    sentences = [[tok for tok in s if not tok.isdigit()] for s in sentences]
    sentences = [[tok for tok in s if len(tok) > 2] for s in sentences]

    # Tag on the unstemmed tokens, then keep only the words with wanted tags
    sentences = [tagger.tag(s) for s in sentences]
    sentences = [[word for word, tag in s if tag.startswith(('N', 'V', 'J'))] for s in sentences]

    sentences = [[stemmer.stem(tok) for tok in s] for s in sentences]
    sentences = [[tok for tok in s if tok not in stopwords] for s in sentences]
    # .get(): a stem missing from word_counts counts as 0 instead of raising
    # KeyError (and does not insert keys if word_counts is a defaultdict).
    sentences = [[tok for tok in s if word_counts.get(tok, 0) >= 10] for s in sentences]

    dropped_count = sum(1 for s in sentences if len(s) <= 1)
    print(f"Number of very short sentences being dropped: {dropped_count}")

    sentences = [s for s in sentences if len(s) > 1]
    shuffle(sentences)

    # Rename only the file-name component so directory names that happen to
    # contain 'clean_speeches_' are left alone; also works for Path inputs.
    source = Path(dataname)
    out_name = source.name.replace('clean_speeches_', 'sentences_').replace('_.pkl', '.pkl')
    lab = str(source.with_name(out_name))
    print(f'{dataname} processed')
    joblib.dump(sentences, lab)

    unique_tokens = set(token for s in sentences for token in s)
    print(f"Unique tokens: {len(unique_tokens)}")

    # Print preview of first 5 processed sentences
    print("Example sentences (first 5):")
    for s in sentences[:5]:
        print(s)

    print(f'{lab} saved')

# Process every cleaned speech file in turn
for cleaned_path in cleaned_files:
    extract_sentences(cleaned_path)

Number of very short sentences being dropped: 4928
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl processed
Unique tokens: 12289
Example sentences (first 5):
['thousand', 'class', 'peasant', 'middl', 'class', 'famili', 'held', 'yoke', 'poverti', 'result', 'labour', 'black', 'list']
['technic', 'cooper', 'assist', 'activ', 'develop', 'crimin', 'justic', 'capac', 'develop', 'accord', 'high', 'prioriti']
['conclud', 'share', 'tradit', 'bless', 'mauri', 'tabomoa']
['overwhelm', 'concern', 'entir', 'world', 'prevent', 'nuclear', 'war', 'accompani', 'threat', 'total', 'annihil']
['decis', 'taken', 'object', 'halt', 'action', 'govern', 'cuba', 'known', 'design', 'promot', 'financ', 'direct', 'subvers', 'movement', 'latin', 'american']
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed1.pkl saved
Number of very short sentences being dropped: 4990
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl pr

In [46]:
# Pick the first file to check what the sentence split looks like
file_path = os.path.join(data_temp, 'sentences_indexed1.pkl')
sentences = joblib.load(file_path)

print("Example sentences (first 5):")
for position in range(min(5, len(sentences))):
    print(sentences[position])

Example sentences (first 5):
['thousand', 'class', 'peasant', 'middl', 'class', 'famili', 'held', 'yoke', 'poverti', 'result', 'labour', 'black', 'list']
['technic', 'cooper', 'assist', 'activ', 'develop', 'crimin', 'justic', 'capac', 'develop', 'accord', 'high', 'prioriti']
['conclud', 'share', 'tradit', 'bless', 'mauri', 'tabomoa']
['overwhelm', 'concern', 'entir', 'world', 'prevent', 'nuclear', 'war', 'accompani', 'threat', 'total', 'annihil']
['decis', 'taken', 'object', 'halt', 'action', 'govern', 'cuba', 'known', 'design', 'promot', 'financ', 'direct', 'subvers', 'movement', 'latin', 'american']


In [47]:
# Full paths of the four sentence files written to data_temp
sentences_files = [
    os.path.join(data_temp, f'sentences_indexed{part}.pkl')
    for part in range(1, 5)
]

In [48]:
# == Get sum of unique tokens ==
# Collect the vocabulary over all sentence files (each file holds a list of
# token lists) and report its size.
all_unique_tokens = set()

for dataname in sentences_files:
    data = joblib.load(dataname)
    for sentence in data:
        all_unique_tokens.update(sentence)

print(f"Total unique tokens across all files: {len(all_unique_tokens)}")

# NOTE: sentences_files is deliberately NOT reassigned to bare file names
# here. Bare names are resolved against the current working directory, which
# is not the folder the sentence files were written to, so later cells could
# load missing or stale files. The full paths defined earlier stay in effect.

Total unique tokens across all files: 12480


---

### Train Word2Vec

In [51]:
# Gather the sentences of all files into one training corpus
dataset = []

for dataname in sentences_files:
    data = joblib.load(dataname)
    dataset.extend(data)

# === Model training ===
w2v = Word2Vec(
    sentences=dataset,
    vector_size=300,      # Dimension of the vector
    window=8,             # Context window size
    min_count=10,         # Minimum word count
    workers=8,            # Number of threads
    sample=1e-3,          # Downsample setting for frequent words
    epochs=10,            # Number of iterations over the corpus
    seed=12               # Random seed (the keyword is `seed`; `seeds` is not accepted)
)
# NOTE(review): per the gensim documentation a fully reproducible run also
# needs workers=1 and a fixed PYTHONHASHSEED; with 8 worker threads results
# can still vary slightly between runs - confirm whether that matters here.

# Compute the vector norms once so later similarity queries can reuse them
w2v.wv.fill_norms()

# Save model
wd_models.mkdir(parents=True, exist_ok=True)
w2v.save(str(wd_models / 'w2v-vectors_8_300.pkl'))

In [52]:
# Reload the trained model from disk and keep a handle on its word vectors
w2v_model_file = str(wd_models / "w2v-vectors_8_300.pkl")
w2v = Word2Vec.load(w2v_model_file)
word_vectors = w2v.wv

---

### Calculate Centroids

In [55]:
# == Calculation ==
def findcentroid(text, model, weights=None):
    """Return the mean of the weighted word vectors of the words in `text`.

    Parameters
    ----------
    text : iterable of str
        Words to average (e.g. the entries of a dictionary word list).
    model : object with a `.wv` mapping
        Supplies the word vectors; a word is used only if `w in model.wv`.
    weights : mapping of str -> number, optional
        Per-word factor each vector is multiplied by before averaging.
        Defaults to the module-level `word_counts_weighted`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of `model.wv[w] * weights[w]` over all words that
        occur in both `model.wv` and `weights`.

    Raises
    ------
    ValueError
        If no word of `text` is present in both `model.wv` and `weights`;
        without this check the mean of an empty list would silently be NaN.
    """
    if weights is None:
        weights = word_counts_weighted

    vecs = [model.wv[w] * weights[w]
            for w in text
            if w in model.wv and w in weights]
    if not vecs:
        raise ValueError("findcentroid: none of the given words has both a vector and a weight")

    return np.mean(vecs, axis=0)


# One centroid per dictionary, computed from the trained Word2Vec model
affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

os.chdir(data_c)
# Make sure the target folder exists; joblib.dump does not create it
Path('centroids').mkdir(parents=True, exist_ok=True)
joblib.dump(affect_centroid, 'centroids/affect_centroid.pkl')
joblib.dump(cog_centroid, 'centroids/cog_centroid.pkl')

# == Overview Vectors ==
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shape of both centroids
print("\nShape of affect centroid:", affect_centroid.shape)
print("Shape of cognition centroid:", cog_centroid.shape)

Affect centroid vector:
 [-0.0736924  -0.02615644  0.08641668  0.11732437  0.1528423   0.24897972
  0.14337324  0.23880306  0.03186733 -0.30849063  0.15468785 -0.06546544
 -0.04732803 -0.04446076 -0.22914626  0.13254197  0.23568878 -0.2516948
  0.27152127  0.1225189  -0.17335808  0.18330842  0.09306473  0.15133058
  0.7069783  -0.1064485  -0.2377337   0.10650271  0.07058039 -0.02874511
 -0.1373579  -0.00215214 -0.2274221   0.20105839  0.19398606  0.2080281
 -0.1449763  -0.32416582  0.27860564 -0.10978469 -0.05237249  0.16225633
  0.30137467  0.01800226  0.07078476 -0.05441128 -0.14705361 -0.11145094
 -0.01225633  0.27423495 -0.01179702  0.10230152 -0.01881051  0.07042564
  0.06828374  0.20510022  0.33411083  0.14199427  0.2653098   0.01785835
 -0.08340826  0.18939826  0.0608794  -0.0969384   0.29247656  0.1895983
 -0.14109848  0.1671956   0.2214419   0.16436344 -0.14708567  0.05495733
  0.06584255 -0.1595384   0.25399894 -0.15582773 -0.21654704 -0.00891316
 -0.07640563  0.0519471  -0.1

---

### Emotionality Scoring

In [58]:
os.chdir(data_preprocessed)

# Load the four preprocessed speech files into memory, in file order
preprocessed_final_files = [
    joblib.load(os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{part}_final.pkl'))
    for part in range(1, 5)
]

In [59]:
# Function to compute weighted document vectors and derive affective/cognitive distances and scores

def documentvecweight(lista):
    """Score each document by its distance to the affect and cognition centroids.

    Parameters
    ----------
    lista : iterable of sequences
        Each record holds an identifier at index 0 and a list of tokens at
        index 1. Records with an empty token list are skipped.

    Returns
    -------
    list of [identifier, a, c, score]
        `a` and `c` are the cosine distances (scipy.spatial.distance.cosine)
        between the document vector and `affect_centroid` / `cog_centroid`;
        `score` is (2 - a) / (2 - c). All three are NaN when no token of the
        document has both a word vector and a weight.

    Uses the module-level `w2v`, `word_counts_weighted`, `affect_centroid`
    and `cog_centroid`.
    """
    out = []
    for record in lista:
        doc_id, tokens = record[0], record[1]
        if len(tokens) == 0:
            continue

        # Weighted word vectors; the check against word_counts_weighted
        # mirrors findcentroid and avoids a KeyError for words that are in
        # the model vocabulary but have no weight.
        vecs = [w2v.wv[w] * word_counts_weighted[w]
                for w in tokens
                if w in w2v.wv and w in word_counts_weighted]

        if len(vecs) == 0:
            a = np.nan
            c = np.nan
            score = np.nan
        else:
            # Mean vector of the speech
            v = np.mean(vecs, axis=0)
            # Cosine distance to affective centroid
            a = cosine(v, affect_centroid)
            # Cosine distance to cognitive centroid
            c = cosine(v, cog_centroid)
            # Each term is 1 + (1 - distance); the score is their ratio
            score = (1 + 1 - a) / (1 + 1 - c)
        out.append([doc_id, a, c, score])
    return out


def main_function(file_path, idx):
    """Score one preprocessed speech file and save the result.

    Loads `file_path` with joblib, passes the content to documentvecweight
    and dumps the returned list to `distances_main_{idx}.pkl` in wd_results.
    """
    scored = documentvecweight(joblib.load(file_path))
    target = os.path.join(wd_results, f'distances_main_{idx}.pkl')
    joblib.dump(scored, target)

# Main loop: process all preprocessed speech files
def main():
    """Run main_function on the four preprocessed speech files (parts 1-4)."""
    for part in range(1, 5):
        speech_file = os.path.join(
            data_preprocessed, f'preprocessed_speeches_indexed{part}_final.pkl'
        )
        main_function(speech_file, part)

if __name__ == "__main__":
    main()

# Merge all distance files into one df
DATA_temp = [
    os.path.join(wd_results, f'distances_main_{part}.pkl')
    for part in range(1, 5)
]

# Concatenate the rows of every distance file, in file order
tot = [row for distance_file in DATA_temp for row in joblib.load(distance_file)]

score_columns = ['filename', 'affect_d', 'cognition_d', 'score']
tot_df = pd.DataFrame(tot, columns=score_columns)
joblib.dump(tot_df, os.path.join(wd_results, 'distances_10epochs.pkl'))

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\results\\distances_10epochs.pkl']

In [60]:
# Show the first five scored speeches as a quick sanity check
print(tot_df.head(n=5))

          filename  affect_d  cognition_d     score
0  HTI_70_2015.txt  1.364583     1.170656  0.766168
1  PRY_58_2003.txt  0.784654     0.861974  1.067943
2  GMB_72_2017.txt  1.183515     1.094364  0.901559
3  SLV_04_1949.txt  1.309417     0.809819  0.580234
4  LBY_56_2001.txt  1.131063     0.935927  0.816614


In [61]:
# Load df_merged and merge with tot_df by filename
corpus_csv = os.path.join(data_c, "un_corpus_merged.csv")
un_corpus_merged = pd.read_csv(corpus_csv, sep=';', encoding='utf-8')

# Left join: every corpus row is kept, scores are attached where available
un_corpus_scored = pd.merge(un_corpus_merged, tot_df, on="filename", how="left")

# Persist the scored corpus both as a pickle and as a ';'-separated CSV
joblib.dump(un_corpus_scored, os.path.join(wd_results, "un_corpus_scored.pkl"))

scored_csv = os.path.join(wd_results, "un_corpus_scored.csv")
un_corpus_scored.to_csv(scored_csv, sep=';', index=False, encoding='utf-8')

In [62]:
# Print the text representation of the merged corpus (speech metadata plus
# the affect/cognition distances and score joined in above) for inspection.
print(un_corpus_scored)

              filename                                             speech  \
0      HTI_70_2015.txt  Mr. President, I would like to express my warm...   
1      PRY_58_2003.txt  ﻿Two hundred years after the first cry of free...   
2      GMB_72_2017.txt  With warm greetings to all members of the Gene...   
3      SLV_04_1949.txt  Mr. Castro stated that the election of General...   
4      LBY_56_2001.txt  ﻿At the\noutset, I would like to congratulate ...   
...                ...                                                ...   
10947  BIH_73_2018.txt  It is my honour to address the Assembly for th...   
10948  AFG_36_1981.txt  On behalf of the delegation of the Democratic ...   
10949  SDN_43_1988.txt  ﻿It gives me great pleasure to extend to you. ...   
10950  IRN_45_1990.txt  I should like to express my sincere congratula...   
10951  CHL_29_1974.txt  The delegation of Chile participates in this i...   

      country_code  year            country_name  speech_length_words  \
0 

In [63]:
# Count the speeches with and without an affect distance. A NaN appears when
# a speech received no score or found no match in the left join.
affect_missing = un_corpus_scored['affect_d'].isna()
nan_count = affect_missing.sum()
not_nan_count = (~affect_missing).sum()

print("Count where affect_d is NaN:", nan_count)
print("Count where affect_d is not NaN:", not_nan_count)

Count where affect_d is NaN: 0
Count where affect_d is not NaN: 10952
