# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 1: Model Training, Centroid Calculation & Speech Scoring

Ensure that script `0_data_creation` has been run successfully beforehand.

### Description:
- Uses `clean_speeches_indexed` for the sentence split (sentence splitting needs the raw text and cannot work on already tokenized data, so the function repeats the preprocessing steps)
- Train Word2Vec model
- Create centroids by multiplying each dictionary word's vector by its weighted frequency and averaging the results (one centroid for the affect dictionary, one for the cognition dictionary)
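- Score speeches (same weighting, repeated at document level): compute the cosine distance of each speech vector to the affect and cognition centroids, then the emotionality score as (2 - affect distance) / (2 - cognition distance), i.e. (1 + cosine similarity to affect) / (1 + cosine similarity to cognition)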
- Store everything as `un_corpus_scored`

In [2]:
# == Import libraries ==

import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# == Initialize NLP Tools ==

# Translation table that maps every ASCII punctuation character to None
translator = str.maketrans(dict.fromkeys(punctuation))
# POS tagger and stemmer used by the sentence preprocessing below
tagger = nltk.perceptron.PerceptronTagger()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stemmer = SnowballStemmer("english")

# == Set Working Directory ==

# --- Set base path to project root ---
# NOTE: the root is derived from the current working directory. Later cells call
# os.chdir(), so re-running this cell after them resolves to a different folder.
base_path = Path.cwd().parent  # project root
print(f"Project root set to: {base_path}")

# == Define Folder Paths ==

# Folders were already created in the script 0_data_creation
data_c = base_path.joinpath("data")
data_temp = data_c.joinpath("temp")
data_freq = data_c.joinpath("freq")
data_dict = data_c.joinpath("dictionaries")
data_sent = data_c.joinpath("sentences")
data_preprocessed = data_c.joinpath("preprocessed")
data_results = data_c.joinpath("results")
data_models = data_c.joinpath("models")
data_stopwords = data_c.joinpath("stopwords")

# Load resources produced by the previous script
stopwords = joblib.load(data_stopwords.joinpath("stopwords_procedural_words.pkl"))
word_counts = joblib.load(data_freq.joinpath("word_counts.pkl"))
word_counts_weighted = joblib.load(data_freq.joinpath("word_counts_weighted.pkl"))
affect_dic = joblib.load(data_dict.joinpath('dictionary_affect.pkl'))
cognition_dic = joblib.load(data_dict.joinpath('dictionary_cognition.pkl'))

Project root set to: C:\Users\sarah\Downloads\TESTRUN


In [3]:
os.chdir(data_temp)
# Absolute paths (as str) of the four cleaned speech chunks
cleaned_files = [
    str(data_temp / f'clean_speeches_indexed{part}.pkl')
    for part in range(1, 5)
]

___

### Sentence Split

In [6]:
os.chdir(data_temp)

# Function to split cleaned speeches (clean_speeches) into sentences, tokenize, clean, tag, stem, filter, and save them.

def extract_sentences(dataname, seed=None):
    """
    Preprocess speeches into tokenized, cleaned sentences and save them.

    Steps:
    - Load the records stored in `dataname` and split each speech text
      (element 1 of every record) into sentences
    - Drop single-word sentences, then tokenize/lowercase with
      gensim's `simple_preprocess`
    - Remove digit-only tokens and tokens shorter than three characters
    - POS-tag and keep only tokens whose tag starts with N, V or J
      (presumably nouns, verbs and adjectives)
    - Stem, remove stopwords, keep tokens with `word_counts` frequency >= 10
      (tokens missing from `word_counts` count as frequency 0)
    - Drop sentences with fewer than two remaining tokens, shuffle, and save
      next to the input file with `clean_speeches_` replaced by `sentences_`

    Parameters
    ----------
    dataname : str or os.PathLike
        Path of the pickled cleaned-speeches file.
    seed : int, optional
        If given, the sentence order is shuffled with a dedicated
        `random.Random(seed)` so the output is reproducible. With the
        default `None` the global random state is used, as before.

    Returns
    -------
    None
        The result is written to disk; progress is printed.
    """
    from random import Random

    # str.replace is needed below; a Path would otherwise hit Path.replace (a file rename)
    dataname = os.fspath(dataname)

    speeches = [record[1] for record in joblib.load(dataname)]

    sentences = []
    for doc in speeches:
        sentences.extend(sent_tokenize(doc))

    sentences = [item for item in sentences if len(item.split()) > 1]
    sentences = [gensim.utils.simple_preprocess(item) for item in sentences]

    sentences = [
        [tok for tok in s if not tok.isdigit() and len(tok) > 2]
        for s in sentences
    ]

    kept_tag_prefixes = ('N', 'V', 'J')
    sentences = [tagger.tag(s) for s in sentences]
    sentences = [
        [word for word, tag in s if tag.startswith(kept_tag_prefixes)]
        for s in sentences
    ]

    sentences = [[stemmer.stem(word) for word in s] for s in sentences]

    # Set membership keeps the stopword test O(1) even if `stopwords` was stored as a list
    stopword_set = set(stopwords)
    # .get avoids a KeyError for stems that never made it into `word_counts`
    sentences = [
        [word for word in s
         if word not in stopword_set and word_counts.get(word, 0) >= 10]
        for s in sentences
    ]

    dropped_count = sum(1 for s in sentences if len(s) <= 1)
    print(f"Number of very short sentences being dropped: {dropped_count}")

    sentences = [s for s in sentences if len(s) > 1]
    if seed is None:
        shuffle(sentences)
    else:
        Random(seed).shuffle(sentences)

    lab = dataname.replace('clean_speeches_', 'sentences_').replace('_.pkl', '.pkl')
    print(f'{dataname} processed')
    joblib.dump(sentences, lab)

    unique_tokens = set(token for s in sentences for token in s)
    print(f"Unique tokens: {len(unique_tokens)}")

    # Print preview of first 5 processed sentences
    print("Example sentences (first 5):")
    for s in sentences[:5]:
        print(s)


    print(f'{lab} saved')

for fname in cleaned_files:
    extract_sentences(fname)

Number of very short sentences being dropped: 86798
C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed1.pkl processed
Unique tokens: 8157
Example sentences (first 5):
['choos', 'hostil']
['outstand', 'alli', 'charter']
['accumul', 'enorm', 'multilater']
['aspect', 'scene', 'viet', 'nam']
['colombia', 'pleas', 'expert', 'expert', 'colombia', 'likewis', 'colombian', 'connexion', 'invest']
C:\Users\sarah\Downloads\TESTRUN\data\temp\sentences_indexed1.pkl saved
Number of very short sentences being dropped: 65694
C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed2.pkl processed
Unique tokens: 8613
Example sentences (first 5):
['children', 'celebr', 'anniversari', 'demonstr', 'multilater', 'contribut']
['forum', 'sprung', 'spontan', 'generat']
['afghanistan', 'kampuchean', 'outright', 'interfer', 'kampuchea']
['nigeria', 'chad']
['cold', 'certainti', 'instabl', 'uneas', 'uncertainti', 'endeavour']
C:\Users\sarah\Downloads\TESTRUN\data\temp\sentences_indexed2.pkl

In [7]:
# Pick the first file to see how the sentence split looks like
# (reloads the pickle written by extract_sentences for chunk 1)
file_path = os.path.join(data_temp, 'sentences_indexed1.pkl')

sentences = joblib.load(file_path)

# Each entry is one sentence, stored as a list of stemmed tokens
print("Example sentences (first 5):")
for s in sentences[:5]:
    print(s)

Example sentences (first 5):
['choos', 'hostil']
['outstand', 'alli', 'charter']
['accumul', 'enorm', 'multilater']
['aspect', 'scene', 'viet', 'nam']
['colombia', 'pleas', 'expert', 'expert', 'colombia', 'likewis', 'colombian', 'connexion', 'invest']


In [8]:
# Absolute paths of the four sentence files written by extract_sentences
sentences_files = [
    os.path.join(data_temp, f'sentences_indexed{part}.pkl')
    for part in range(1, 5)
]

In [9]:
# == Get sum of unique tokens ==
# Walk over every sentence file once, collecting the vocabulary and the token total
all_unique_tokens = set()
total_token_count = 0

for dataname in sentences_files:
    data = joblib.load(dataname)
    for sentence in data:
        all_unique_tokens.update(sentence)
        total_token_count += len(sentence)

print(f"Total unique tokens across all files: {len(all_unique_tokens)}")
print(f"Total number of tokens: {total_token_count}")

# NOTE: `sentences_files` is intentionally NOT redefined with bare file names here.
# The absolute paths defined earlier point to the same files and keep the training
# cell independent of the current working directory.

Total unique tokens across all files: 9453
Total number of tokens): 4286666


---

### Train Word2Vec

In [71]:
# Make data/temp the working directory for the training cell below
os.chdir(data_temp)

In [72]:
# Pool the sentences of every file into a single training corpus
dataset = []

for dataname in sentences_files:
    data = joblib.load(dataname)
    dataset += data

# === Model training ===
# seed and workers=1 are fixed together: gensim only promises repeatable runs
# with a single worker thread (and a fixed PYTHONHASHSEED).
w2v = Word2Vec(
    sentences=dataset,
    vector_size=300,      # Dimension of the vector
    window=8,             # Context window size
    min_count=10,         # Minimum word count
    sample=1e-3,          # Downsample setting for frequent words
    epochs=10,            # Number of iterations over the corpus
    workers=1,            # Number of threads
    seed=100,
)

# Make sure the per-vector norms are available on the keyed vectors
w2v.wv.fill_norms()

# Save model
data_models.mkdir(parents=True, exist_ok=True)
w2v.save(str(data_models.joinpath('w2v-vectors_8_300.pkl')))

In [73]:
# Reload the trained model from disk so the following cells can run without retraining
w2v = Word2Vec.load(str(data_models / "w2v-vectors_8_300.pkl"))
# Shortcut to the keyed vectors of the model
word_vectors = w2v.wv

---

### Calculate Centroids

In [76]:
# == Calculation ==
def findcentroid(text, model, weights=None):
    """
    Compute the weighted centroid of a word list in the Word2Vec space.

    Every word of `text` that is present both in the model vocabulary and in
    the weight mapping contributes its vector multiplied by its weight; the
    centroid is the element-wise mean of these weighted vectors.

    Parameters
    ----------
    text : iterable of str
        Words (e.g. the entries of one dictionary) to average.
    model : object with a `.wv` attribute
        Trained Word2Vec model; `.wv` must support `in` and `[]` by word.
    weights : mapping of str to float, optional
        Per-word weights. Defaults to the notebook-level `word_counts_weighted`.

    Returns
    -------
    numpy.ndarray
        One centroid vector with the model's vector size.

    Raises
    ------
    ValueError
        If no word of `text` is found in both the model and the weights.
        Previously this case silently produced NaN.
    """
    if weights is None:
        weights = word_counts_weighted

    vecs = [
        model.wv[w] * weights[w]
        for w in text
        if w in model.wv and w in weights
    ]
    if not vecs:
        raise ValueError(
            "findcentroid: none of the given words is present in both the "
            "model vocabulary and the frequency weights"
        )
    return np.mean(vecs, axis=0)


# One centroid per dictionary, both in the space of the trained model
affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

os.chdir(data_c)
# Write to an explicit folder instead of a cwd-relative one, and create it if
# needed (same pattern as for the models folder) so the dump cannot fail on a
# missing directory.
data_centroids = data_c / 'centroids'
data_centroids.mkdir(parents=True, exist_ok=True)
joblib.dump(affect_centroid, data_centroids / 'affect_centroid.pkl')
joblib.dump(cog_centroid, data_centroids / 'cog_centroid.pkl')

# == Overview Vectors ==
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shape and stats
print("\nShape of affect centroid:", affect_centroid.shape)
print("Shape of cognition centroid:", cog_centroid.shape)

Affect centroid vector:
 [ 0.13069108 -0.04834932 -0.15180138 -0.08613192 -0.08944192 -0.10178063
  0.04531904 -0.03997415 -0.00702028  0.03948879 -0.12313504  0.1284162
 -0.09471979 -0.15128885 -0.01847997  0.02626847 -0.02801008 -0.015148
 -0.03062634  0.15429519 -0.08757771 -0.1254819  -0.00191349 -0.01670575
 -0.04033746  0.05174775 -0.07018315 -0.0884062  -0.11628897  0.27297094
 -0.02634064 -0.0077189  -0.02253625  0.0705168   0.1590486  -0.11155611
 -0.04112092  0.0123024  -0.12068146  0.15205382  0.02102724 -0.09592358
 -0.08436386  0.03344292 -0.09093704 -0.06677414  0.18301351  0.10604291
 -0.01274419 -0.02583208 -0.03017285  0.02446513 -0.16391745  0.00725619
  0.1008782  -0.0144089   0.18961903 -0.1391073  -0.0154625   0.29413262
 -0.11091603  0.00592328  0.05586404 -0.1499145   0.04411521 -0.07590582
  0.19943576 -0.23402527 -0.02421452 -0.13617899 -0.16322477  0.03454233
  0.03855813  0.01439126  0.04541036 -0.00934297  0.10430469 -0.02472103
 -0.03293908  0.03978493  0.1

---

### Emotionality Scoring

In [79]:
os.chdir(data_preprocessed)

# Load preprocessed speech data
# NOTE(review): the scoring functions below reload these files from disk themselves;
# confirm whether this in-memory copy is still needed.

preprocessed_final_files = [
    joblib.load(os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{part}_final.pkl'))
    for part in range(1, 5)
]

In [80]:
# Function to compute weighted document vectors and derive affective/cognitive distances and scores

def documentvecweight(lista, model=None, weights=None, affect=None, cognition=None):
    """
    Score documents by their position relative to the affect and cognition centroids.

    For every record with a non-empty token list, the token vectors are
    multiplied by their frequency weight and averaged into one document
    vector. The cosine *distance* (1 - cosine similarity) of that vector to
    each centroid is computed, and the emotionality score is

        (1 + 1 - affect_distance) / (1 + 1 - cognition_distance)

    i.e. (1 + similarity to affect) / (1 + similarity to cognition).

    Parameters
    ----------
    lista : iterable
        Records whose element 0 is the document id and element 1 the token list.
        Records with an empty token list are skipped.
    model, weights, affect, cognition : optional
        Overrides for the notebook-level `w2v`, `word_counts_weighted`,
        `affect_centroid` and `cog_centroid`. `None` uses those globals.

    Returns
    -------
    list of [doc_id, affect_distance, cognition_distance, score]
        All three numbers are NaN when no token of the document is known to
        both the model and the weights.
    """
    model = w2v if model is None else model
    weights = word_counts_weighted if weights is None else weights
    affect = affect_centroid if affect is None else affect
    cognition = cog_centroid if cognition is None else cognition

    out = []
    for s in lista:
        doc_id, tokens = s[0], s[1]
        if len(tokens) == 0:
            continue

        # Weighted word vectors for tokens known to the model AND the weight table
        # (the weight check mirrors findcentroid and avoids a KeyError)
        vecs = [model.wv[w] * weights[w] for w in tokens if w in model.wv and w in weights]
        if len(vecs) == 0:
            a = np.nan
            c = np.nan
            score = np.nan
        else:
            # Mean vector of the speech
            v = np.mean(vecs, axis=0)
            # Cosine distance to affective centroid
            a = cosine(v, affect)
            # Cosine distance to cognitive centroid
            c = cosine(v, cognition)
            score = (1 + 1 - a) / (1 + 1 - c)
        out.append([doc_id, a, c, score])
    return out


def main_function(file_path, idx):
    """
    Score one preprocessed speech file and store the result.

    Loads `file_path`, passes its content to `documentvecweight` and dumps the
    returned list to `distances_main_{idx}.pkl` inside `data_results`.
    """
    scored = documentvecweight(joblib.load(file_path))
    target = os.path.join(data_results, f'distances_main_{idx}.pkl')
    joblib.dump(scored, target)

# Main loop: process all preprocessed speech files
def main():
    """Run `main_function` on the four preprocessed speech files, numbered 1 to 4."""
    files = []
    for number in range(1, 5):
        files.append(
            os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{number}_final.pkl')
        )
    for position, path in enumerate(files):
        main_function(path, position + 1)

if __name__ == "__main__":
    main()

# Merge all distance files into one df
DATA_temp = [os.path.join(data_results, f'distances_main_{i}.pkl') for i in range(1, 5)]

# Concatenate the per-file score lists in file order
tot = []
for dataname in DATA_temp:
    d = joblib.load(dataname)
    tot.extend(d)

tot_df = pd.DataFrame(tot, columns=['filename', 'affect_d', 'cognition_d', 'score'])
joblib.dump(tot_df, os.path.join(data_results, 'distances_10epochs.pkl'))

['C:\\Users\\sarah\\Downloads\\TESTRUN\\data\\results\\distances_10epochs.pkl']

In [81]:
# Quick look at the first scored speeches (distances to both centroids and the score)
print(tot_df.head())

          filename  affect_d  cognition_d     score
0  ARG_01_1946.txt  0.959585     0.624323  0.756293
1  AUS_01_1946.txt  1.265239     0.605132  0.526760
2  BEL_01_1946.txt  1.170130     0.686274  0.631692
3  BLR_01_1946.txt  0.798792     0.801597  1.002340
4  BOL_01_1946.txt  0.516098     0.570015  1.037704


In [82]:
# Load un_corpus_cleaned_final and merge with tot_df by filename
# (left join: every speech of the cleaned corpus is kept, unscored ones get NaN)
un_corpus_cleaned_final = pd.read_csv(
    os.path.join(data_preprocessed, "un_corpus_cleaned_final.csv"),
    sep=';',
    encoding='utf-8',
)
un_corpus_scored = pd.merge(un_corpus_cleaned_final, tot_df, on="filename", how="left")

# Persist the scored corpus both as pickle and as semicolon-separated CSV
joblib.dump(un_corpus_scored, os.path.join(data_results, "un_corpus_scored.pkl"))

un_corpus_scored.to_csv(os.path.join(data_results, "un_corpus_scored.csv"), sep=';', index=False, encoding='utf-8')

In [83]:
# Text preview of the merged corpus (corpus columns plus the score columns)
print(un_corpus_scored)

              filename                                             speech  \
0      ARG_01_1946.txt  At the resumption of the first session of the ...   
1      AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2      BEL_01_1946.txt  The principal organs of the United Nations hav...   
3      BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4      BOL_01_1946.txt  Coming to this platform where so many distingu...   
...                ...                                                ...   
10947  WSM_79_2024.txt  Excellencies, I extend my congratulations to H...   
10948  YEM_79_2024.txt  Your Majesties, Excellencies, and Highnesses, ...   
10949  ZAF_79_2024.txt  President of the 79th Session of the UN Genera...   
10950  ZMB_79_2024.txt  YOUR EXCELLENCY PHILEMON YANG, PRESIDENT OF TH...   
10951  ZWE_79_2024.txt  Your Excellency, Mr. Philemon Yang, President ...   

      country_code  year  country_name  speech_length_words  \
0           

In [84]:
# Count where score is NaN
# (speeches that received no score after the left merge)
nan_count = un_corpus_scored['score'].isnull().sum()

print("Count where score is NaN:", nan_count)

Count where score is NaN: 0
