# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 1: Model Training, Centroid Calculation & Speech Scoring
### Author: Sarah Franzen

In [1]:
# == Import libraries for data processing and NLP ==

import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# === Initialize NLP Tools ===

# Translation table that deletes every character listed in string.punctuation
translator = str.maketrans('', '', punctuation)
# POS tagger used by extract_sentences()
tagger = nltk.perceptron.PerceptronTagger()
# spaCy pipeline with the parser and NER components switched off
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# English Snowball stemmer used by extract_sentences()
stemmer = SnowballStemmer("english")

# === Set Working Directory ===

# The project root defaults to the author's local folder. To run the notebook
# on another machine, set the environment variable UN_SPEECHES_WD to your own
# project folder before starting Jupyter (no need to edit this cell).
DEFAULT_WD = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
wd = Path(os.environ.get("UN_SPEECHES_WD", DEFAULT_WD))

# Fail early with a clear message instead of an opaque error further down.
if not wd.is_dir():
    raise FileNotFoundError(
        f"Working directory does not exist: {wd}. "
        "Set the UN_SPEECHES_WD environment variable or adjust DEFAULT_WD."
    )

# === Create new Folders ===
additional_folders = ["results", "models"]

# Create/check folders directly in wd
for folder in additional_folders:
    folder_path = wd / folder
    folder_path.mkdir(exist_ok=True)
    print(f"Folder checked/created: {folder_path}")

# === Define Folder Paths ===

# Data folders are expected to exist already (created by the script 0_data_creation)
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
fig_dir = wd / "fig"

# Output folders (created by the loop above)
wd_results = wd / "results"
wd_models = wd / "models"

# === Load resources ===

# Stemmed spaCy stop words. The original replication package shipped its own
# "stopwords.pkl"; that file is deliberately not used here.
stopwords = joblib.load(data_c / "spacy_stopwords_stemmed.pkl")
# Token frequencies and the derived per-token weights
word_counts = joblib.load(data_freq / "word_counts.pkl")
word_counts_weighted = joblib.load(data_freq / "word_counts_weighted.pkl")
# TODO(replication): both dictionaries still come from the replication package;
# this must also be resolved in 0_data_creation.
affect_dic = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition_dic = joblib.load(data_dict / 'dictionary_cognition.pkl')

Folder checked/created: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\results
Folder checked/created: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\models


In [2]:
# Work from the temp folder and reference the four cleaned-speech chunks by
# their full path inside data_temp.
os.chdir(data_temp)
cleaned_files = [
    str(data_temp / chunk_name)
    for chunk_name in (
        'clean_speeches_indexed1.pkl',
        'clean_speeches_indexed2.pkl',
        'clean_speeches_indexed3.pkl',
        'clean_speeches_indexed4.pkl',
    )
]

___

### Sentence Split

In [5]:
# Switch the working directory to the sentence folder.
# NOTE(review): extract_sentences() is called below with the full paths stored
# in cleaned_files, and it derives its output path from that input path, so the
# sentence files are written next to the inputs (data_temp) rather than into
# this directory - confirm which location is intended.
os.chdir(data_sent)

# extract_sentences: split the cleaned speeches into sentences, then tokenize, clean, POS-tag, stem, filter, and save them.

def extract_sentences(dataname, seed=None):
    """Convert one file of cleaned speeches into shuffled, tokenised sentences.

    Processing steps, in order:
      1. load the pickle and keep the text (second element) of every entry,
      2. split each text into sentences with NLTK's ``sent_tokenize``,
      3. drop one-word sentences and tokenise with gensim's ``simple_preprocess``,
      4. drop purely numeric tokens and tokens of fewer than three characters,
      5. POS-tag and keep only tokens whose tag starts with N, V or J
         (noun, verb and adjective tags),
      6. stem, remove stop words and remove tokens seen fewer than 10 times
         according to ``word_counts``,
      7. drop sentences with fewer than two remaining tokens, shuffle, and save.

    The result is written next to the input file: the file name has
    ``clean_speeches_`` replaced by ``sentences_``.

    Parameters
    ----------
    dataname : str or os.PathLike
        Path of a pickle holding a sequence of ``(id, text)`` entries.
    seed : int, optional
        If given, the sentence order is shuffled with a private random
        generator seeded with this value, which makes the output reproducible.
        If omitted, the global ``random`` state is used (original behaviour).

    Returns
    -------
    None
        The sentences are written to disk; progress is reported via ``print``.
    """
    import random  # local import: only needed for the optional seeded shuffle

    # Accept Path objects as well; Path.replace() would rename the file on disk.
    dataname = os.fspath(dataname)

    data = joblib.load(dataname)
    data = [a[1] for a in data]  # keep only text, no id

    sentences = []
    for doc in data:
        sentences += sent_tokenize(doc)  # use nltk's sent_tokenize here

    sentences = [item for item in sentences if len(item.split()) > 1]
    sentences = [gensim.utils.simple_preprocess(item) for item in sentences]

    sentences = [[a for a in s if not a.isdigit()] for s in sentences]
    sentences = [[a for a in s if len(a) > 2] for s in sentences]

    sentences = [tagger.tag(s) for s in sentences]
    sentences = [[i[0] for i in s if i[1].startswith(('N', 'V', 'J'))] for s in sentences]

    sentences = [[stemmer.stem(i) for i in s] for s in sentences]
    sentences = [[a for a in s if a not in stopwords] for s in sentences]
    # .get(): a stem missing from the frequency table counts as frequency 0
    # instead of raising KeyError when word_counts is a plain dict.
    sentences = [[a for a in s if word_counts.get(a, 0) >= 10] for s in sentences]

    dropped_count = sum(1 for s in sentences if len(s) <= 1)
    print(f"Number of very short sentences being dropped: {dropped_count}")

    sentences = [s for s in sentences if len(s) > 1]
    if seed is None:
        shuffle(sentences)
    else:
        random.Random(seed).shuffle(sentences)

    # Rename only the file name so that a folder whose name happens to contain
    # 'clean_speeches_' is never rewritten.
    folder, filename = os.path.split(dataname)
    lab = os.path.join(
        folder,
        filename.replace('clean_speeches_', 'sentences_').replace('_.pkl', '.pkl'),
    )
    print(f'{dataname} processed')
    joblib.dump(sentences, lab)

    unique_tokens = set(token for s in sentences for token in s)
    print(f"Unique tokens: {len(unique_tokens)}")

    # Print preview of first 5 processed sentences
    print("Example sentences (first 5):")
    for s in sentences[:5]:
        print(s)

    print(f'{lab} saved')

# Run the sentence extraction once for every cleaned-speech file in cleaned_files.
for fname in cleaned_files:
    extract_sentences(fname)

Number of very short sentences being dropped: 4244
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl processed
Unique tokens: 12330
Example sentences (first 5):
['excel', 'report', 'work', 'organ', 'submit', 'open', 'session', 'eloqu', 'point', 'view', 'congratul']
['deploy', 'montenegrin', 'soldier', 'polic', 'offic', 'intern', 'mission', 'testifi', 'readi', 'fulfil', 'intern', 'oblig', 'develop', 'relat', 'base', 'partnership']
['nepal', 'continu', 'believ', 'new', 'regim', 'meaning', 'recogn', 'entir', 'resourc', 'sea', 'belong', 'human', 'constitut', 'common', 'heritag', 'mankind']
['exacerb', 'destruct', 'ecosystem', 'biodivers', 'climat', 'chang', 'like', 'increas', 'poverti', 'diseas', 'lead', 'upsurg', 'climat', 'relat', 'migrat', 'compromis', 'futur', 'futur', 'generat']
['chines', 'threat', 'fact', 'alibi', 'western', 'colonialist', 'imperialist', 'enterpris', 'african', 'asian', 'latin', 'american', 'countri', 'pay', 'price']
C:\Users\sarah

In [40]:
# Sanity check: inspect the sentence file produced for the first chunk.
# The file is read from data_temp, which is where extract_sentences() saved it.
file_path = os.path.join(data_temp, 'sentences_indexed1.pkl')

# Load the pickled list of tokenised sentences
sentences = joblib.load(file_path)

# Show the first five sentences (each one is a list of stemmed tokens)
print("Example sentences (first 5):")
for s in sentences[:5]:
    print(s)

Example sentences (first 5):
['excel', 'report', 'work', 'organ', 'submit', 'open', 'session', 'eloqu', 'point', 'view', 'congratul']
['deploy', 'montenegrin', 'soldier', 'polic', 'offic', 'intern', 'mission', 'testifi', 'readi', 'fulfil', 'intern', 'oblig', 'develop', 'relat', 'base', 'partnership']
['nepal', 'continu', 'believ', 'new', 'regim', 'meaning', 'recogn', 'entir', 'resourc', 'sea', 'belong', 'human', 'constitut', 'common', 'heritag', 'mankind']
['exacerb', 'destruct', 'ecosystem', 'biodivers', 'climat', 'chang', 'like', 'increas', 'poverti', 'diseas', 'lead', 'upsurg', 'climat', 'relat', 'migrat', 'compromis', 'futur', 'futur', 'generat']
['chines', 'threat', 'fact', 'alibi', 'western', 'colonialist', 'imperialist', 'enterpris', 'african', 'asian', 'latin', 'american', 'countri', 'pay', 'price']


In [8]:
# Full paths of the four sentence files inside data_temp.
sentences_files = [
    os.path.join(data_temp, sentence_file)
    for sentence_file in (
        'sentences_indexed1.pkl',
        'sentences_indexed2.pkl',
        'sentences_indexed3.pkl',
        'sentences_indexed4.pkl',
    )
]

In [9]:
# == Count the unique tokens across all sentence files ==
all_unique_tokens = set()

for dataname in sentences_files:
    data = joblib.load(dataname)  # load list of tokenized sentences
    for sentence in data:
        all_unique_tokens.update(sentence)  # add tokens to the set

print(f"Total unique tokens across all files: {len(all_unique_tokens)}")

# Keep sentences_files as full paths. Bare file names would only resolve while
# the working directory happens to be data_temp, and an earlier cell switches
# it to data_sent, so the training cell would fail on a fresh run.
sentences_files = [
    os.path.join(data_temp, 'sentences_indexed1.pkl'),
    os.path.join(data_temp, 'sentences_indexed2.pkl'),
    os.path.join(data_temp, 'sentences_indexed3.pkl'),
    os.path.join(data_temp, 'sentences_indexed4.pkl'),
]

Total unique tokens across all files: 12498


---

### Train Word2Vec

In [12]:
# Gather the tokenised sentences of every file into one training corpus.
dataset = []
for dataname in sentences_files:
    dataset += joblib.load(dataname)

# === Model training ===
# Hyper-parameters are collected in one place so they are easy to review.
w2v_params = dict(
    vector_size=300,  # dimensionality of the word vectors
    window=8,         # context window size
    min_count=10,     # ignore words that occur fewer than 10 times
    workers=8,        # number of worker threads
    sample=1e-3,      # down-sampling threshold for frequent words
    epochs=10,        # passes over the corpus
)
w2v = Word2Vec(sentences=dataset, **w2v_params)

# Pre-compute the norm of every word vector so they are available to later
# similarity computations (this does not reduce memory usage).
w2v.wv.fill_norms()

# Save the model (the target folder is created if it does not exist yet)
wd_models.mkdir(parents=True, exist_ok=True)
w2v.save(str(wd_models / 'w2v-vectors_8_300.pkl'))

In [13]:
# Load the Word2Vec model from the path the training cell saved it to.
w2v = Word2Vec.load(str(wd_models / "w2v-vectors_8_300.pkl"))
# Shortcut to the model's word-vector store (word -> vector lookup).
word_vectors = w2v.wv

---

### Calculate Centroids

In [16]:
# == Calculation ==
def findcentroid(text, model, weights=None):
    """Return the weighted centroid of the word vectors of ``text``.

    Every word of ``text`` that is present in ``model.wv`` contributes its
    vector multiplied by its weight; the centroid is the element-wise mean of
    these weighted vectors. Words missing from ``model.wv`` are skipped.

    Parameters
    ----------
    text : iterable of str
        Words (e.g. a dictionary word list) to average.
    model : object with a ``wv`` attribute
        ``model.wv`` must support ``word in model.wv`` and ``model.wv[word]``
        (e.g. a trained gensim ``Word2Vec`` model).
    weights : mapping of str to float, optional
        Per-word weights. Defaults to the notebook-level
        ``word_counts_weighted`` table.

    Returns
    -------
    numpy.ndarray
        One-dimensional centroid vector.

    Raises
    ------
    ValueError
        If none of the words is present in ``model.wv``. Without this check
        ``np.mean`` of an empty list would return NaN, which would silently
        turn every downstream distance and score into NaN.
    """
    if weights is None:
        weights = word_counts_weighted

    vecs = [model.wv[w] * weights[w] for w in text if w in model.wv]
    if not vecs:
        raise ValueError(
            "Cannot compute a centroid: none of the given words is in the model vocabulary."
        )
    return np.mean(vecs, axis=0)


# Centroids of the affect and cognition dictionaries in the embedding space
affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

# Persist both centroids in <data>/centroids. The folder is created if it is
# missing, and the files are addressed by full path so that saving no longer
# depends on the current working directory.
os.chdir(data_c)
centroid_dir = data_c / 'centroids'
centroid_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(affect_centroid, str(centroid_dir / 'affect_centroid.pkl'))
joblib.dump(cog_centroid, str(centroid_dir / 'cog_centroid.pkl'))

# == Overview Vectors ==
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shape and stats
print("\nShape of affect centroid:", affect_centroid.shape)
print("Shape of cognition centroid:", cog_centroid.shape)

Affect centroid vector:
 [-1.02785610e-01 -3.07402238e-02 -8.21711794e-02  2.37924665e-01
  1.66343763e-01  9.10846964e-02  2.84846932e-01  3.62272322e-01
 -1.21231005e-02 -2.37350658e-01  1.68229178e-01  2.27351151e-02
 -2.01144174e-01 -5.34671731e-02 -1.88858196e-01  5.76852411e-02
  1.32371739e-01 -1.18452549e-01  2.54693478e-01  6.14628419e-02
 -9.50220898e-02  1.36210412e-01  1.69262066e-01 -1.75394211e-02
  7.08577633e-01 -7.31074139e-02 -4.02246922e-01  1.69142440e-01
 -1.37794316e-01 -1.69568956e-02 -2.75227338e-01 -6.70964569e-02
 -2.91324228e-01  1.91873133e-01  7.30456039e-02  1.27081797e-01
 -4.53694351e-02 -3.80070716e-01  2.32767358e-01 -1.37659488e-02
 -8.08558799e-03  1.33415192e-01  2.34713301e-01 -1.43392488e-01
  2.69387811e-01  4.90580872e-02 -2.19094664e-01 -1.67150795e-03
  5.07661700e-02  3.63590121e-01  4.79958160e-03 -1.55586442e-02
 -7.94477463e-02  2.31407508e-01  1.23839326e-01  2.73110747e-01
  3.45264286e-01  4.84697483e-02  2.22315222e-01  3.27853709e-01


---

### Emotionality Scoring

In [19]:
# Make data_preprocessed the current working directory
os.chdir(data_preprocessed)

# Load the four preprocessed speech chunks into memory, in order.
# TODO: confirm that the *_final.pkl files are the correct input for scoring.
preprocessed_final_files = [
    joblib.load(os.path.join(data_preprocessed, chunk_file))
    for chunk_file in (
        'preprocessed_speeches_indexed1_final.pkl',
        'preprocessed_speeches_indexed2_final.pkl',
        'preprocessed_speeches_indexed3_final.pkl',
        'preprocessed_speeches_indexed4_final.pkl',
    )
]


In [20]:
# Function to compute weighted document vectors and derive affective/cognitive distances and scores

def documentvecweight(lista):
    """Score every document against the affect and cognition centroids.

    ``lista`` holds entries whose first element is a document id and whose
    second element is the document's token list. Entries with an empty token
    list are skipped entirely. For the others, the document vector is the mean
    of the weighted vectors (``w2v.wv[token] * word_counts_weighted[token]``)
    of all tokens known to the Word2Vec model.

    Returns a list of ``[id, affect_distance, cognition_distance, score]``
    rows, where the distances are cosine distances to ``affect_centroid`` and
    ``cog_centroid`` and ``score = (2 - affect_distance) / (2 - cognition_distance)``.
    Documents without any in-vocabulary token get NaN in all three columns.
    """
    rows = []
    for entry in lista:
        doc_id, tokens = entry[0], entry[1]
        if len(tokens) == 0:
            # Empty documents produce no output row at all.
            continue

        # Weighted vectors of the tokens the model knows
        weighted = [
            w2v.wv[tok] * word_counts_weighted[tok]
            for tok in tokens
            if tok in w2v.wv
        ]
        if len(weighted) == 0:
            rows.append([doc_id, np.nan, np.nan, np.nan])
            continue

        # Mean vector of the document
        doc_vec = np.mean(weighted, axis=0)
        # Cosine distances to the two centroids
        affect_dist = cosine(doc_vec, affect_centroid)
        cog_dist = cosine(doc_vec, cog_centroid)
        # Ratio of the shifted similarities; 2 - distance equals 1 + cosine similarity
        ratio = (1 + 1 - affect_dist) / (1 + 1 - cog_dist)
        rows.append([doc_id, affect_dist, cog_dist, ratio])
    return rows


def main_function(file_path, idx):
    """Score one preprocessed speech file and store the result.

    Loads the pickle at ``file_path``, scores it with ``documentvecweight`` and
    writes the rows to ``distances_main_<idx>.pkl`` inside ``wd_results``.
    """
    scored_rows = documentvecweight(joblib.load(file_path))
    joblib.dump(scored_rows, os.path.join(wd_results, f'distances_main_{idx}.pkl'))

# Main loop: process all preprocessed speech files
def main():
    """Score the four preprocessed speech files, numbering the outputs 1 to 4."""
    for i in range(4):
        speech_file = os.path.join(
            data_preprocessed, f'preprocessed_speeches_indexed{i+1}_final.pkl'
        )
        main_function(speech_file, i + 1)

# Score all files (the guard is always true when the cell runs in a notebook)
if __name__ == "__main__":
    main()

# Paths of the four per-file distance tables written by main()
DATA_temp = [os.path.join(wd_results, f'distances_main_{i+1}.pkl') for i in range(4)]

# Concatenate the rows of all files into one list
tot = []
for dataname in DATA_temp:
    tot.extend(joblib.load(dataname))

# One row per scored speech; save the combined table
tot_df = pd.DataFrame(tot, columns=['filename', 'affect_d', 'cognition_d', 'score'])
joblib.dump(tot_df, os.path.join(wd_results, 'distances_10epochs.pkl'))

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\results\\distances_10epochs.pkl']

In [21]:
# Sanity check: print the first five rows of the combined distance table
print(tot_df.head())

          filename  affect_d  cognition_d     score
0  FIN_68_2013.txt  1.437249     1.099765  0.625116
1  JAM_30_1975.txt  1.154664     0.829715  0.722333
2  CRI_36_1981.txt  1.403022     1.015460  0.606352
3  IND_28_1973.txt  1.506429     1.039613  0.513929
4  COG_70_2015.txt  1.225038     1.215136  0.987384


In [22]:
# Read the speech-level corpus table (semicolon-separated, UTF-8)
corpus_csv_path = os.path.join(data_c, "un_corpus_merged.csv")
un_corpus_merged = pd.read_csv(corpus_csv_path, sep=';', encoding='utf-8')

# Left join on filename: every corpus row is kept, and speeches without a
# score get NaN in the distance/score columns.
un_corpus_scored = pd.merge(un_corpus_merged, tot_df, on="filename", how="left")

# Store the scored corpus as a pickle ...
joblib.dump(un_corpus_scored, os.path.join(wd_results, "un_corpus_scored.pkl"))

# ... and as a semicolon-separated CSV without the index column
scored_csv_path = os.path.join(wd_results, "un_corpus_scored.csv")
un_corpus_scored.to_csv(scored_csv_path, sep=';', index=False, encoding='utf-8')

In [23]:
# Print a truncated text preview of the scored corpus
print(un_corpus_scored)

              filename                                             speech  \
0      FIN_68_2013.txt  We have convened here \nin New York at a time ...   
1      JAM_30_1975.txt  152.\t Let me begin by expressing, on behalf o...   
2      CRI_36_1981.txt  I am happy to join previous speakers in congra...   
3      IND_28_1973.txt  ï»¿122.\tMr. President, I bring to you and to al...   
4      COG_70_2015.txt  His Excellency Mr. Denis Sassou Nguesso, Presi...   
...                ...                                                ...   
10947  LKA_72_2017.txt  I am very pleased to be able to congratulate t...   
10948  PNG_40_1985.txt  My delegation and I would like to congratulate...   
10949  COL_79_2024.txt  *"It is the hour of the peoples. If government...   
10950  CHN_04_1949.txt  Mr. Tsiang observed that the general debate at...   
10951  SGP_41_1986.txt   We meet this year under inauspicious circumst...   

      country_code  year      country_name  speech_length_words  \
0     

In [24]:
# Boolean mask that is True for every speech without an affect distance
affect_missing = un_corpus_scored['affect_d'].isna()

# Number of speeches with and without a missing affect distance
nan_count = affect_missing.sum()
not_nan_count = (~affect_missing).sum()

print("Count where affect_d is NaN:", nan_count)
print("Count where affect_d is not NaN:", not_nan_count)

Count where affect_d is NaN: 0
Count where affect_d is not NaN: 10952
