# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Additional Analysis: Different Calculation of Weighted Frequencies - Model Traning, Calculation Centroids & Speech Scoring

### Description

In the replication package, weighted frequencies are calculated on the full preprocessed corpus (35,009 unique words; 4,500,778 tokens), while the embedding corpus drops words occurring fewer than 10 times (9,453 unique words; 4,286,666 tokens). This script examines whether calculating weighted frequencies after removing these low-frequency words makes any difference. The sentence split from the normal script is used as this does not change anything regarding the weighted frequencies.

In [12]:
# == Import libraries for data processing and NLP ==

import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation) 
tagger = nltk.perceptron.PerceptronTagger()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stemmer = SnowballStemmer("english")

# === Set Working Directory ===

# --- Set base path to project root ---
base_path = Path.cwd().parents[2]  # project root
print(f"Project root set to: {base_path}")

# --- Paths ---
data_c = base_path / "data"

# === Define Folder Paths ===

data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
data_results = data_c / "results"
data_models = data_c / "models" 
data_stopwords = data_c / "stopwords"

# === Upload ressources ===
stopwords = joblib.load(data_stopwords / "stopwords_procedural_words.pkl")         
word_counts = joblib.load(data_freq / "removed_lowfreq_words_word_counts.pkl")
word_counts_weighted = joblib.load(data_freq / "removed_lowfreq_words_word_counts_weighted.pkl")
affect_dic = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition_dic = joblib.load(data_dict / 'dictionary_cognition.pkl') 

Project root set to: C:\Users\sarah\Downloads\TESTRUN


___

### Sentence Split

In [15]:
sentences_files = [
    os.path.join(data_temp, 'sentences_indexed1.pkl'),
    os.path.join(data_temp, 'sentences_indexed2.pkl'),
    os.path.join(data_temp, 'sentences_indexed3.pkl'),
    os.path.join(data_temp, 'sentences_indexed4.pkl')
]

---

### Train Word2Vec

In [18]:
w2v = Word2Vec.load(str(data_models / "w2v-vectors_8_300.pkl"))
word_vectors = w2v.wv

---

### Calculate Centroids

In [21]:
# == Calculation ==
def findcentroid(text, model):
    vecs = [model.wv[w] * word_counts_weighted[w] 
        for w in text 
        if w in model.wv and w in word_counts_weighted]
    vecs = [v for v in vecs if len(v) > 0]
    centroid = np.mean(vecs, axis=0)
    #centroid = centroid.reshape(1, -1)
    return centroid


affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

os.chdir(data_c)
joblib.dump(affect_centroid, 'centroids/removed_lowfreq_words_affect_centroid.pkl')
joblib.dump(cog_centroid, 'centroids/removed_lowfreq_words_cog_centroid.pkl')

# == Overview Vectors ==
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shape and stats
print("\nShape of affect centroid:", affect_centroid.shape)
print("Shape of cognition centroid:", cog_centroid.shape)

Affect centroid vector:
 [ 0.13064627 -0.04827924 -0.15153086 -0.08612831 -0.08926757 -0.10166314
  0.04530697 -0.03991153 -0.00713518  0.03938422 -0.12288937  0.12812537
 -0.09471986 -0.15114433 -0.01842675  0.02643549 -0.02811609 -0.01509959
 -0.03058211  0.15414318 -0.08748648 -0.12533641 -0.00183108 -0.0165664
 -0.04048407  0.05171427 -0.07015131 -0.08836247 -0.11619954  0.27276504
 -0.02632928 -0.00770855 -0.02262908  0.07029717  0.15877137 -0.11152443
 -0.04099195  0.01238179 -0.1206511   0.15178439  0.02092024 -0.09568062
 -0.08418883  0.03344161 -0.09072608 -0.06660308  0.18288039  0.10588247
 -0.01280115 -0.02577296 -0.03012501  0.02465636 -0.16379587  0.00740187
  0.10084286 -0.01439868  0.18934949 -0.13898978 -0.01534871  0.29372498
 -0.11067255  0.00598936  0.05583415 -0.14968625  0.04392968 -0.07589418
  0.1992344  -0.23371579 -0.02409786 -0.13608864 -0.16311853  0.03449263
  0.03842949  0.01427503  0.04528952 -0.00929075  0.10415114 -0.02463074
 -0.03293241  0.03970418  0

---

### Emotionality Scoring

In [31]:
os.chdir(data_preprocessed)

# Load preprocessed speech data 

preprocessed_final_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4_final.pkl')
]

In [33]:
# Function to compute weighted document vectors and derive affective/cognitive distances and scores

def documentvecweight(lista):
    out = []
    lista = [i for i in lista if len(i[1]) > 0]
    for s in lista:
        # Compute weighted word vectors for each token present in the Word2Vec model
        vecs = [w2v.wv[w] * word_counts_weighted[w] for w in s[1] if w in w2v.wv]
        if len(vecs) == 0:
            a = np.nan
            c = np.nan
            score = np.nan
        else:
            # Compute mean vector for each speech
            v = np.mean(vecs, axis=0)
             # Cosine distance to affective centroid
            a = cosine(v, affect_centroid)
            # Cosine distance to cognitive centroid
            c = cosine(v, cog_centroid)
            score = (1 + 1 - a) / (1 + 1 - c)
        out.append([s[0], a, c, score])
    return out


def main_function(file_path, idx):
    dataset = joblib.load(file_path)
    data = documentvecweight(dataset)
    lab = os.path.join(data_results, f'removed_lowfreq_words_distances_main_{idx}.pkl')
    joblib.dump(data, lab)

# Main loop: process all preprocessed speech files
def main():
    files = [
        os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{i+1}_final.pkl') #Changed!
        for i in range(4)
    ]
    for i, f in enumerate(files, start=1):
        main_function(f, i)

if __name__ == "__main__":
    main()

# Merge all distance files into one df
DATA_temp = [os.path.join(data_results, f'removed_lowfreq_words_distances_main_{i+1}.pkl') for i in range(4)]

tot = []
for dataname in DATA_temp:
    d = joblib.load(dataname)
    tot += d

tot_df = pd.DataFrame(tot, columns=['filename', 'affect_d', 'cognition_d', 'score'])
joblib.dump(tot_df, os.path.join(data_results, 'removed_lowfreq_words_distances_10epochs.pkl'))

['C:\\Users\\sarah\\Downloads\\TESTRUN\\data\\results\\removed_lowfreq_words_distances_10epochs.pkl']

In [34]:
print(tot_df.head())

          filename  affect_d  cognition_d     score
0  ARG_01_1946.txt  0.958573     0.624628  0.757196
1  AUS_01_1946.txt  1.264760     0.605460  0.527228
2  BEL_01_1946.txt  1.169188     0.686573  0.632552
3  BLR_01_1946.txt  0.797913     0.801657  1.003124
4  BOL_01_1946.txt  0.515044     0.569711  1.038221


In [43]:
un_corpus_cleaned_final = pd.read_csv(os.path.join(data_preprocessed, "un_corpus_cleaned_final.csv"), sep=';', encoding='utf-8') 
un_corpus_scored = un_corpus_cleaned_final.merge(tot_df, on="filename", how="left")

joblib.dump(un_corpus_scored, os.path.join(data_results, "removed_lowfreq_words_un_corpus_scored.pkl"))

un_corpus_scored.to_csv(
    os.path.join(data_results, "removed_lowfreq_words_un_corpus_scored.csv"),
    sep=';', 
    index=False, 
    encoding='utf-8'
)

In [45]:
print(un_corpus_scored)

              filename                                             speech  \
0      ARG_01_1946.txt  At the resumption of the first session of the ...   
1      AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2      BEL_01_1946.txt  The principal organs of the United Nations hav...   
3      BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4      BOL_01_1946.txt  Coming to this platform where so many distingu...   
...                ...                                                ...   
10947  WSM_79_2024.txt  Excellencies, I extend my congratulations to H...   
10948  YEM_79_2024.txt  Your Majesties, Excellencies, and Highnesses, ...   
10949  ZAF_79_2024.txt  President of the 79th Session of the UN Genera...   
10950  ZMB_79_2024.txt  YOUR EXCELLENCY PHILEMON YANG, PRESIDENT OF TH...   
10951  ZWE_79_2024.txt  Your Excellency, Mr. Philemon Yang, President ...   

      country_code  year  country_name  speech_length_words  \
0           

In [47]:
# Count where affect_d is NaN
nan_count = un_corpus_scored['affect_d'].isna().sum()

print("Count where affect_d is NaN:", nan_count)

Count where affect_d is NaN: 0
