# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Additional Analysis: Different individualised Stopwords - Data Creation

### Description: 

In the replication of Gennaro & Ash, their stopwords list was used. Since stopwords should be adapted for a different corpus, this script tests whether using a custom stopword list changes the results. As this does not affect the already cleaned corpus, the script starts with the clean corpus and adjusts the stopwords for preprocessing.



___

In [4]:
# == Import libraries for data processing and NLP ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path
from rapidfuzz import process, fuzz

# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
stemmer = SnowballStemmer("english")
tagger = nltk.perceptron.PerceptronTagger()

In [5]:
# === Set Working Directory and create folder structure ===

# --- Set base path to project root ---
base_path = Path.cwd().parents[2]  # project root
print(f"Project root set to: {base_path}")

# --- Paths ---
data_c = base_path / "data"

# Define paths
data_dict = data_c / "dictionaries"
data_stopwords = data_c / "stopwords"
data_freq = data_c / "freq"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
data_temp = data_c / "temp"


Project root set to: C:\Users\sarah\Downloads\TESTRUN


In [6]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\Downloads\TESTRUN\notebooks\Additional Analysis\Individual Stopwords


___

## Load Corpus

In [9]:
clean_files = [
    os.path.join(data_temp, 'clean_speeches_indexed1.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed2.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed3.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed4.pkl')
]

In [10]:
df_clean = pd.read_pickle(data_temp / "df_clean.pkl")

___

### Text Pre-Processing

In [13]:
print(os.getcwd())

C:\Users\sarah\Downloads\TESTRUN\notebooks\Additional Analysis\Individual Stopwords


In [14]:
# Get SpaCy stopwords
SPACY_STOPWORDS = list(nlp.Defaults.stop_words)

# Stem SpaCy stopwords and convert to set
stemmed_spacy = set(stemmer.stem(w) for w in SPACY_STOPWORDS)

exclude_words = {"please", "empti", "somehow", "anyhow", "somewher"}  # can add more
stemmed_spacy -= exclude_words

# Already-stemmed custom stopwords
my_stemmed_stopwords = {"year", "time", "member", "session", "work", "oper", "united", "asia", "africa", "america", "europe", "task",
                        "nation", "south", "east", "north", "west", "countri", "deleg", "project",
                        "state", "peopl", "general", "organ", "assembl",
                        "way", "role", "present"}

# Merge sets and sort to get a list

STEMMED_STOPWORDS = sorted(stemmed_spacy.union(my_stemmed_stopwords))

stopwords_path = os.path.join(data_stopwords, "ind_stopwords.pkl")
joblib.dump(STEMMED_STOPWORDS, stopwords_path)

stopwords = set(joblib.load(stopwords_path))

print(f"Saved {len(STEMMED_STOPWORDS)} stemmed stopwords to {stopwords_path}")
print(STEMMED_STOPWORDS[:30])

Saved 321 stemmed stopwords to C:\Users\sarah\Downloads\TESTRUN\data\stopwords\ind_stopwords.pkl
["'d", "'m", "'s", 'a', 'about', 'abov', 'across', 'africa', 'after', 'afterward', 'again', 'against', 'all', 'almost', 'alon', 'along', 'alreadi', 'also', 'although', 'alway', 'am', 'america', 'among', 'amongst', 'amount', 'an', 'and', 'ani', 'anoth', 'anyon']


In [15]:
# == Functions to remove punctioation, tokenize, lowercase, pure digit tokens, words shorter than 2 letters, POS-Tagging, stemm, stopword removal ==

def pro1(lista):
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
        a = [[row[0], [w for w in row[1] if not w.isdigit()]] for row in lista]
        return a
    
def pro4(lista):
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]


def tags(lista):
    t = [[row[0], tagger.tag(row[1])] for row in lista]  # tag each tokenlist
    t = [[row[0], [i[0] for i in row[1] if i[1].startswith(('N', 'V', 'J'))]] for row in t]
    return t
    
def pro5(lista):
    return [
        [row[0], [stemmer.stem(token) for token in row[1]]]
        for row in lista
    ]
    
def pro6(lista):
    return [[row[0], [w for w in row[1] if w not in stopwords]] for row in lista]

def dropnull(lista):
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [16]:
# == Create full pre-processing function and run it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = pro5(data)
    data = pro6(data)
    
    data = dropnull(data)

    filename_preprocessed = data_name.replace('clean_speeches_', 'ind_stopwords_preprocessed_speeches_').replace('.pkl', '.pkl')
    out_preprocessed = os.path.join(data_preprocessed, os.path.basename(filename_preprocessed))
    joblib.dump(data, out_preprocessed
               )
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in clean_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed1.pkl...
[C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed1.pkl] Before tagging: 28.82s
[C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed1.pkl] After tagging: 555.83s
[C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\Downloads\TESTRUN\data\preprocessed\ind_stopwords_preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed1.pkl] Done. Total time: 691.21s

Starting preprocessing for C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed2.pkl...
[C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed2.pkl] Before tagging: 24.20s
[C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed2.pkl] After tagging: 479.93s
[C:\Users\sarah\Downloads\TESTRUN\data\temp\clean_speeches_indexed2.pkl] Saved stemmed version: C:\Users\sarah\Downloa

In [17]:
# Store the pre-processed data
preprocessed_files = [
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed4.pkl')
]

In [18]:
# Load all preprocessed pickle files
preprocessed_data = []
for f in preprocessed_files:
    preprocessed_data.extend(joblib.load(f))

# Turn into DataFrame
df_preprocessed = pd.DataFrame(preprocessed_data, columns=["filename", "speech_preprocessed"])

# Merge into df_clean
ind_stopwords_df_clean = df_clean.merge(df_preprocessed, on="filename", how="left")

print(ind_stopwords_df_clean.head())


          filename                                             speech  \
0  ARG_01_1946.txt  At the resumption of the first session of the ...   
1  AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2  BEL_01_1946.txt  The principal organs of the United Nations hav...   
3  BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4  BOL_01_1946.txt  Coming to this platform where so many distingu...   

  country_code  year country_name  speech_length_words  \
0          ARG  1946    Argentina                 3364   
1          AUS  1946    Australia                 4531   
2          BEL  1946      Belgium                 2501   
3          BLR  1946      Belarus                 3055   
4          BOL  1946      Bolivia                 1501   

   english_official_language  security_council_permanent        speaker_name  \
0                          0                           0            Mr. Arce   
1                          0                        

In [19]:
# == New variable: Speech length of the preprocessed corpus ==

# Count tokens in preprocessed speech
ind_stopwords_df_clean["speech_length_preprocessed"] = ind_stopwords_df_clean["speech_preprocessed"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

print(ind_stopwords_df_clean[["filename", "speech_length_preprocessed"]].head())
all_tokens = [token for speech in ind_stopwords_df_clean["speech_preprocessed"].dropna() for token in speech]
unique_tokens = set(all_tokens)
print("Total unique tokens:", len(unique_tokens))

# Average length of preprocessed speeches
average_length = ind_stopwords_df_clean["speech_length_preprocessed"].mean()

print(f"Average number of tokens per speech: {average_length:.2f}")

          filename  speech_length_preprocessed
0  ARG_01_1946.txt                        1207
1  AUS_01_1946.txt                        1656
2  BEL_01_1946.txt                         964
3  BLR_01_1946.txt                        1232
4  BOL_01_1946.txt                         517
Total unique tokens: 39996
Average number of tokens per speech: 1174.44


In [20]:
print(ind_stopwords_df_clean.head())

          filename                                             speech  \
0  ARG_01_1946.txt  At the resumption of the first session of the ...   
1  AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2  BEL_01_1946.txt  The principal organs of the United Nations hav...   
3  BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4  BOL_01_1946.txt  Coming to this platform where so many distingu...   

  country_code  year country_name  speech_length_words  \
0          ARG  1946    Argentina                 3364   
1          AUS  1946    Australia                 4531   
2          BEL  1946      Belgium                 2501   
3          BLR  1946      Belarus                 3055   
4          BOL  1946      Bolivia                 1501   

   english_official_language  security_council_permanent        speaker_name  \
0                          0                           0            Mr. Arce   
1                          0                        

In [21]:
print(os.getcwd())

C:\Users\sarah\Downloads\TESTRUN\notebooks\Additional Analysis\Individual Stopwords


---

## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [25]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

#def remove_rare_words(filenames, freqs, min_count=10):
   # for fname in filenames:
       # data = joblib.load(fname)
       # filtered_data = []
        #for doc_id, tokens in data:
          #  filtered_tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
          #  filtered_data.append([doc_id, filtered_tokens])
       # joblib.dump(filtered_data, fname)  # overwrite or save as new file
       # print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
ind_stopwords_word_counts = count_frequencies(preprocessed_files)

#remove_rare_words(preprocessed_files, word_counts, min_count=10)

print("\n[Stemmed] Top 50 most common words:")
for word, count in ind_stopwords_word_counts.most_common(50):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 50 least common words:")
for word, count in ind_stopwords_word_counts.most_common()[-50:]:
    print(f"{word}: {count}")

# Save stemmed word counts
save_path = os.path.join(data_freq, 'ind_stopwords_word_counts.pkl')
joblib.dump(ind_stopwords_word_counts, save_path)

100%|██████████| 4/4 [00:23<00:00,  5.87s/it]



[Stemmed] Top 50 most common words:
unit: 187107
intern: 161305
develop: 147090
peac: 134933
world: 133091
secur: 86024
govern: 75551
econom: 73475
right: 67482
new: 59536
effort: 57570
human: 57178
problem: 56809
support: 55692
continu: 53696
communiti: 49388
region: 48963
polit: 48469
war: 41867
need: 41509
council: 41193
import: 40881
achiev: 39487
power: 38401
hope: 38377
conflict: 37632
presid: 37232
situat: 36254
principl: 36246
global: 36179
resolut: 35258
republ: 34661
forc: 34483
great: 34066
relat: 33847
order: 33512
concern: 33294
action: 32668
nuclear: 32246
solut: 32199
establish: 31713
confer: 31362
polici: 30730
commit: 30720
social: 30684
respect: 30371
effect: 30316
independ: 29254
chang: 28912
interest: 28268

[Stemmed] Top 50 least common words:
salway: 1
shshout: 1
montremontreux: 1
navinavig: 1
wbecam: 1
mmadam: 1
aattack: 1
paa: 1
desexu: 1
nhuman: 1
inshallah: 1
ffnpt: 1
fakafetai: 1
lasi: 1
bbt: 1
highemiss: 1
nabbanja: 1
robinah: 1
llife: 1
necconnext: 1
juste

['C:\\Users\\sarah\\Downloads\\TESTRUN\\data\\freq\\ind_stopwords_word_counts.pkl']

In [26]:
num_unique_words = len(ind_stopwords_word_counts)
print(f"Number of unique words: {num_unique_words}")

Number of unique words: 39996


In [27]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\Downloads\TESTRUN\data


### Count the frequency of the dictionary words

In [29]:
affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

# == Count dictionary words

print("Contents of affect dictionary:")
print(affect)
print("Number of words in affect dictionary:", len(affect))

print("\nContents of cognition dictionary:")
print(cognition)
print("Number of words in cognition dictionary:", len(cognition))

a_list = [[i, ind_stopwords_word_counts[i]] for i in affect if i in ind_stopwords_word_counts]
c_list = [[i, ind_stopwords_word_counts[i]] for i in cognition if i in ind_stopwords_word_counts]

a_list = sorted(a_list, key=lambda x: x[1], reverse=True)
c_list = sorted(c_list, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a_list]
c = [[i[0], f"({i[1]}),"] for i in c_list]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "ind_stopwords_affect_words.txt")
cog_out_path = os.path.join(data_freq, "ind_stopwords_cog_words.txt")

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)

# number of affect/cognitive words that appear in word_counts
num_affect_words = len(a_list)
num_cog_words = len(c_list)

# Dictionary words that appear less than 10 times
num_affect_lt10 = sum(1 for _, count in a_list if count < 10)
num_cog_lt10 = sum(1 for _, count in c_list if count < 10)

print(f"Unique affect words in text: {num_affect_words}")
print(f"Unique cognition words in text: {num_cog_words}")
print(f"Affect words with count < 10: {num_affect_lt10}")
print(f"Cognition words with count < 10: {num_cog_lt10}")


# == Calculate weighted frequencies for all words

l = sum(ind_stopwords_word_counts.values())

a = 0.001 # Method to downweight with a smoothing parameter: For frequent words (large v/1), weight approaches 0; for rare words (small v/1) closer to 1
ind_stopwords_word_counts_weighted = {k: a / (a + (v / l)) for k, v in ind_stopwords_word_counts.items()}

joblib.dump(ind_stopwords_word_counts_weighted, os.path.join(data_freq, 'ind_stopwords_word_counts_weighted.pkl'))

# To print top 50 by weighted values, sort the dictionary by value descending:
top_50_weighted = sorted(ind_stopwords_word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:50]

print("Top 50 words by weighted frequency:")
for word, weight in top_50_weighted:
    print(f"{word}: {weight}")

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

---

## Final Cleaning

In [32]:
os.chdir(data_freq)

ind_stopwords_word_counts = joblib.load('ind_stopwords_word_counts.pkl')  # load stemmed counts
# For each speech only keep tokens that appear at least 10x

def select(lista):
    for i in range(len(lista)):
        x = lista[i][0]
        y = lista[i][1]
        y = [w for w in y if ind_stopwords_word_counts.get(w, 0) >= 10]
        lista[i] = [x, y]
    return lista

for data_path in preprocessed_files:
    data = joblib.load(data_path)
    data = select(data)
    cleaned_path = data_path.replace('.pkl', '_final.pkl')
    joblib.dump(data, cleaned_path)

In [36]:
os.chdir(data_preprocessed)

final_files = [
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed1_final.pkl'),
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed2_final.pkl'),
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed3_final.pkl'),
    os.path.join(data_preprocessed, 'ind_stopwords_preprocessed_speeches_indexed4_final.pkl')
]

final_data = []
for fname in final_files:
    final_data.extend(joblib.load(fname))

# Merge with df_merged
ind_stopwords_df_final = pd.DataFrame(final_data, columns=["filename", "speech_final"])
ind_stopwords_df_final = df_clean.merge(ind_stopwords_df_final, on="filename", how="left")

# Create speech_length_final column
ind_stopwords_df_final["speech_length_final"] = ind_stopwords_df_final["speech_final"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

print(ind_stopwords_df_final[["filename", "speech_length_final"]].head())

all_tokens_final = [token for speech in ind_stopwords_df_final["speech_final"].dropna() for token in speech]
unique_tokens_final = set(all_tokens_final)
print("Total unique tokens across all final speeches:", len(unique_tokens_final))

print("Average tokens per final speech:", ind_stopwords_df_final["speech_length_final"].mean())

# Save as pickle
joblib.dump(ind_stopwords_df_final, os.path.join(data_preprocessed, "un_corpus_cleaned_final_ind_stopwords.pkl"))

# Save as CSV
ind_stopwords_df_final.to_csv(
    os.path.join(data_preprocessed, "un_corpus_cleaned_final_ind_stopwords.csv"),
    sep=';',
    index=False,
    encoding='utf-8'
)

          filename  speech_length_final
0  ARG_01_1946.txt                 1188
1  AUS_01_1946.txt                 1656
2  BEL_01_1946.txt                  963
3  BLR_01_1946.txt                 1229
4  BOL_01_1946.txt                  511
Total unique tokens across all final speeches: 12495
Average tokens per final speech: 1168.859203798393
