# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 0: Data Cleaning, Preprocessing & Token Frequencies
### Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure all required packages are installed. If not, set `InstallPackages = TRUE` (see code cells below).  
- The script will automatically ask you to set your working directory.
- The script will automatically create the required folder structure.  
- Later, you will be asked to download the folder "UNGDC_1946_2024.tar.gz" from:  
  https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/0TJX8Y  
  and store it **unzipped** inside the created folder *data_original*
- Additionally, you will be asked to download the following files from the Replication Package of Gennaro & Ash (https://zenodo.org/records/5748084):
    - procedural_words.pkl
    - stopwords.pkl
    - dictionary_affect.pkl
    - dictionary_cognition.pkl


### Description: 
- Extract documents from their original txt documents and store them as one csv
- Create new variables
    - year, country_code, country_name
    - speech_length_words
    - english_official_language
    - security_council_permanent
    - gender
    - position
    - speaker_name
    - Country (Year)
- Cleaning
    - remove line breaks, hypehnation etc.
- Preprocessing
    - remove punctuation, tokenize, lowercase, pure digit tokens, words shorter than 2 letters, POS-Tagging, stemm, stopword removal
    - create new variable: speech_length_preprocessd
- Word Frequencies
    - word counts of the preprocessed_corpus
    - count frequency of the dictionary words
    - calculate weighted frequency
- Final preprocessing 
    - Remove words that appear less than 10x times from the preprocessed corpus 

___

## Setup, Installation of required Packages and Libraries & Folder Structure

In [24]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys

    packages = [
        "pandas",
        "nltk",
        "spacy",
        "numpy",
        "gensim",
        "pycountry",
        "matplotlib",
        "tqdm",
        "seaborn",
        "joblib",
        "scipy",
        "tabulate",
        "rapidfuzz",
        "tableone"
        
    ]

    for package in packages:
        if importlib.util.find_spec(package) is None:
            !{sys.executable} -m pip install {package}


DownloadAdditions = False # Set this to True to download these additional resources
if DownloadAdditions:
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    spacy.cli.download('en_core_web_lg') 

In [25]:
# == Import libraries for data processing and NLP ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path
from rapidfuzz import process, fuzz

# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
stemmer = SnowballStemmer("english")
tagger = nltk.perceptron.PerceptronTagger()

In [26]:
# === Set Working Directory and create folder structure ===

# Prompt user to enter working directory path
wd = Path(input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip())

# Change to the entered working directory

#wd = Path(r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit")

# Change to the entered working directory
try:
    os.chdir(wd)
    print(f"Working directory set to: {os.getcwd()}")
except FileNotFoundError:
    print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    exit(1)


# Set base path to current working directory
base_path = Path.cwd()
data_path = base_path / "data"

# List of subfolders to create inside 'data'
subfolders = ["data_original", "dictionaries", "freq", "preprocessed", "temp", "models", "results"]

# Create 'data' folder if it doesn't exist
data_path.mkdir(exist_ok=True)

# Create subfolders
for name in subfolders:
    (data_path / name).mkdir(exist_ok=True)

# Create additional folders in the base directory
additional_folders = ["fig", "tables"]
for folder in additional_folders:
    (base_path / folder).mkdir(exist_ok=True)

print("\nFolder structure created:")
print(f"- {data_path}")
for name in subfolders:
    print(f"  - {name}")
for folder in additional_folders:
    print(f"- {folder}")

# Prompt user to place raw data files
print(
    f"\nPlease place your raw data files (unzipped) into the folder:\n"
    f"  {data_path / 'data_original'}\n"
    f"and store the files 'dictionaries_affect' and 'dictionaries_cognition' into the folder:\n"
    f"  {data_path / 'dictionaries'}\n"
    f"Also, place the 'stopwords' file into the data folder:\n"
    f"  {data_path}"
)


input("Press Enter after you have placed the files to continue...")

print("Continuing with the script...")

Please enter your working directory path (e.g., C:\Users\sarah\OneDrive\Dokumente\Masterarbeit):  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


Working directory set to: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit

Folder structure created:
- C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data
  - data_original
  - dictionaries
  - freq
  - preprocessed
  - temp
  - models
  - results
- fig
- tables

Please place your raw data files (unzipped) into the folder:
  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\data_original
and store the files 'dictionaries_affect' and 'dictionaries_cognition' into the folder:
  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\dictionaries
Also, place the 'stopwords' file into the data folder:
  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


Press Enter after you have placed the files to continue... 


Continuing with the script...


In [27]:
# === Define Folder Paths ===

# If an error occurs, make sure that you actually have these folders in your working directory

data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_preprocessed = data_c / "preprocessed"

In [28]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


___

## Load and Prepare Corpus

In [31]:
# == Load the txt-files from the UN General Debate Corpus ==

# Set Folder path containing the original TXT files    
base_folder = r".\data\data_original\TXT"

all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

print(f"Total speeches found: {len(all_txt_files)}")

raw_data = []
for filepath in all_txt_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

df_raw = pd.DataFrame(raw_data)

df_raw = df_raw[df_raw['filename'] != '.DS_Store-to-UTF-8.txt'].copy()

# Drop empty speeches
df_raw['speech'] = df_raw['speech'].astype(str)
df_raw = df_raw[df_raw['speech'].str.strip() != ''].copy()

dupe_labels = df_raw[df_raw.duplicated(subset=['filename', 'speech'], keep=False)]
print(dupe_labels[['filename', 'speech']].head(20))

# == Store as csv and pkl ==

raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")


Total speeches found: 10953
Empty DataFrame
Columns: [filename, speech]
Index: []

 Saved raw data with 10952 speeches to '.\data\un_corpus_raw.csv'


In [32]:
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")
df_raw.head()         

Unnamed: 0,filename,speech
0,ARG_01_1946.txt,At the resumption of the first session of the ...
1,AUS_01_1946.txt,The General Assembly of the United Nations is ...
2,BEL_01_1946.txt,The\tprincipal organs of the United Nations ha...
3,BLR_01_1946.txt,As more than a year has elapsed since the Unit...
4,BOL_01_1946.txt,Coming to this platform where so many distingu...


___

## Create new variables

#### New Variables: Year, Country Code and Country Name

In [36]:
# == Create variable: country code & year ==

# Create contry_code and year variable
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

print("Min year:", df_raw['year'].min())
print("Max year:", df_raw['year'].max())
# Speeches range from 1946 to 2024

# == Create variable: country_name by matching ISO country code 
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and iso codes that were causing missings when running before
custom_names = {
    "BOL": "Bolivia",
    "COD": "Democratic Republic of Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YMD": "South Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
}

code_to_name.update(custom_names)
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

Min year: 1946
Max year: 2024
Missing codes: []


In [37]:
# == Check country names and structure ==

pd.set_option('display.max_rows', None)
print(df_raw[['country_code', 'country_name']].drop_duplicates().sort_values('country_code').reset_index(drop=True))
pd.reset_option('display.max_rows')

    country_code                      country_name
0            AFG                       Afghanistan
1            AGO                            Angola
2            ALB                           Albania
3            AND                           Andorra
4            ARE              United Arab Emirates
5            ARG                         Argentina
6            ARM                           Armenia
7            ATG               Antigua and Barbuda
8            AUS                         Australia
9            AUT                           Austria
10           AZE                        Azerbaijan
11           BDI                           Burundi
12           BEL                           Belgium
13           BEN                             Benin
14           BFA                      Burkina Faso
15           BGD                        Bangladesh
16           BGR                          Bulgaria
17           BHR                           Bahrain
18           BHS               

#### New Variable: Length of speeches

In [39]:
# Count total number of unique tokens in the corpus
all_tokens = set()
for speech in df_raw['speech']:
    all_tokens.update(str(speech).split())
print("Total number of unique tokens in the corpus:", len(all_tokens))

# Count total number of tokens in the corpus
total_tokens = df_raw['speech'].apply(lambda x: len(str(x).split())).sum()
print("Total number of tokens in the corpus:", total_tokens)

# Add new column: speech length in words
df_raw['speech_length_words'] = df_raw['speech'].apply(lambda x: len(str(x).split()))

# Calculate average length
avg_length = df_raw['speech_length_words'].mean()
print("Average speech length (words):", round(avg_length, 2))

# 20 shortest & longest speeches
print("20 shortest speeches:")
print(df_raw.nsmallest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

print("\n20 longest speeches:")
print(df_raw.nlargest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

Total number of unique tokens in the corpus: 225938
Total number of tokens in the corpus: 31911386
Average speech length (words): 2913.75
20 shortest speeches:
              filename                      country_name  year  \
2044   EGY_28_1973.txt                             Egypt  1973   
3961   VCT_41_1986.txt  Saint Vincent and the Grenadines  1986   
9221   BEN_71_2016.txt                             Benin  2016   
10248  GNB_76_2021.txt                     Guinea-Bissau  2021   
17     HTI_01_1946.txt                             Haiti  1946   
10329  RWA_76_2021.txt                            Rwanda  2021   
4468   DDR_45_1990.txt                      East Germany  1990   
9702   LTU_73_2018.txt                         Lithuania  2018   
8969   RWA_69_2014.txt                            Rwanda  2014   
19     IRN_01_1946.txt                              Iran  1946   
9506   LTU_72_2017.txt                         Lithuania  2017   
9163   RWA_70_2015.txt                          

#### New variable: English as Official Language

In [41]:
# Source for english as official language : https://www.cia.gov/the-world-factbook/field/languages/

english_countries = [
    "Anguilla", "Antigua and Barbuda", "Bahamas", "Barbados", "Belize", "Belgium",
    "Bermuda", "Botswana", "British Virgin Islands", "Burundi", "Cameroon", "Canada",
    "Cayman Islands", "Christmas Island", "Cook Islands", "Dominica", "Fiji", "Gambia",
    "Ghana", "Grenada", "Guyana", "Hong Kong", "India", "Ireland", "Jersey", "Kenya",
    "Liberia", "Malawi", "Malta", "Marshall Islands", "Micronesia",
    "Namibia", "New Zealand", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
    "Pakistan", "Palau", "Papua New Guinea", "Philippines", "Pitcairn Islands", "Rwanda",
    "Saint Kitts and Nevis", "Saint Lucia", "Samoa", "Seychelles", "Sierra Leone", "Singapore",
    "Sint Maarten", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Sudan",
    "Eswatini", "Tanzania", "Tonga", "Trinidad and Tobago", "Turks and Caicos Islands",
    "Tuvalu", "Uganda", "Zambia", "Zimbabwe"
]

df_raw['english_official_language'] = df_raw['country_name'].apply(
    lambda x: 1 if x in english_countries else 0
)

matched = set(df_raw['country_name'])
unmatched = [country for country in english_countries if country not in matched]

print("Countries not matched in df_raw['country_name']:")
for country in unmatched:
    print(country)

# All of these countries are either British Overseas Territories, Australian Territories, self-governing island territories or Special Administrative Regions
    # None of the unmatched regions are UN Members

# Check df with new variable english_official_language
df_raw.head()

Countries not matched in df_raw['country_name']:
Anguilla
Bermuda
British Virgin Islands
Cayman Islands
Christmas Island
Cook Islands
Hong Kong
Jersey
Niue
Norfolk Island
Northern Mariana Islands
Pitcairn Islands
Sint Maarten
Turks and Caicos Islands


Unnamed: 0,filename,speech,country_code,year,country_name,speech_length_words,english_official_language
0,ARG_01_1946.txt,At the resumption of the first session of the ...,ARG,1946,Argentina,3364,0
1,AUS_01_1946.txt,The General Assembly of the United Nations is ...,AUS,1946,Australia,4531,0
2,BEL_01_1946.txt,The\tprincipal organs of the United Nations ha...,BEL,1946,Belgium,2501,1
3,BLR_01_1946.txt,As more than a year has elapsed since the Unit...,BLR,1946,Belarus,3055,0
4,BOL_01_1946.txt,Coming to this platform where so many distingu...,BOL,1946,Bolivia,1501,0


#### New variable: Permanent member security council

In [43]:
# Define permanent members of the UN Security Council and create dummy
permanent_members = ['RUS', 'USA', 'FRA', 'GBR', 'CHN']

df_raw['security_council_permanent'] = df_raw['country_code'].isin(permanent_members).astype(int)

print(df_raw[df_raw['country_code'].isin(permanent_members)][
    ['country_code', 'country_name', 'security_council_permanent', 'year']
])

      country_code    country_name  security_council_permanent  year
8              CHN           China                           1  1946
14             FRA          France                           1  1946
15             GBR  United Kingdom                           1  1946
30             RUS          Russia                           1  1946
36             USA   United States                           1  1946
...            ...             ...                         ...   ...
10791          CHN           China                           1  2024
10817          FRA          France                           1  2024
10820          GBR  United Kingdom                           1  2024
10905          RUS          Russia                           1  2024
10941          USA   United States                           1  2024

[388 rows x 4 columns]


#### New variables: Speaker, Position & Gender

In [45]:
# Load suplementary data set which contains information on the speakers name (sometimes including gender) and their position
df_speakers = pd.read_excel(os.path.join(data_c, "data_original", "Speakers_by_session.xlsx"))

print(df_speakers.duplicated(subset=['Year', 'ISO Code']).sum())

dupes_speakers = df_speakers[df_speakers.duplicated(subset=['Year', 'ISO Code'], keep=False)]
print(dupes_speakers.sort_values(['Year', 'ISO Code']).head(20))

# For two observations the noted speakers differ, therefore an additional UN Resource was used to determine the real speaker
# for 1958 Iraq Mr. Jomard see https://digitallibrary.un.org/record/380721
# for 1954 Phillipines Mr. Romulo see https://digitallibrary.un.org/record/380429

df_speakers_cleaned = (
    df_speakers[~(
        ((df_speakers['ISO Code'] == "IRQ") & (df_speakers['Year'] == 1958) & (df_speakers['Name of Person Speaking'] == "Mr. Jawad")) |
        ((df_speakers['ISO Code'] == "PHL") & (df_speakers['Year'] == 1954) & (df_speakers['Name of Person Speaking'] == "Mr. SERRANO"))
    )]
    .drop_duplicates(subset=['Year', 'ISO Code'], keep='first')
)

print(df_speakers_cleaned.duplicated(subset=['Year', 'ISO Code']).sum())

# Note that the UdSSR carries RUS as country label

17
       Year  Session ISO Code                              Country  \
10699  1951        6      RUS  Union of Soviet Socialist Republics   
10745  1951        6      RUS  Union of Soviet Socialist Republics   
10574  1954        9      PHL                          Philippines   
10608  1954        9      PHL                          Philippines   
10454  1956       11      IRQ                                 Iraq   
10517  1956       11      IRQ                                 Iraq   
10516  1956       11      SYR                                Syria   
10519  1956       11      SYR                                Syria   
10398  1957       12      CSK                       Czechoslovakia   
10436  1957       12      CSK                       Czechoslovakia   
10352  1958       13      BGR                             Bulgaria   
10374  1958       13      BGR                             Bulgaria   
10320  1958       13      CSK                       Czechoslovakia   
10376  1958      

In [46]:
# == Create variable speaker_name and position ==

# Merge sublementary data to UNGDC df
df_merged = df_raw.merge(
    df_speakers_cleaned[['Year', 'ISO Code', 'Name of Person Speaking', 'Post']],
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    how='left',
    indicator=True)

unmatched = df_merged[df_merged['_merge'] == 'left_only']
unmatched_count = (df_merged['_merge'] == 'left_only').sum()

print(unmatched[['filename', 'year', 'country_code', 'country_name']])
print(f"{unmatched_count} rows could not be matched")

dupes_speakers = df_merged[df_merged.duplicated(subset=['year', 'country_code'], keep=False)]
print(dupes_speakers.sort_values(['year', 'country_code']).head(20))

# Clean up: 
#- Keep all rows, unmateched rows are being set to NA
#- Drop redundant columns and rename some columns
df_merged = df_merged.drop(columns=['Year', 'ISO Code', '_merge']).rename(columns={
    'Name of Person Speaking': 'speaker_name',
    'Post': 'position'})

# == Create gender dummy ==

df_merged['gender_dummy'] = df_merged['speaker_name'].apply(
    lambda name: 0 if pd.notnull(name) and re.search(r'^(?:Mr|Sir)\b', name, re.IGNORECASE)
    else 1 if pd.notnull(name) and re.search(r'^(?:Mrs|Ms)\b', name, re.IGNORECASE)
    else None)

# Count all values including NaN
counts = df_merged['gender_dummy'].value_counts(dropna=False)

# Build summary using .get() to handle missing keys
gender_summary = pd.DataFrame({
    'gender_dummy': ['0 (male)', '1 (female)', 'NaN (unknown)'],
    'count': [
        counts.get(0, 0),
        counts.get(1, 0),
        counts.get(np.nan, 0)
    ]
})

print(gender_summary)

              filename  year country_code    country_name
47     CSK_02_1947.txt  1947          CSK  Czechoslovakia
850    PRY_16_1961.txt  1961          PRY        Paraguay
895    EGY_17_1962.txt  1962          EGY           Egypt
940    PRT_17_1962.txt  1962          PRT        Portugal
967    BEL_18_1963.txt  1963          BEL         Belgium
...                ...   ...          ...             ...
9348   PRT_71_2016.txt  2016          PRT        Portugal
9398   ZAF_71_2016.txt  2016          ZAF    South Africa
9543   PRK_72_2017.txt  2017          PRK     North Korea
9544   PRT_72_2017.txt  2017          PRT        Portugal
10129  PRT_75_2020.txt  2020          PRT        Portugal

[84 rows x 4 columns]
84 rows could not be matched
Empty DataFrame
Columns: [filename, speech, country_code, year, country_name, speech_length_words, english_official_language, security_council_permanent, Year, ISO Code, Name of Person Speaking, Post, _merge]
Index: []
    gender_dummy  count
0       0

In [47]:
# Examine more detailled the observations that will be put to "Others"

monarch_pattern = r'(?i)\b(crown prince|prince|king|emir|amir|sheikh)\b'
head_of_state_pattern = r'(?i)^head of state\b'
head_of_government_pattern = r'(?i)^head of gover?ment\b'

is_monarch = df_merged['position'].str.contains(monarch_pattern, na=False)
is_head_of_state = df_merged['position'].str.contains(head_of_state_pattern, na=False)
is_head_of_government = df_merged['position'].str.contains(head_of_government_pattern, na=False)

mask = is_monarch | is_head_of_state | is_head_of_government

observations = df_merged.loc[mask, ['position', 'country_name', 'year']]

for _, row in observations.iterrows():
    print(f"{row['position']}: {row['country_name']}, {row['year']}")

# King can be Head of State and would now end up in the "Other" Variable, same for Sheikh or Emir

Prince: Thailand, 1954
King: Jordan, 1960
Prince: Cambodia, 1960
Prince: Morocco, 1960
Prince: Laos, 1963
Head of State: Nigeria, 1977
King: Jordan, 1979
Crown Prince: Jordan, 1981
Prince: Saudi Arabia, 1981
King of the Hasemite Kingdom of Jordan: Jordan, 1985
King : Spain, 1986
Prince: Saudi Arabia, 1986
King : Lesotho, 1987
Amir: Kuwait, 1988
Head of State: Lesotho, 1989
Amir: Kuwait, 1990
Emir: Kuwait, 1991
Prince and Head of State : Liechtenstein, 1991
King: Eswatini, 1991
Prince: Brunei Darussalam, 1992
Sheikh : Kuwait, 1992
Head of State: Liechtenstein, 1993
CROWN PRINCE: Monaco, 1993
Head of State and Commander-in-Chief of the Armed Forces: Nigeria, 1993
Head of State: Eswatini, 1993
Head of State: Brunei Darussalam, 1994
Head of State: Jordan, 1994
Head of State: Monaco, 1994
Head of State: Tajikistan, 1994
Head of State: Monaco, 1996
Head of State: Monaco, 1997
Head of State: Eswatini, 1997
Head of State: Monaco, 1998
Head of State: Nigeria, 1998
Head of State: Monaco, 1999
He

  is_monarch = df_merged['position'].str.contains(monarch_pattern, na=False)


In [48]:
count_head_of_state = is_head_of_state.sum()
count_head_of_government = is_head_of_government.sum()

print(f"Number of Head of State observations: {count_head_of_state}")
print(f"Number of Head of Government observations: {count_head_of_government}")


Number of Head of State observations: 71
Number of Head of Government observations: 0


In [49]:
# == Adjust position variable ==

print(df_merged['position'].unique())

# For speakers that have more than one position it appears that the higher position is always listed first, 
# therefore the second position will be dropped in the following function

# Since there are many expression for the position variable and it dos not seem to be unified, 
# the position variable is adjusted to enable more consitency

def normalize_position(pos):
    if pd.isna(pos):
        return pos

    pos = pos.strip()

    # Fix common typos and extra spaces
    pos = re.sub(r'\s+', ' ', pos) 
    pos_lower = pos.lower()

     # Turn all ministers that deal with foreign affairs and international relations to "Minister for Foreign Affairs"
    foreign_affairs_variants = [
        'minister for foregn affairs',
        'minister responsible for foreign affairs',
        'minsiter for foreign and caricom affairs',
        'minister for external affairs',
        'minister of external relations',  # <-- added
        'foreign minister',
        'minister for international affairs and cooperation',
        'minister for external relations',
        'federal minister for european and international affairs',
        'international cooperation',
        'federal minister for foreign affairs',
        'minister for foreign and caricom affairs',
        'minister of foreign affairs and cooperation',
        'minister for international relations and cooperation',
        'ministry of external relations',
        'acting minister for foreign affairs and international cooperation',
        'ministry of foreign affairs',
        'minister for foreign and political affairs',
        'federal minister for europe, integration, and foreign affairs',
        'federal minister for europe, integration and foreign affairs',
        'minister of foreign and european affaris',
        'minister of foreign affairs',
        'minister for foreign',
        'minister of foreign and european affairs and minister of immigration and asylum',
        'minister for foreign affairs and senegalese living abroad',
        'minister for foreign affairs with responsibility for brexit',
        'minister for foreign affairs and investment promotion'
       
    ]
    if any(variant in pos_lower for variant in foreign_affairs_variants):
        return "Minister for Foreign Affairs"

    pos = re.sub(r'(?i)\brime[- ]?minister\b', 'Prime Minister', pos)

    # Normalize different versions of Head of Government, President, Prime Minsiter and Vice-President-
    exact_matches = {
        r'(?i)^president of (the )?government$': 'Head of Government',
        r'(?i)^acting president$': 'President',
        r'(?i)^interim president$': 'President',
        r'(?i)^constitutional president$': 'President',
        r'(?i)^first executive president$': 'President',
        r'(?i)^first prime[- ]?minister$': 'Prime Minister',
        r'(?i)^head of the goverment$': 'Head of Government',  # <-- catch typo + spaces
        r'(?i)^head\s+of\s+govern?ment$': 'Head of Government',
        r'(?i)^first vice[- ]?president$': 'Vice-President'
    }
    for pattern, replacement in exact_matches.items():
        if re.fullmatch(pattern, pos):
            return replacement

    # Normalize prefixes
    pos = re.sub(r'(?i)^first vice[- ]?president\b', 'Vice-President', pos)
    pos = re.sub(r'(?i)\bprime[- ]?minister\b', 'Prime Minister', pos)
    pos = re.sub(r'(?i)\bpresident\b', 'President', pos)
    pos = re.sub(r'(?i)\bvice[- ]?president\b', 'Vice-President', pos)

    # Collapse primary roles if they appear at start
    primary_roles = [
        (r'(?i)^prime[- ]?minister\b', 'Prime Minister'),
        (r'(?i)^deputy prime[- ]?minister\b', 'Deputy Prime Minister'),
        (r'(?i)^president\b', 'President'),
        (r'(?i)^vice[- ]?president\b', 'Vice-President'),
        (r'(?i)^head of state\b', 'Head of State'),
        (r'(?i)^head of government\b', 'Head of Government'),
        (r'(?i)^(crown prince|prince|king|emir|amir)\b', 'Monarch'),
        (r'(?i)^(un representative|permanent representative|delegation|chair of (the )?delegation|chair of diplomatic representative)\b', 'Diplomatic Representative')
    ]
    for pattern, replacement in primary_roles:
        if re.match(pattern, pos):
            return replacement

    if re.search(r'(?i)not indicated', pos):
        return np.nan

    # Everything else becomes "Others"
    print("Unmatched position:", pos) 
    return "Others"


df_merged["position"] = df_merged["position"].apply(normalize_position)

[nan 'Prince' 'Chairman of the Council of Ministers'
 "Chairman of the Council of Ministers of the People's Republic of Albania"
 'Prime Minister' 'President' 'Emperor' 'King'
 'Chairman of the Council of Ministers of the Union of Soviet Socialist Republics'
 "Vice-President of the Prime Minister's Council" 'President ' ' '
 'Prime Minister   ' 'Chancellor'
 'Head of the Federal Military Government '
 'Prime Minister and Minister for foreign affairs '
 "First Secretary of the Central Committee of the Polish United Workers' Party"
 'Chairman of the United National Front of Kampuchea French'
 'Personal Representative of the Head of State of the Republic of the Philippines'
 'Minister of Foreign Affairs'
 'Prime Minister and Minister for Foreign Affairs' 'President Republic'
 'Prime Minister, Minister for Defence and Foreign Affairs'
 'Prime Minister and Minister for General and Foreign Affairs '
 'Prime Minister and Minister for General and '
 'Prime Minister and Minister for External Af

In [50]:
# Standardize position titles by merging deputy, second, and vice roles into their corresponding official positions for consistent categorization

def merge_positions(pos):
    if pd.isna(pos):
        return pos 
    
    # Merge Prime Minister roles
    if pos in ["Prime Minister", "Deputy Prime Minister"]:
        return "(Deputy) Prime Minister"
    
    # Merge President roles
    if pos in ["President", "Vice-President"]:
        return "(Vice-) President"
    
    # Move these roles to Others
    if pos in ["Head of State", "Head of Government", "Monarch"]:
        return "Others"
        
    if pos in ["Minister for Foreign Affairs", "Deputy Minister for Foreign Affairs",
        "Deputy Minister Foreign Affairs",
        "Second Minister for Foreign Affairs",
        "Second Minister for Foreign Affairs and Trade",
        "Vice Minister for Foreign Affairs"]:
        return "(Deputy) Minister for Foreign Affairs"
    
    return pos

df_merged["position"] = df_merged["position"].apply(merge_positions)

In [51]:
pd.set_option("display.max_rows", None)
position_counts = df_merged['position'].value_counts(dropna=False)
print(position_counts)

position_percentages = df_merged['position'].value_counts(normalize=True, dropna=False) * 100
print("\nPercentages:\n", position_percentages)

position
NaN                                      4679
(Deputy) Minister for Foreign Affairs    2387
(Vice-) President                        2060
(Deputy) Prime Minister                  1239
Diplomatic Representative                 339
Others                                    248
Name: count, dtype: int64

Percentages:
 position
NaN                                      42.722790
(Deputy) Minister for Foreign Affairs    21.795106
(Vice-) President                        18.809350
(Deputy) Prime Minister                  11.313002
Diplomatic Representative                 3.095325
Others                                    2.264427
Name: proportion, dtype: float64


### New Variable: Country (Year)

This variable is later needed to create clean description plots and tables

In [53]:
df_merged = df_merged.copy()
df_merged['speech_label'] = df_merged['country_name'] + " (" + df_merged['year'].astype(str) + ")"

### Save dataframe with all new variables as un_corpus_merged

In [55]:
os.chdir(wd)

merged_pickle_path = r".\data\un_corpus_merged.pkl"
df_merged.to_pickle(merged_pickle_path)
 
merged_output_path = r".\data\un_corpus_merged.csv"
df_merged.to_csv(merged_output_path, index=False, sep=';', encoding='utf-8')

___

## Cleaning & Pre-processing

#### Cleaning

In [59]:
# == Clean corpus by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

def cleaning(content):
    if pd.isna(content):
        return ""
    
    content = content.replace('\n', ' ').replace('\r', ' ')
    content = ' '.join(content.split())

    # Ensure spacing after punctuation
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" → "international")
    content = re.sub(r'-\s', '', content)

    # Replace hyphen between letters with a space to prevent merging words (e.g. "russian-and" → "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)

    content = content.replace("\\", "")

    return content

df_merged['speech'] = df_merged['speech'].astype(str)  # Ensure column is string type
df_clean = df_merged.copy()
df_clean['speech'] = df_clean['speech'].apply(cleaning)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)

In [60]:
# == Split cleaned data into chunks and save as separate files ==

clean_data = df_clean[['filename', 'speech']].values.tolist()

data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

os.chdir(data_temp)

joblib.dump(data_id1, 'clean_speeches_indexed1.pkl')
joblib.dump(data_id2, 'clean_speeches_indexed2.pkl')
joblib.dump(data_id3, 'clean_speeches_indexed3.pkl')
joblib.dump(data_id4, 'clean_speeches_indexed4.pkl')

# Store list of cleaned data chunk paths to feed into preprocessing function later
clean_files = [
    os.path.join(data_temp, 'clean_speeches_indexed1.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed2.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed3.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed4.pkl')
]

print(f"Saved clean speeches chunks in '{data_temp}'")

Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Text Pre-Processing

In [62]:
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


In [63]:
stopwords_path = os.path.join(data_c, "stopwords.pkl")
procedural_words_path = os.path.join(data_c, "procedural_words.pkl")

# Merge stopwords with procedural words
stopwords = joblib.load(stopwords_path)
procedural_words = joblib.load(procedural_words_path)
print(f"Number of stopwords before merging: {len(stopwords)}")
print(f"Number of procedural words before merging: {len(procedural_words)}")
stopwords = set(stopwords) | set(procedural_words) 

print(f"Loaded {len(stopwords)} stopwords")
print(list(stopwords)[:100])

Number of stopwords before merging: 18997
Number of procedural words before merging: 2884
Loaded 21217 stopwords
['tebbett', 'sena', 'ebensburg', 'palestin', 'pecatonica', 'maribel', 'cr', 'altern', 'repeat', 'ankeni', 'everi', 'southington', 'rockhil', 'fortyfour', 'ponder', 'westlak', 'pulaski', 'brinkhaven', 'myersvill', 'dclxlv', 'rosenth', 'osawatomi', 'tarkio', 'medinah', 'offici', 'dcdxxxv', 'panetta', 'cxvii', 'dcccxx', 'substanti', 'robard', 'montandon', 'see', 'gartrel', 'protract', 'lyndhurst', 'bateman', 'kilpatrick', 'mattawana', 'farrow', 'colliersvill', 'brushton', 'saugerti', 'anmoor', 'twenty-third', 'kitchin', 'hutton', 'barwick', 'unicoi', 'ella', 'ellwood', 'plenti', 'mccomb', 'brandon', 'oakwood', 'turn', 'manteo', 'eshleman', 'arkport', 'crandal', 'higganum', 'tunkhannock', 'blossom', 'startup', 'klink', 'glenbeulah', 'suple', 'bright', 'perkiomenvill', 'reynoldsvill', 'murrayvill', 'pelham', 'lafourch', 'through', 'barri', 'lxlix', 'dccliii', 'hilda', 'camak', 'r

In [64]:
# == Functions to remove punctioation, tokenize, lowercase, pure digit tokens, words shorter than 2 letters, POS-Tagging, stemm, stopword removal ==

def pro1(lista):
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
        a = [[row[0], [w for w in row[1] if not w.isdigit()]] for row in lista]
        return a
    
def pro4(lista):
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]


def tags(lista):
    t = [[row[0], tagger.tag(row[1])] for row in lista]  # tag each tokenlist
    t = [[row[0], [i[0] for i in row[1] if i[1].startswith(('N', 'V', 'J'))]] for row in t]
    return t
    
def pro5(lista):
    return [
        [row[0], [stemmer.stem(token) for token in row[1]]]
        for row in lista
    ]
    
def pro6(lista):
    return [[row[0], [w for w in row[1] if w not in stopwords]] for row in lista]

def dropnull(lista):
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [65]:
# == Create full pre-processing function and run it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = pro5(data)
    data = pro6(data)
    
    data = dropnull(data)

    filename_preprocessed = data_name.replace('clean_speeches_', 'preprocessed_speeches_').replace('.pkl', '.pkl')
    out_preprocessed = os.path.join(data_preprocessed, os.path.basename(filename_preprocessed))
    joblib.dump(data, out_preprocessed
               )
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in clean_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Before tagging: 35.68s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] After tagging: 688.56s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Done. Total time: 872.73s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] Before tagging: 29.56s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] After tagging: 604.05s
[C:\Us

In [66]:
# Store the pre-processed data
preprocessed_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4.pkl')
]

In [67]:
# Load all preprocessed pickle files
preprocessed_data = []
for f in preprocessed_files:
    preprocessed_data.extend(joblib.load(f))

# Turn into DataFrame
df_preprocessed = pd.DataFrame(preprocessed_data, columns=["filename", "speech_preprocessed"])

# Merge into df_merged
df_merged = df_merged.merge(df_preprocessed, on="filename", how="left")

print(df_merged.head())


          filename                                             speech  \
0  ARG_01_1946.txt  At the resumption of the first session of the ...   
1  AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2  BEL_01_1946.txt  The\tprincipal organs of the United Nations ha...   
3  BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4  BOL_01_1946.txt  Coming to this platform where so many distingu...   

  country_code  year country_name  speech_length_words  \
0          ARG  1946    Argentina                 3364   
1          AUS  1946    Australia                 4531   
2          BEL  1946      Belgium                 2501   
3          BLR  1946      Belarus                 3055   
4          BOL  1946      Bolivia                 1501   

   english_official_language  security_council_permanent        speaker_name  \
0                          0                           0            Mr. Arce   
1                          0                        

In [68]:
# == New variable: Speech length of the preprocessed corpus ==

# Count tokens in preprocessed speech
df_merged["speech_length_preprocessed"] = df_merged["speech_preprocessed"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

print(df_merged[["filename", "speech_length_preprocessed"]].head())
all_tokens = [token for speech in df_merged["speech_preprocessed"].dropna() for token in speech]
unique_tokens = set(all_tokens)
print("Total unique tokens:", len(unique_tokens))

# Average length of preprocessed speeches
average_length = df_merged["speech_length_preprocessed"].mean()

print(f"Average number of tokens per speech: {average_length:.2f}")

          filename  speech_length_preprocessed
0  ARG_01_1946.txt                         354
1  AUS_01_1946.txt                         360
2  BEL_01_1946.txt                         270
3  BLR_01_1946.txt                         383
4  BOL_01_1946.txt                         146
Total unique tokens: 35009
Average number of tokens per speech: 410.95


In [69]:
print(df_merged.head())

          filename                                             speech  \
0  ARG_01_1946.txt  At the resumption of the first session of the ...   
1  AUS_01_1946.txt  The General Assembly of the United Nations is ...   
2  BEL_01_1946.txt  The\tprincipal organs of the United Nations ha...   
3  BLR_01_1946.txt  As more than a year has elapsed since the Unit...   
4  BOL_01_1946.txt  Coming to this platform where so many distingu...   

  country_code  year country_name  speech_length_words  \
0          ARG  1946    Argentina                 3364   
1          AUS  1946    Australia                 4531   
2          BEL  1946      Belgium                 2501   
3          BLR  1946      Belarus                 3055   
4          BOL  1946      Bolivia                 1501   

   english_official_language  security_council_permanent        speaker_name  \
0                          0                           0            Mr. Arce   
1                          0                        

In [70]:
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


---

## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [74]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

#def remove_rare_words(filenames, freqs, min_count=10):
   # for fname in filenames:
       # data = joblib.load(fname)
       # filtered_data = []
        #for doc_id, tokens in data:
          #  filtered_tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
          #  filtered_data.append([doc_id, filtered_tokens])
       # joblib.dump(filtered_data, fname)  # overwrite or save as new file
       # print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
word_counts = count_frequencies(preprocessed_files)

#remove_rare_words(preprocessed_files, word_counts, min_count=10)

print("\n[Stemmed] Top 50 most common words:")
for word, count in word_counts.most_common(50):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 50 least common words:")
for word, count in word_counts.most_common()[-50:]:
    print(f"{word}: {count}")

# Save stemmed word counts
save_path = os.path.join(data_freq, 'word_counts.pkl')
joblib.dump(word_counts, save_path)

100%|██████████| 4/4 [00:39<00:00,  9.78s/it]



[Stemmed] Top 50 most common words:
econom: 73475
human: 57178
problem: 56809
region: 48963
achiev: 39487
global: 36179
africa: 34787
nuclear: 32246
solut: 32199
social: 30684
charter: 28158
african: 27745
weapon: 26744
contribut: 26390
respons: 26218
negoti: 25658
implement: 25093
cannot: 22646
ensur: 22450
area: 22357
disarma: 21892
increas: 21585
strengthen: 20530
promot: 20382
role: 19597
non: 19390
decis: 18832
propos: 18690
climat: 17991
threat: 17597
goal: 17426
crisi: 17155
terror: 16682
stabil: 16576
struggl: 14857
aggress: 14819
toward: 14590
palestinian: 14093
soviet: 13992
financi: 13807
poverti: 13372
europ: 13173
democraci: 13087
share: 12802
purpos: 12612
suffer: 12543
dialogu: 12385
european: 12324
popul: 12253
regim: 12026

[Stemmed] Top 50 least common words:
aysenur: 1
ezgi: 1
cencourag: 1
salway: 1
shshout: 1
montremontreux: 1
navinavig: 1
wbecam: 1
mmadam: 1
aattack: 1
paa: 1
desexu: 1
nhuman: 1
inshallah: 1
ffnpt: 1
fakafetai: 1
lasi: 1
bbt: 1
highemiss: 1
nabban

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\word_counts.pkl']

In [75]:
num_unique_words = len(word_counts)
print(f"Number of unique words: {num_unique_words}")

Number of unique words: 35009


In [76]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [78]:
affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

# == Count dictionary words

print("Contents of affect dictionary:")
print(affect)
print("Number of words in affect dictionary:", len(affect))

print("\nContents of cognition dictionary:")
print(cognition)
print("Number of words in cognition dictionary:", len(cognition))

a_list = [[i, word_counts[i]] for i in affect if i in word_counts]
c_list = [[i, word_counts[i]] for i in cognition if i in word_counts]

a_list = sorted(a_list, key=lambda x: x[1], reverse=True)
c_list = sorted(c_list, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a_list]
c = [[i[0], f"({i[1]}),"] for i in c_list]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)

# number of affect/cognitive words that appear in word_counts
num_affect_words = len(a_list)
num_cog_words = len(c_list)

# Dictionary words that appear less than 10 times
num_affect_lt10 = sum(1 for _, count in a_list if count < 10)
num_cog_lt10 = sum(1 for _, count in c_list if count < 10)

print(f"Unique affect words in text: {num_affect_words}")
print(f"Unique cognition words in text: {num_cog_words}")
print(f"Affect words with count < 10: {num_affect_lt10}")
print(f"Cognition words with count < 10: {num_cog_lt10}")


# == Calculate weighted frequencies for all words

l = sum(word_counts.values())

a = 0.001 # Method to downweight with a smoothing parameter: For frequent words (large v/1), weight approaches 0; for rare words (small v/1) closer to 1
word_counts_weighted = {k: a / (a + (v / l)) for k, v in word_counts.items()}

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

# To print top 50 by weighted values, sort the dictionary by value descending:
top_50_weighted = sorted(word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:50]

print("Top 50 words by weighted frequency:")
for word, weight in top_50_weighted:
    print(f"{word}: {weight}")

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

---

## Final Cleaning

In [81]:
os.chdir(data_freq)

word_counts = joblib.load('word_counts.pkl')  # load stemmed counts
# For each speech only keep tokens that appear at least 10x

def select(lista):
    for i in range(len(lista)):
        x = lista[i][0]
        y = lista[i][1]
        y = [w for w in y if word_counts.get(w, 0) >= 10]
        lista[i] = [x, y]
    return lista

for data_path in preprocessed_files:
    data = joblib.load(data_path)
    data = select(data)
    cleaned_path = data_path.replace('.pkl', '_final.pkl')
    joblib.dump(data, cleaned_path)

In [82]:
os.chdir(data_preprocessed)

final_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4_final.pkl')
]

final_data = []
for fname in final_files:
    final_data.extend(joblib.load(fname))

# Merge with df_merged
df_final = pd.DataFrame(final_data, columns=["filename", "speech_final"])
df_merged = df_merged.merge(df_final, on="filename", how="left")

# Create speech_length_final column
df_merged["speech_length_final"] = df_merged["speech_final"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

print(df_merged[["filename", "speech_length_final"]].head())

all_tokens_final = [token for speech in df_merged["speech_final"].dropna() for token in speech]
unique_tokens_final = set(all_tokens_final)
print("Total unique tokens across all final speeches:", len(unique_tokens_final))

print("Average tokens per final speech:", df_merged["speech_length_final"].mean())

# Save as pickle
joblib.dump(df_merged, os.path.join(data_c, "un_corpus_merged.pkl"))

# Save as CSV
df_merged.to_csv(
    os.path.join(data_c, "un_corpus_merged.csv"),
    sep=';',
    index=False,
    encoding='utf-8'
)

          filename  speech_length_final
0  ARG_01_1946.txt                  339
1  AUS_01_1946.txt                  360
2  BEL_01_1946.txt                  269
3  BLR_01_1946.txt                  380
4  BOL_01_1946.txt                  142
Total unique tokens across all final speeches: 9473
Average tokens per final speech: 405.87783053323597
