# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 0: Data Cleaning, Preprocessing & Token Frequencies
### Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure all required packages are installed. If not, set `InstallPackages = TRUE` (see code cells below).  
- Set your working directory appropriately.  
- The script will automatically create the required folder structure.  
- Later, you will be asked to download the folder "UNGDC_1946_2024.tar.gz" from:  
  https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/0TJX8Y  
  and store it **unzipped** inside the created folder *data_original*


### Description: 
- Extract documents from their original txt documents and store them as one csv
- Create new variables
    - year, country_code, country_name
    - speech_length_words
    - english_official_language
    - security_council_permanent
    - gender
    - position
    - speaker_name
    - Country (Year)
- Cleaning
    - remove line breaks, hypehnation etc.
- Preprocessing
    - remove punctuation, tokenize, lowercase, pure digit tokens, words shorter than 2 letters, POS-Tagging, stemm, stopword removal
    - create new variable: speech_length_preprocessd
- Word Frequencies
    - word counts of the preprocessed_corpus
    - count frequency of the dictionary words
    - calculate weighted frequency
- Final preprocessing (Not correct, I think)
    - Remove words that appear less than 10x times from the preprocessed corpus 

___

## Setup, Installation of required Packages and Libraries & Folder Structure

In [4]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys

    packages = [
        "pandas",
        "nltk",
        "spacy",
        "numpy",
        "gensim",
        "pycountry",
        "matplotlib",
        "tqdm",
        "seaborn",
        "joblib",
        "scipy",
        "tabulate",
        "rapidfuzz",
        "tableone"
        
    ]

    for package in packages:
        if importlib.util.find_spec(package) is None:
            !{sys.executable} -m pip install {package}


DownloadAdditions = False # Set this to True to download these additional resources
if DownloadAdditions:
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    spacy.cli.download('en_core_web_lg') 

In [5]:
# == Import libraries for data processing and NLP ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path
from rapidfuzz import process, fuzz

# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
stemmer = SnowballStemmer("english")
tagger = nltk.perceptron.PerceptronTagger()

In [6]:
# === Set Working Directory and create folder structure ===

# Prompt user to enter working directory path
#wd = input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip()

# Change to the entered working directory
#try:
   # os.chdir(wd)
    #print(f"Working directory set to: {os.getcwd()}")
#except FileNotFoundError:
   # print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    #exit(1)

# Set your working directory (adjust this as needed)

wd = Path(r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit")
os.chdir(wd)

# Set base path to current working directory
base_path = Path.cwd()
data_path = base_path / "data"

# List of subfolders to create inside 'data'
subfolders = ["data_original", "dictionaries", "freq", "preprocessed", "temp"]

# Create 'data' folder if it doesn't exist
data_path.mkdir(exist_ok=True)

# Create subfolders
for name in subfolders:
    (data_path / name).mkdir(exist_ok=True)

# Create additional folders in the base directory
additional_folders = ["fig", "tables"]
for folder in additional_folders:
    (base_path / folder).mkdir(exist_ok=True)

print("\nFolder structure created:")
print(f"- {data_path}")
for name in subfolders:
    print(f"  - {name}")
for folder in additional_folders:
    print(f"- {folder}")

# Prompt user to place raw data files
#print(f"\nPlease place your raw data files (unzipped) into the folder:\n  {data_path / 'data_original'}")
#input("Press Enter after you have placed the files to continue...")

#print("Continuing with the script...")



Folder structure created:
- C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data
  - data_original
  - dictionaries
  - freq
  - preprocessed
  - temp
- fig
- tables


In [7]:
# === Define Folder Paths ===

# If an error occurs, make sure that you actually have these folders in your working directory

data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_preprocessed = data_c / "preprocessed"
#fig_dir = wd /"fig"
#tables_dir = wd /"tables"

In [8]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


___

## Load and Prepare Corpus

In [11]:
# == Load the txt-files from the UN General Debate Corpus ==

# Set Folder path containing the original TXT files    
base_folder = r".\data\data_original\TXT"

# Collect txt-files
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

print(f"Total speeches found: {len(all_txt_files)}")

# Randomly pick 800 files from the full collection   ################################################# REMOVE AT LATER POINT
sampled_files = random.sample(all_txt_files,10953)

# Read the selected files into a list
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

df_raw = pd.DataFrame(raw_data)

# Include only valid filenames
df_raw = df_raw[df_raw['filename'] != '.DS_Store-to-UTF-8.txt'].copy()

# Drop empty speeches
df_raw['speech'] = df_raw['speech'].astype(str)
df_raw = df_raw[df_raw['speech'].str.strip() != ''].copy()

# Check for duplicates
dupe_labels = df_raw[df_raw.duplicated(subset=['filename', 'speech'], keep=False)]
print(dupe_labels[['filename', 'speech']].head(20))

# == Store as csv and pkl ==

raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")


Total speeches found: 10953
Empty DataFrame
Columns: [filename, speech]
Index: []

 Saved raw data with 10952 speeches to '.\data\un_corpus_raw.csv'


## Is this necessary?

In [13]:
# == Load data ==

df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")
df_raw.head()         

Unnamed: 0,filename,speech
0,BFA_74_2019.txt,As Africa’s candidate for the position of Pres...
1,VEN_46_1991.txt,"﻿Mr. President, I have come before the represe..."
2,MDG_51_1996.txt,"﻿In common with the speakers preceding me, I\n..."
3,AGO_48_1993.txt,"First of all, Sir,\non behalf of the Governmen..."
4,BEN_49_1994.txt,"I\nshould like first of all, Mr. President, to..."


___

## Create new variables

#### New Variables: Year, Country Code and Country Name

In [17]:
# == Create variable: country code & year ==

# Create contry_code and year variable
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

print("Min year:", df_raw['year'].min())
print("Max year:", df_raw['year'].max())
# Speeches range from 1946 to 2023

# == Create variable: country_name by matching ISO country code 
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and legacy codes
custom_names = {
    "BOL": "Bolivia",
    "COD": "Democratic Republic of Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YMD": "South Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
}

code_to_name.update(custom_names)
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Check missing mappings
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

Min year: 1946
Max year: 2024
Missing codes: []


In [18]:
# == Check country names and structure ==

pd.set_option('display.max_rows', None)
print(df_raw[['country_code', 'country_name']].drop_duplicates().sort_values('country_code').reset_index(drop=True))
pd.reset_option('display.max_rows')

    country_code                      country_name
0            AFG                       Afghanistan
1            AGO                            Angola
2            ALB                           Albania
3            AND                           Andorra
4            ARE              United Arab Emirates
5            ARG                         Argentina
6            ARM                           Armenia
7            ATG               Antigua and Barbuda
8            AUS                         Australia
9            AUT                           Austria
10           AZE                        Azerbaijan
11           BDI                           Burundi
12           BEL                           Belgium
13           BEN                             Benin
14           BFA                      Burkina Faso
15           BGD                        Bangladesh
16           BGR                          Bulgaria
17           BHR                           Bahrain
18           BHS               

#### New Variable: Length of speeches

In [20]:
# Count total number of unique tokens in the corpus
all_tokens = set()
for speech in df_raw['speech']:
    all_tokens.update(str(speech).split())
print("Total number of unique tokens in the corpus:", len(all_tokens))

# Count total number of tokens in the corpus
total_tokens = df_raw['speech'].apply(lambda x: len(str(x).split())).sum()
print("Total number of tokens in the corpus:", total_tokens)

# Add a new column: speech length in words
df_raw['speech_length_words'] = df_raw['speech'].apply(lambda x: len(str(x).split()))

# Calculate average length
avg_length = df_raw['speech_length_words'].mean()
print("Average speech length (words):", round(avg_length, 2))

# 20 shortest & longest speeches
print("20 shortest speeches:")
print(df_raw.nsmallest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

print("\n20 longest speeches:")
print(df_raw.nlargest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

Total number of unique tokens in the corpus: 225938
Total number of tokens in the corpus: 31911386
Average speech length (words): 2913.75
20 shortest speeches:
              filename                      country_name  year  \
5229   EGY_28_1973.txt                             Egypt  1973   
6407   VCT_41_1986.txt  Saint Vincent and the Grenadines  1986   
1333   BEN_71_2016.txt                             Benin  2016   
5711   GNB_76_2021.txt                     Guinea-Bissau  2021   
3306   HTI_01_1946.txt                             Haiti  1946   
8505   RWA_76_2021.txt                            Rwanda  2021   
511    DDR_45_1990.txt                      East Germany  1990   
5172   LTU_73_2018.txt                         Lithuania  2018   
6739   RWA_69_2014.txt                            Rwanda  2014   
7137   IRN_01_1946.txt                              Iran  1946   
8184   LTU_72_2017.txt                         Lithuania  2017   
172    RWA_70_2015.txt                          

#### New variable: English as Official Language

In [22]:
# Source for english as official language : https://gradschool.utk.edu/future-students/office-of-graduate-admissions/applying-to-graduate-school/admission-requirements/testing-requirements/countries-with-english-as-official-language/
# They are quoting: https://www.cia.gov/the-world-factbook/field/languages/

english_countries = [
    "Anguilla", "Antigua and Barbuda", "Bahamas", "Barbados", "Belize", "Belgium",
    "Bermuda", "Botswana", "British Virgin Islands", "Burundi", "Cameroon", "Canada",
    "Cayman Islands", "Christmas Island", "Cook Islands", "Dominica", "Fiji", "Gambia",
    "Ghana", "Grenada", "Guyana", "Hong Kong", "India", "Ireland", "Jersey", "Kenya",
    "Liberia", "Malawi", "Malta", "Marshall Islands", "Micronesia",
    "Namibia", "New Zealand", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
    "Pakistan", "Palau", "Papua New Guinea", "Philippines", "Pitcairn Islands", "Rwanda",
    "Saint Kitts and Nevis", "Saint Lucia", "Samoa", "Seychelles", "Sierra Leone", "Singapore",
    "Sint Maarten", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Sudan",
    "Eswatini", "Tanzania", "Tonga", "Trinidad and Tobago", "Turks and Caicos Islands",
    "Tuvalu", "Uganda", "Zambia", "Zimbabwe"
]

# Create dummy column for english being the official language
df_raw['english_official_language'] = df_raw['country_name'].apply(
    lambda x: 1 if x in english_countries else 0
)

# Detect unmatched countries 
matched = set(df_raw['country_name'])
unmatched = [country for country in english_countries if country not in matched]

print("Countries not matched in df_raw['country_name']:")
for country in unmatched:
    print(country)

# All of these countries are either British Overseas Territories, Australian Territories, self-governing island territories or Special Administrative Regions
    # None of the unmatched regions are UN Members

# Check df with new variable english_official_language
df_raw.head()


Countries not matched in df_raw['country_name']:
Anguilla
Bermuda
British Virgin Islands
Cayman Islands
Christmas Island
Cook Islands
Hong Kong
Jersey
Niue
Norfolk Island
Northern Mariana Islands
Pitcairn Islands
Sint Maarten
Turks and Caicos Islands


Unnamed: 0,filename,speech,country_code,year,country_name,speech_length_words,english_official_language
0,BFA_74_2019.txt,As Africa’s candidate for the position of Pres...,BFA,2019,Burkina Faso,2788,0
1,VEN_46_1991.txt,"﻿Mr. President, I have come before the represe...",VEN,1991,Venezuela,2736,0
2,MDG_51_1996.txt,"﻿In common with the speakers preceding me, I\n...",MDG,1996,Madagascar,1997,0
3,AGO_48_1993.txt,"First of all, Sir,\non behalf of the Governmen...",AGO,1993,Angola,3676,0
4,BEN_49_1994.txt,"I\nshould like first of all, Mr. President, to...",BEN,1994,Benin,3856,0


#### New variable: Permanent member security council

In [24]:
# Define permanent members of the UN Security Council and create dummy
permanent_members = ['RUS', 'USA', 'FRA', 'GBR', 'CHN']

df_raw['security_council_permanent'] = df_raw['country_code'].isin(permanent_members).astype(int)

print(df_raw[df_raw['country_code'].isin(permanent_members)][
    ['country_code', 'country_name', 'security_council_permanent', 'year']
])

      country_code   country_name  security_council_permanent  year
36             FRA         France                           1  1979
46             USA  United States                           1  2003
48             USA  United States                           1  1964
49             FRA         France                           1  2007
74             CHN          China                           1  2019
...            ...            ...                         ...   ...
10793          FRA         France                           1  2013
10824          USA  United States                           1  1952
10845          USA  United States                           1  1965
10885          FRA         France                           1  2012
10924          RUS         Russia                           1  1951

[388 rows x 4 columns]


#### New variables: Speaker, Position & Gender

In [26]:
# Load suplementary data set which contains information on the speakers name (sometimes including gender) and their position
df_speakers = pd.read_excel(os.path.join(data_c, "data_original", "Speakers_by_session.xlsx"))

# Check uniqueness of keys in df_speakers
print(df_speakers.duplicated(subset=['Year', 'ISO Code']).sum())

# Check for duplicates in df_speakers
dupes_speakers = df_speakers[df_speakers.duplicated(subset=['Year', 'ISO Code'], keep=False)]
print(dupes_speakers.sort_values(['Year', 'ISO Code']).head(20))

# For two observations the noted speakers differ, therefore an additional UN Resource was used to determine the real speaker
# for 1958 Iraq Mr. Jomard see https://digitallibrary.un.org/record/380721
# for 1954 Phillipines Mr. Romulo see https://digitallibrary.un.org/record/380429

# Overwrite the wrongfully noted speaker for Iraq and the Philipines and afterwards drop each of the double observation
df_speakers_cleaned = (
    df_speakers[~(
        ((df_speakers['ISO Code'] == "IRQ") & (df_speakers['Year'] == 1958) & (df_speakers['Name of Person Speaking'] == "Mr. Jawad")) |
        ((df_speakers['ISO Code'] == "PHL") & (df_speakers['Year'] == 1954) & (df_speakers['Name of Person Speaking'] == "Mr. SERRANO"))
    )]
    .drop_duplicates(subset=['Year', 'ISO Code'], keep='first')
)

# Check if there are duplicates that are still unaccounted for
print(df_speakers_cleaned.duplicated(subset=['Year', 'ISO Code']).sum())

17
       Year  Session ISO Code                              Country  \
10699  1951        6      RUS  Union of Soviet Socialist Republics   
10745  1951        6      RUS  Union of Soviet Socialist Republics   
10574  1954        9      PHL                          Philippines   
10608  1954        9      PHL                          Philippines   
10454  1956       11      IRQ                                 Iraq   
10517  1956       11      IRQ                                 Iraq   
10516  1956       11      SYR                                Syria   
10519  1956       11      SYR                                Syria   
10398  1957       12      CSK                       Czechoslovakia   
10436  1957       12      CSK                       Czechoslovakia   
10352  1958       13      BGR                             Bulgaria   
10374  1958       13      BGR                             Bulgaria   
10320  1958       13      CSK                       Czechoslovakia   
10376  1958      

In [27]:
# == Create variable speaker_name and position ==

# Merge new infrormation to dataframe
df_merged = df_raw.merge(
    df_speakers_cleaned[['Year', 'ISO Code', 'Name of Person Speaking', 'Post']],
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    how='left',
    indicator=True)

# Detect unmatched rows
unmatched = df_merged[df_merged['_merge'] == 'left_only']
unmatched_count = (df_merged['_merge'] == 'left_only').sum()

print(unmatched[['filename', 'year', 'country_code', 'country_name']])
print(f"{unmatched_count} rows could not be matched")

# Check if any duplicates were created
dupes_speakers = df_merged[df_merged.duplicated(subset=['year', 'country_code'], keep=False)]
print(dupes_speakers.sort_values(['year', 'country_code']).head(20))

# Clean up: 
#- Keep all rows, unmateched rows are being set to NA
#- Drop redundant columns and rename some columns
df_merged = df_merged.drop(columns=['Year', 'ISO Code', '_merge']).rename(columns={
    'Name of Person Speaking': 'speaker_name',
    'Post': 'position'})

# == Create gender dummy ==

df_merged['gender_dummy'] = df_merged['speaker_name'].apply(
    lambda name: 0 if pd.notnull(name) and re.search(r'^(?:Mr|Sir)\b', name, re.IGNORECASE)
    else 1 if pd.notnull(name) and re.search(r'^(?:Mrs|Ms)\b', name, re.IGNORECASE)
    else None)

# Count all values including NaN
counts = df_merged['gender_dummy'].value_counts(dropna=False)

# Build summary using .get() to handle missing keys
gender_summary = pd.DataFrame({
    'gender_dummy': ['0 (male)', '1 (female)', 'NaN (unknown)'],
    'count': [
        counts.get(0, 0),
        counts.get(1, 0),
        counts.get(np.nan, 0)
    ]
})

print(gender_summary)

              filename  year country_code    country_name
419    GIN_34_1979.txt  1979          GIN          Guinea
431    YMD_28_1973.txt  1973          YMD     South Yemen
561    YMD_23_1968.txt  1968          YMD     South Yemen
647    KEN_49_1994.txt  1994          KEN           Kenya
670    AUS_49_1994.txt  1994          AUS       Australia
...                ...   ...          ...             ...
10118  CAN_21_1966.txt  1966          CAN          Canada
10438  YMD_35_1980.txt  1980          YMD     South Yemen
10646  DNK_24_1969.txt  1969          DNK         Denmark
10712  CHN_18_1963.txt  1963          CHN           China
10897   EU_68_2013.txt  2013           EU  European Union

[84 rows x 4 columns]
84 rows could not be matched
Empty DataFrame
Columns: [filename, speech, country_code, year, country_name, speech_length_words, english_official_language, security_council_permanent, Year, ISO Code, Name of Person Speaking, Post, _merge]
Index: []
    gender_dummy  count
0       0

In [42]:
# == Adjust position variable ==

print(df_merged['position'].unique())
# For speakers that have more than one position it appears that the higher position is always listed first, therefore the second position will be dropped in the following function

# Since there are many expression for the position variable and it dos not seem to be unified, the position variable is adjusted to enable more consitency

def normalize_position(pos):
    pos = row["position"]
    country
    if pd.isna(pos):
        return pos

    pos = pos.strip()

    # Fix common typos and extra spaces
    pos = re.sub(r'\s+', ' ', pos) 
    pos_lower = pos.lower()

     # Turn all ministers that deal with foreign affairs and international relations to "Minister for Foreign Affairs"
    foreign_affairs_variants = [
        'minister for foregn affairs',
        'minister responsible for foreign affairs',
        'minsiter for foreign and caricom affairs',
        'minister for external affairs',
        'minister of external relations',  # <-- added
        'foreign minister',
        'minister for international affairs and cooperation',
        'minister for external relations',
        'federal minister for european and international affairs',
        'international cooperation',
        'federal minister for foreign affairs',
        'minister for foreign and caricom affairs',
        'minister of foreign affairs and cooperation',
        'minister for international relations and cooperation',
        'ministry of external relations',
        'acting minister for foreign affairs and international cooperation',
        'ministry of foreign affairs',
        'minister for foreign and political affairs',
        'federal minister for europe, integration, and foreign affairs',
        'federal minister for europe, integration and foreign affairs',
        'minister of foreign and european affaris',
        'minister of foreign affairs',
        'minister for foreign',
        'minister of foreign and european affairs and minister of immigration and asylum',
        'minister for foreign affairs and senegalese living abroad',
        'minister for foreign affairs with responsibility for brexit',
        'minister for foreign affairs and investment promotion'
       
    ]
    if any(variant in pos_lower for variant in foreign_affairs_variants):
        return "Minister for Foreign Affairs"

    # Fix "rime minister" typo
    pos = re.sub(r'(?i)\brime[- ]?minister\b', 'Prime Minister', pos)

    # Normalize different versions of Head of Government, President, Prime Minsiter and Vice-President-
    exact_matches = {
        r'(?i)^president of (the )?government$': 'Head of Government',
        r'(?i)^acting president$': 'President',
        r'(?i)^interim president$': 'President',
        r'(?i)^constitutional president$': 'President',
        r'(?i)^first executive president$': 'President',
        r'(?i)^first prime[- ]?minister$': 'Prime Minister',
        r'(?i)^head of the goverment$': 'Head of Government',  # <-- catch typo + spaces
        r'(?i)^head\s+of\s+govern?ment$': 'Head of Government',
        r'(?i)^first vice[- ]?president$': 'Vice-President'
    }
    for pattern, replacement in exact_matches.items():
        if re.fullmatch(pattern, pos):
            return replacement

    # Normalize prefixes
    pos = re.sub(r'(?i)^first vice[- ]?president\b', 'Vice-President', pos)
    pos = re.sub(r'(?i)\bprime[- ]?minister\b', 'Prime Minister', pos)
    pos = re.sub(r'(?i)\bpresident\b', 'President', pos)
    pos = re.sub(r'(?i)\bvice[- ]?president\b', 'Vice-President', pos)

    # Print observations for Monarchs, Heads of State, and Heads of Government
    if re.search(r'(?i)\b(crown prince|prince|king|emir|amir|sheikh)\b', pos):
        print(f"Monarch detected: {row['country_name']}, {row['year']}")
    elif re.search(r'(?i)^head of state\b', pos):
        print(f"Head of State detected: {row['country_name']}, {row['year']}")
    elif re.search(r'(?i)^head of gover?ment\b', pos):
        print(f"Head of Government detected: {row['country_name']}, {row['year']}")


    # Collapse primary roles if they appear at start
    primary_roles = [
        (r'(?i)^prime[- ]?minister\b', 'Prime Minister'),
        (r'(?i)^deputy prime[- ]?minister\b', 'Deputy Prime Minister'),
        (r'(?i)^president\b', 'President'),
        (r'(?i)^vice[- ]?president\b', 'Vice-President'),
        (r'(?i)^head of state\b', 'Head of State'),
        (r'(?i)^head of government\b', 'Head of Government'),
        (r'(?i)^(crown prince|prince|king|emir|amir)\b', 'Monarch'),
        (r'(?i)^(un representative|permanent representative|delegation|chair of (the )?delegation|chair of diplomatic representative)\b', 'Diplomatic Representative')
    ]
    for pattern, replacement in primary_roles:
        if re.match(pattern, pos):
            return replacement

    # Monarchs
    #if re.search(r'(?i)\b(crown prince|prince|king|emir|amir)\b', pos):
     #   return "Monarch"

    # Head of State
    #if re.search(r'(?i)head of state', pos):
     #   return "Head of State"
        
    # Diplomatic Representatives
    #if re.search(r'(?i)(un representative|permanent representative|delegation|chair of (the )?delegation|chair of diplomatic representative)', pos):
     #   return "Diplomatic Representative"

    # Not indicated
    #if re.search(r'(?i)not indicated', pos):
     #   return np.nan

    # Everything that is leftover becomes Others
    print("Unmatched position:", pos) 
    return "Others"


df_merged["position"] = df_merged["position"].apply(normalize_position)

['President of the Council of Ministers' 'President'
 'Minister for Foreign Affairs' nan 'President '
 'Commander in Chief, Head of State' 'Prime Minister'
 'Deputy Prime Minister' 'UN Representative' 'Prime minister'
 'Prime Minister and Minister for Reform' 'Head of Government'
 'Minister for Foreign Affairs and Minister for Intelligence'
 'State Councillor and Minister for Foreign Affairs' 'Vice-President'
 'Minister of Foreign Affairs '
 'Minister for Foreign Affairs, Justice and Culture'
 'Minister for Foreign Affairs and Cooperation of Congolese Living Abroad'
 'Minister for Foreign and Political Affairs and Justice'
 'Permanent Representative' 'Constitutional President'
 'President and Commander-in-Chief of the Defence Forces' 'Vice President'
 'President of the Government'
 'Minister of Foreign and European Affairs and Minister of Immigration and Asylum'
 'Prime Minister and Minister for Foreign Affairs'
 "Vice-President of the Prime Minister's Council"
 'Minister for Foreign A

NameError: name 'row' is not defined

In [225]:
# Standardize position titles by merging deputy, second, and vice roles into their corresponding official positions for consistent categorization

def merge_positions(pos):
    if pd.isna(pos):
        return pos 
    
    # Merge Prime Minister roles
    if pos in ["Prime Minister", "Deputy Prime Minister"]:
        return "(Deputy) Prime Minister"
    
    # Merge President roles
    if pos in ["President", "Vice-President"]:
        return "(Vice-) President"
    
    # Move these roles to Others
    if pos in ["Head of State", "Head of Government", "Monarch"]:
        return "Others"
        
    if pos in ["Minister for Foreign Affairs", "Deputy Minister for Foreign Affairs",
        "Deputy Minister Foreign Affairs",
        "Second Minister for Foreign Affairs",
        "Second Minister for Foreign Affairs and Trade",
        "Vice Minister for Foreign Affairs"]:
        return "(Deputy) Minister for Foreign Affairs"
    
    return pos

df_merged["position"] = df_merged["position"].apply(merge_positions)

In [226]:
pd.set_option("display.max_rows", None)
position_counts = df_merged['position'].value_counts(dropna=False)
print(position_counts)

NaN                                      4679
(Deputy) Minister for Foreign Affairs    2387
(Vice-) President                        2060
(Deputy) Prime Minister                  1239
Diplomatic Representative                 340
Others                                    247
Name: position, dtype: int64


### New Variable: Country (Year)

This variable is later needed to create clean description plots and tables

In [228]:
df_merged = df_merged.copy()
df_merged['speech_label'] = df_merged['country_name'] + " (" + df_merged['year'].astype(str) + ")"

### Save dataframe with all new variables as un_corpus_merged

In [230]:
os.chdir(wd)

merged_pickle_path = r".\data\un_corpus_merged.pkl"
df_merged.to_pickle(merged_pickle_path)
 
merged_output_path = r".\data\un_corpus_merged.csv"
df_merged.to_csv(merged_output_path, index=False, sep=';', encoding='utf-8')

___

## Cleaning & Pre-processing

#### Cleaning

In [234]:
# == Clean corpus by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

def cleaning(content):
    if pd.isna(content):
        return ""
    
    content = content.replace('\n', ' ').replace('\r', ' ')
    content = ' '.join(content.split())

    # Ensure spacing after punctuation
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" → "international")
    content = re.sub(r'-\s', '', content)

    # Replace hyphen between letters with a space to prevent merging words (e.g. "russian-and" → "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)

    content = content.replace("\\", "")

    return content

df_merged['speech'] = df_merged['speech'].astype(str)  # Ensure column is string type
df_clean = df_merged.copy()
df_clean['speech'] = df_clean['speech'].apply(cleaning)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [235]:
# == Split cleaned data into chunks and save as separate files ==

clean_data = df_clean[['filename', 'speech']].values.tolist()

data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

os.chdir(data_temp)

joblib.dump(data_id1, 'clean_speeches_indexed1.pkl')
joblib.dump(data_id2, 'clean_speeches_indexed2.pkl')
joblib.dump(data_id3, 'clean_speeches_indexed3.pkl')
joblib.dump(data_id4, 'clean_speeches_indexed4.pkl')

# Store list of cleaned data chunk paths to feed into preprocessing function later
clean_files = [
    os.path.join(data_temp, 'clean_speeches_indexed1.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed2.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed3.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed4.pkl')
]

print(f"Saved clean speeches chunks in '{data_temp}'")

Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Text Pre-Processing

#### Extend stopwords list

In [238]:
#os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


In [239]:
# Full path to stopwords pickle
#stopwords_path = os.path.join(data_c, "stopwords.pkl")

# Load & sort stopwords
#stopwords = joblib.load(stopwords_path)
#stopwords = sorted(stopwords)

#print(f"Loaded {len(stopwords)} stopwords (sorted alphabetically)")
#print(stopwords[:1000])

In [240]:
# Step 1: Stem SpaCy stopwords and convert to set
stemmed_spacy = set(stemmer.stem(w) for w in SPACY_STOPWORDS)

exclude_words = {"please", "empti", "somehow", "anyhow", "somewher"}  # can add more
stemmed_spacy -= exclude_words

# Step 2: Your already-stemmed custom stopwords
my_stemmed_stopwords = {"year", "time", "member", "session", "work", "oper",
                        "nation", "south", "east", "countri", "afirca", "deleg",
                        "state", "peopl", "general", "organ", "assembl",
                        "way", "role", "present", "foreign", "presid"}

# Step 3: Merge sets and sort to get a list

STEMMED_STOPWORDS = sorted(stemmed_spacy.union(my_stemmed_stopwords))

# Step 4: Save
save_path = os.path.join(data_c, "spacy_stopwords_stemmed.pkl")
joblib.dump(STEMMED_STOPWORDS, save_path)

print(f"Saved {len(STEMMED_STOPWORDS)} stemmed stopwords to {save_path}")
print(STEMMED_STOPWORDS[:30])


Saved 315 stemmed stopwords to C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\spacy_stopwords_stemmed.pkl
["'d", "'m", "'s", 'a', 'about', 'abov', 'across', 'afirca', 'after', 'afterward', 'again', 'against', 'all', 'almost', 'alon', 'along', 'alreadi', 'also', 'although', 'alway', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'ani', 'anoth', 'anyon', 'anyth']


In [241]:
# Get SpaCy stopwords
#SPACY_STOPWORDS = list(nlp.Defaults.stop_words)

# Stem each stopword and deduplicate
#STEMMED_STOPWORDS = sorted(set(stemmer.stem(w) for w in SPACY_STOPWORDS))

# Save as a pickle file inside your data_c folder
#save_path = os.path.join(data_c, "spacy_stopwords_stemmed.pkl")
#joblib.dump(STEMMED_STOPWORDS, save_path)

#print(f"Saved {len(STEMMED_STOPWORDS)} stemmed stopwords to {save_path}")
#print(STEMMED_STOPWORDS[:300])  # preview first 30


In [242]:
# Load dictionaries            ##### How did they come up with this dictionary? Why did they exclude words?

affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

# Check that none of the stopwords are part of the stemmed affect or cognition dictionary

In [243]:
# Beispiel: affect dictionary gegen gestemmte Stopwords prüfen
affect_stopwords = [w for w in affect if w in STEMMED_STOPWORDS]
cognition_stopwords = [w for w in cognition if w in STEMMED_STOPWORDS]

print(f"Affect dictionary contains {len(affect_stopwords)} stopwords: {affect_stopwords}")
print(f"Cognition dictionary contains {len(cognition_stopwords)} stopwords: {cognition_stopwords}")


Affect dictionary contains 1 stopwords: ['pleas']
Cognition dictionary contains 0 stopwords: []


In [244]:
# == Functions to remove punctioation, tokenize, lowercase, pure digit tokens, words shorter than 2 letters, POS-Tagging, stemm, stopword removal ==

def pro1(lista):
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
        a = [[row[0], [w for w in row[1] if not w.isdigit()]] for row in lista]
        return a
    
def pro4(lista):
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]


def tags(lista):
    t = [[row[0], tagger.tag(row[1])] for row in lista]  # tag each tokenlist
    t = [[row[0], [i[0] for i in row[1] if i[1].startswith(('N', 'V', 'J'))]] for row in t]
    return t
    
def pro5(lista):
    return [
        [row[0], [stemmer.stem(token) for token in row[1]]]
        for row in lista
    ]
    
def pro6(lista):
    return [[row[0], [w for w in row[1] if w not in STEMMED_STOPWORDS]] for row in lista]
      
########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [245]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = pro5(data)
    data = pro6(data)
    
    data = dropnull(data)

    filename_preprocessed = data_name.replace('clean_speeches_', 'preprocessed_speeches_').replace('.pkl', '.pkl')
    out_preprocessed = os.path.join(data_preprocessed, os.path.basename(filename_preprocessed))
    joblib.dump(data, out_preprocessed
               )
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in clean_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Before tagging: 14.19s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] After tagging: 218.21s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Done. Total time: 287.26s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] Before tagging: 13.72s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] After tagging: 220.26s
[C:\Us

In [246]:
# Store the pre-processed data
preprocessed_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4.pkl')
]

In [247]:
# Load all preprocessed pickle files
preprocessed_data = []
for f in preprocessed_files:
    preprocessed_data.extend(joblib.load(f))

# Turn into DataFrame
df_preprocessed = pd.DataFrame(preprocessed_data, columns=["filename", "speech_preprocessed"])

# Merge into df_merged
df_merged = df_merged.merge(df_preprocessed, on="filename", how="left")

print(df_merged.head())


          filename                                             speech  \
0  HTI_70_2015.txt  Mr. President, I would like to express my warm...   
1  PRY_58_2003.txt  ﻿Two hundred years after the first cry of free...   
2  GMB_72_2017.txt  With warm greetings to all members of the Gene...   
3  SLV_04_1949.txt  Mr. Castro stated that the election of General...   
4  LBY_56_2001.txt  ﻿At the\noutset, I would like to congratulate ...   

  country_code  year country_name  speech_length_words  \
0          HTI  2015        Haiti                 1601   
1          PRY  2003     Paraguay                 1385   
2          GMB  2017       Gambia                 1696   
3          SLV  1949  El Salvador                 2211   
4          LBY  2001        Libya                 4110   

   english_official_language  security_council_permanent  \
0                          0                           0   
1                          0                           0   
2                          1    

In [248]:
# == New variable: Speech length of the preprocessed corpus ==

# Count tokens in preprocessed speech
df_merged["speech_length_preprocessed"] = df_merged["speech_preprocessed"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

# Quick check
print(df_merged[["filename", "speech_length_preprocessed"]].head())
all_tokens = [token for speech in df_merged["speech_preprocessed"].dropna() for token in speech]
unique_tokens = set(all_tokens)
print("Total unique tokens:", len(unique_tokens))

# Average length of preprocessed speeches
average_length = df_merged["speech_length_preprocessed"].mean()

print(f"Average number of tokens per speech: {average_length:.2f}")

          filename  speech_length_preprocessed
0  HTI_70_2015.txt                         692
1  PRY_58_2003.txt                         595
2  GMB_72_2017.txt                         769
3  SLV_04_1949.txt                         722
4  LBY_56_2001.txt                        1610
Total unique tokens: 40001
Average number of tokens per speech: 1177.18


In [249]:
print(df_merged.head())

          filename                                             speech  \
0  HTI_70_2015.txt  Mr. President, I would like to express my warm...   
1  PRY_58_2003.txt  ﻿Two hundred years after the first cry of free...   
2  GMB_72_2017.txt  With warm greetings to all members of the Gene...   
3  SLV_04_1949.txt  Mr. Castro stated that the election of General...   
4  LBY_56_2001.txt  ﻿At the\noutset, I would like to congratulate ...   

  country_code  year country_name  speech_length_words  \
0          HTI  2015        Haiti                 1601   
1          PRY  2003     Paraguay                 1385   
2          GMB  2017       Gambia                 1696   
3          SLV  1949  El Salvador                 2211   
4          LBY  2001        Libya                 4110   

   english_official_language  security_council_permanent  \
0                          0                           0   
1                          0                           0   
2                          1    

In [250]:
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


---

## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [254]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

#def remove_rare_words(filenames, freqs, min_count=10):
   # for fname in filenames:
       # data = joblib.load(fname)
       # filtered_data = []
        #for doc_id, tokens in data:
          #  filtered_tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
          #  filtered_data.append([doc_id, filtered_tokens])
       # joblib.dump(filtered_data, fname)  # overwrite or save as new file
       # print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
word_counts = count_frequencies(preprocessed_files)

#remove_rare_words(preprocessed_files, word_counts, min_count=10)

print("\n[Stemmed] Top 300 most common words:")
for word, count in word_counts.most_common(300):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 50 least common words:")
for word, count in word_counts.most_common()[-50:]:
    print(f"{word}: {count}")

# Save stemmed word counts
save_path = os.path.join(data_freq, 'word_counts.pkl')
joblib.dump(word_counts, save_path)

100%|██████████| 4/4 [00:19<00:00,  4.98s/it]



[Stemmed] Top 300 most common words:
unit: 187107
intern: 161305
develop: 147090
peac: 134933
world: 133091
secur: 86024
govern: 75551
econom: 73475
right: 67482
new: 59536
effort: 57570
human: 57178
problem: 56809
support: 55692
continu: 53696
communiti: 49388
region: 48963
polit: 48469
war: 41867
need: 41509
council: 41193
import: 40881
achiev: 39487
power: 38401
hope: 38377
conflict: 37632
situat: 36254
principl: 36246
global: 36179
resolut: 35258
africa: 34787
republ: 34661
forc: 34483
great: 34066
relat: 33847
order: 33512
concern: 33294
action: 32668
nuclear: 32246
solut: 32199
establish: 31713
confer: 31362
polici: 30730
commit: 30720
social: 30684
respect: 30371
effect: 30316
independ: 29254
chang: 28912
interest: 28268
charter: 28158
today: 27749
african: 27745
agreement: 27700
system: 27065
progress: 27042
weapon: 26744
end: 26438
contribut: 26390
respons: 26218
process: 26067
territori: 26013
issu: 25878
negoti: 25658
live: 25488
cooper: 25361
believ: 25098
implement: 25093

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\word_counts.pkl']

In [255]:
num_unique_words = len(word_counts)
print(f"Number of unique words: {num_unique_words}")

Number of unique words: 40001


In [256]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [258]:
# == Count dictionary words

print("Contents of affect dictionary:")
print(affect)
print("Number of words in affect dictionary:", len(affect))

print("\nContents of cognition dictionary:")
print(cognition)
print("Number of words in cognition dictionary:", len(cognition))

a_list = [[i, word_counts[i]] for i in affect if i in word_counts]
c_list = [[i, word_counts[i]] for i in cognition if i in word_counts]

a_list = sorted(a_list, key=lambda x: x[1], reverse=True)
c_list = sorted(c_list, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a_list]
c = [[i[0], f"({i[1]}),"] for i in c_list]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)

# number of affect/cognitive words that appear in word_counts
num_affect_words = len(a_list)
num_cog_words = len(c_list)

# out of those which appear less than 10 times
num_affect_lt10 = sum(1 for _, count in a_list if count < 10)
num_cog_lt10 = sum(1 for _, count in c_list if count < 10)

print(f"Unique affect words in text: {num_affect_words}")
print(f"Unique cognition words in text: {num_cog_words}")
print(f"Affect words with count < 10: {num_affect_lt10}")
print(f"Cognition words with count < 10: {num_cog_lt10}")


# == Calculate weighted frequencies for all words

l = sum(word_counts.values())

a = 0.001 # Method to downweight with a smoothing parameter: For frequent words (large v/1), weight approaches 0; for rare words (small v/1) closer to 1
word_counts_weighted = {k: a / (a + (v / l)) for k, v in word_counts.items()}

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

################################################################################ ISSUE ##################
# To print top 100 by weighted values, sort the dictionary by value descending:
top_100_weighted = sorted(word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:100]

print("Top 100 words by weighted frequency:")
for word, weight in top_100_weighted:
    print(f"{word}: {weight}")

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

---

## Final Cleaning

In [261]:
os.chdir(data_freq)

word_counts = joblib.load('word_counts.pkl')  # load stemmed counts
# For each speech only keep tokens that appear at least 10x

def select(lista):
    for i in range(len(lista)):
        x = lista[i][0]
        y = lista[i][1]
        y = [w for w in y if word_counts.get(w, 0) >= 10]
        lista[i] = [x, y]
    return lista

for data_path in preprocessed_files:
    data = joblib.load(data_path)
    data = select(data)
    cleaned_path = data_path.replace('.pkl', '_final.pkl')
    joblib.dump(data, cleaned_path)

In [262]:
os.chdir(data_preprocessed)

final_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4_final.pkl')
]

final_data = []
for fname in final_files:
    final_data.extend(joblib.load(fname))

# Merge with df_merged
df_final = pd.DataFrame(final_data, columns=["filename", "speech_final"])
df_merged = df_merged.merge(df_final, on="filename", how="left")

# Create speech_length_final column
df_merged["speech_length_final"] = df_merged["speech_final"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

# Quick check
print(df_merged[["filename", "speech_length_final"]].head())

all_tokens_final = [token for speech in df_merged["speech_final"].dropna() for token in speech]
unique_tokens_final = set(all_tokens_final)
print("Total unique tokens across all final speeches:", len(unique_tokens_final))

print("Average tokens per final speech:", df_merged["speech_length_final"].mean())


          filename  speech_length_final
0  HTI_70_2015.txt                  689
1  PRY_58_2003.txt                  593
2  GMB_72_2017.txt                  768
3  SLV_04_1949.txt                  722
4  LBY_56_2001.txt                 1593
Total unique tokens across all final speeches: 12500
Average tokens per final speech: 1171.5978816654492
