# Emotion and Reason in Political Language: Examining the UN General speeches
## Script 1: Preprocessing & Token Frequencies
## Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure all required packages are installed. If not, set `InstallPackages = TRUE` (see code cells below).  
- Set your working directory appropriately.  
- The script will automatically create the required folder structure.  
- Later, you will be asked to download the data from:  
  https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/0TJX8Y  
  and store it **unzipped** inside the created folder *data_original*


### Description: 
- Extract documents from their original txt documents and store them as one csv
- Data Cleaning and Pre-Processing
- Count word frequencies and weight themh

___

## Setup, Installation and Verification of required Packages and Libraries

In [26]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys

    packages = [
        "pandas",
        "nltk",
        "spacy",
        "numpy",
        "gensim",
        "pycountry",
        "wordcloud",
        "matplotlib",
        "tqdm"
    ]

    for package in packages:
        !{sys.executable} -m pip install {package}


DownloadAdditions = False # Set this to True to download these additional resources
if DownloadAdditions:
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)

#########################
# Check if all packages are included
##########################

In [105]:
# == Import standard and third-party libraries for data processing, NLP, and visualization ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path

# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
stemmer = SnowballStemmer("english")
tagger = nltk.perceptron.PerceptronTagger()

In [29]:
# === Set Working Directory and create folder structure ===

# Prompt user to enter working directory path
#wd = input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip()

# Change to the entered working directory
#try:
   # os.chdir(wd)
    #print(f"Working directory set to: {os.getcwd()}")
#except FileNotFoundError:
   # print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    #exit(1)

# Set your working directory (adjust this as needed)
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
os.chdir(wd)

# Set base path to current working directory
base_path = Path.cwd()
data_path = base_path / "data"

# List of subfolders to create inside 'data'
subfolders = ["data_original", "dictionaries", "freq", "preprocessed", "temp", "tokenized"]

# Create 'data' folder if it doesn't exist
data_path.mkdir(exist_ok=True)

# Create subfolders
for name in subfolders:
    (data_path / name).mkdir(exist_ok=True)

print("\nFolder structure created:")
print(f"- {data_path}")
for name in subfolders:
    print(f"  - {name}")

# Prompt user to place raw data files
#print(f"\nPlease place your raw data files (unzipped) into the folder:\n  {data_path / 'data_original'}")
#input("Press Enter after you have placed the files to continue...")

#print("Continuing with the script...")



Folder structure created:
- C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data
  - data_original
  - dictionaries
  - freq
  - preprocessed
  - temp
  - tokenized


In [30]:
# === Define Folder Paths ===

# If an error occurs, make sure that you actually have these folders in your working directory
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')
data_dict = os.path.join(data_c, 'dictionaries')
data_preprocessed = os.path.join(data_c, 'preprocessed')
data_tokenized = os.path.join(data_c, 'tokenized')
fig = os.path.join(wd, 'fig')

In [35]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


___

## Load and Prepare Corpus

### This chunk can be skipped at the moment
### Think of proper header

In [40]:
# == Load Sample from UN General Debate Corpus ==

# Set Folder path containing the original TXT files    
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

# Collect txt-files
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

print(f"Total speeches found: {len(all_txt_files)}")

# Randomly pick 800 files from the full collection   ###################################################### REMOVE AT LATER POINT
sampled_files = random.sample(all_txt_files,761)

# Read the selected files into a list
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

df_raw = pd.DataFrame(raw_data)

# == Store as csv and pkl ==

df_raw = df_raw[df_raw['filename'] != '.DS_Store-to-UTF-8.txt'].copy()

raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")


Total speeches found: 10761

 Saved raw data with 761 speeches to '.\data\un_corpus_raw.csv'


In [41]:
# == Load data & drop empty speeches ==

df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# Drop empty speeches
df_raw['speech'] = df_raw['speech'].astype(str)
df_raw = df_raw[df_raw['speech'].str.strip() != ''].copy()

df_raw.head()         


Unnamed: 0,filename,speech
0,THA_67_2012.txt,﻿On behalf of the\ndelegation of the Kingdom o...
1,EGY_02_1947.txt,"In the name of my delegation, permit me to ass..."
2,IRQ_62_2007.txt,I am honoured to address the General Assembly...
3,IRN_10_1955.txt,104. May I be permitted to convey to Mr. Maza ...
4,VNM_71_2016.txt,I congratulate Mr. Peter Thomson on his electi...


In [42]:
dupe_labels = df_raw[df_raw['filename'].duplicated(keep=False)]
print(dupe_labels[['filename', 'speech']].head(20))

Empty DataFrame
Columns: [filename, speech]
Index: []


___

## Create new variables

#### New Variables: Year, Country Code and Country Name

In [46]:
# == Create variable: country code & year

# Create contry_code and year variable
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

print("Min year:", df_raw['year'].min())
print("Max year:", df_raw['year'].max())
# Speeches range from 1946 to 2023

# == Create variable: country_name by matching ISO country code 
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and legacy codes
custom_names = {
    "BOL": "Bolivia",
    "COD": "Democratic Republic of Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YMD": "South Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
}

code_to_name.update(custom_names)
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Check missing mappings
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

Min year: 1946
Max year: 2023
Missing codes: []


In [47]:
# == Check country names and structure

df_raw.head() 

pd.set_option('display.max_rows', None)
print(df_raw[['country_code', 'country_name']].drop_duplicates().sort_values('country_code').reset_index(drop=True))
# Reset to default afterward
pd.reset_option('display.max_rows')

    country_code                      country_name
0            AFG                       Afghanistan
1            AGO                            Angola
2            ALB                           Albania
3            AND                           Andorra
4            ARE              United Arab Emirates
5            ARG                         Argentina
6            ARM                           Armenia
7            ATG               Antigua and Barbuda
8            AUS                         Australia
9            AUT                           Austria
10           BDI                           Burundi
11           BEL                           Belgium
12           BEN                             Benin
13           BFA                      Burkina Faso
14           BGD                        Bangladesh
15           BGR                          Bulgaria
16           BHR                           Bahrain
17           BHS                           Bahamas
18           BIH            Bos

#### New Variable: Length of speeches

In [49]:
all_tokens = set()
for speech in df_raw['speech']:
    all_tokens.update(str(speech).split())
print("Total number of unique tokens in the corpus:", len(all_tokens))

total_tokens = df_raw['speech'].apply(lambda x: len(str(x).split())).sum()
print("Total number of tokens in the corpus:", total_tokens)

# Add a new column: speech length in words
df_raw['speech_length_words'] = df_raw['speech'].apply(lambda x: len(str(x).split()))

# Calculate average length
avg_length = df_raw['speech_length_words'].mean()
print("Average speech length (words):", round(avg_length, 2))

# 20 shortest & longest speeches
print("20 shortest speeches:")
print(df_raw.nsmallest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

print("\n20 longest speeches:")
print(df_raw.nlargest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

Total number of unique tokens in the corpus: 59551
Total number of tokens in the corpus: 2222278
Average speech length (words): 2920.21
20 shortest speeches:
            filename           country_name  year  speech_length_words
128  GNB_76_2021.txt          Guinea-Bissau  2021                  480
706  JOR_61_2006.txt                 Jordan  2006                  571
492  IDN_76_2021.txt              Indonesia  2021                  611
692  RWA_63_2008.txt                 Rwanda  2008                  624
425  SVN_69_2014.txt               Slovenia  2014                  693
219  LTU_71_2016.txt              Lithuania  2016                  739
334   EU_74_2019.txt         European Union  2019                  752
463  STP_36_1981.txt  Sao Tome and Principe  1981                  796
80   URY_76_2021.txt                Uruguay  2021                  854
70   HTI_10_1955.txt                  Haiti  1955                  882
307  PRT_76_2021.txt               Portugal  2021            

#### New variable: English as Official Language

In [51]:
# Source for english as official language : https://gradschool.utk.edu/future-students/office-of-graduate-admissions/applying-to-graduate-school/admission-requirements/testing-requirements/countries-with-english-as-official-language/
# They are quoting: https://www.cia.gov/the-world-factbook/field/languages/

english_countries = [
    "Anguilla", "Antigua and Barbuda", "Bahamas", "Barbados", "Belize", "Belgium",
    "Bermuda", "Botswana", "British Virgin Islands", "Burundi", "Cameroon", "Canada",
    "Cayman Islands", "Christmas Island", "Cook Islands", "Dominica", "Fiji", "Gambia",
    "Ghana", "Grenada", "Guyana", "Hong Kong", "India", "Ireland", "Jersey", "Kenya",
    "Liberia", "Malawi", "Malta", "Marshall Islands", "Micronesia",
    "Namibia", "New Zealand", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
    "Pakistan", "Palau", "Papua New Guinea", "Philippines", "Pitcairn Islands", "Rwanda",
    "Saint Kitts and Nevis", "Saint Lucia", "Samoa", "Seychelles", "Sierra Leone", "Singapore",
    "Sint Maarten", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Sudan",
    "Eswatini", "Tanzania", "Tonga", "Trinidad and Tobago", "Turks and Caicos Islands",
    "Tuvalu", "Uganda", "Zambia", "Zimbabwe"
]

# Create dummy column
df_raw['english_official_language'] = df_raw['country_name'].apply(
    lambda x: 1 if x in english_countries else 0
)

# Detect unmatched countries 
matched = set(df_raw['country_name'])
unmatched = [country for country in english_countries if country not in matched]

print("Countries not matched in df_raw['country_name']:")
for country in unmatched:
    print(country)

# All of these countries are either British Overseas Territories, Australian Territories, self-governing island territories or Special Administrative Regions
    # None of the unmatched regions are UN Members

# Check df with new variable english_official_language
df_raw.head()


Countries not matched in df_raw['country_name']:
Anguilla
Bermuda
British Virgin Islands
Cayman Islands
Christmas Island
Cook Islands
Hong Kong
Jersey
Niue
Norfolk Island
Northern Mariana Islands
Pitcairn Islands
Sint Maarten
South Sudan
Turks and Caicos Islands
Zimbabwe


Unnamed: 0,filename,speech,country_code,year,country_name,speech_length_words,english_official_language
0,THA_67_2012.txt,﻿On behalf of the\ndelegation of the Kingdom o...,THA,2012,Thailand,1448,0
1,EGY_02_1947.txt,"In the name of my delegation, permit me to ass...",EGY,1947,Egypt,2631,0
2,IRQ_62_2007.txt,I am honoured to address the General Assembly...,IRQ,2007,Iraq,2590,0
3,IRN_10_1955.txt,104. May I be permitted to convey to Mr. Maza ...,IRN,1955,Iran,2315,0
4,VNM_71_2016.txt,I congratulate Mr. Peter Thomson on his electi...,VNM,2016,Vietnam,1412,0


#### New variable: Permanent member security council

In [53]:
# Define permanent members of the UN Security Council and create dummy
permanent_members = ['RUS', 'USA', 'FRA', 'GBR', 'CHN']

df_raw['security_council_permanent'] = df_raw['country_code'].isin(permanent_members).astype(int)

print(df_raw[df_raw['country_code'].isin(permanent_members)][
    ['country_code', 'country_name', 'security_council_permanent', 'year']
])

    country_code    country_name  security_council_permanent  year
26           USA   United States                           1  1990
43           FRA          France                           1  1983
77           CHN           China                           1  1991
97           USA   United States                           1  2019
110          CHN           China                           1  1946
113          FRA          France                           1  1991
116          RUS          Russia                           1  1993
203          RUS          Russia                           1  1971
221          GBR  United Kingdom                           1  2007
227          USA   United States                           1  2002
295          FRA          France                           1  2002
314          CHN           China                           1  2000
347          RUS          Russia                           1  1947
371          RUS          Russia                           1  

In [54]:
print(df_raw)

            filename                                             speech  \
0    THA_67_2012.txt  ﻿On behalf of the\ndelegation of the Kingdom o...   
1    EGY_02_1947.txt  In the name of my delegation, permit me to ass...   
2    IRQ_62_2007.txt  I am  honoured to address the General Assembly...   
3    IRN_10_1955.txt  104. May I be permitted to convey to Mr. Maza ...   
4    VNM_71_2016.txt  I congratulate Mr. Peter Thomson on his electi...   
..               ...                                                ...   
756  SVK_49_1994.txt  I should like to congratulate\nMr. Essy of Côt...   
757  URY_34_1979.txt  ﻿I should like to begin by congratulating Mr. ...   
758  SVN_74_2019.txt  It is a distinct honour for me to address the ...   
759  YMD_23_1968.txt  94. Sir, I greet the President, Mr. Arenales, ...   
760   EU_72_2017.txt  The European Union (EU) stands for freedom and...   

    country_code  year    country_name  speech_length_words  \
0            THA  2012        Thaila

#### New variables: Speaker, Position & Gender

In [56]:
df_speakers = pd.read_excel(os.path.join(data_c, "data_original", "UN General Debate Corpus", "Speakers_by_session.xlsx"))

# Check uniqueness of keys in df_speakers
print(df_speakers.duplicated(subset=['Year', 'ISO Code']).sum())

# Check for duplicates in df_speakers
dupes_speakers = df_speakers[df_speakers.duplicated(subset=['Year', 'ISO Code'], keep=False)]
print(dupes_speakers.sort_values(['Year', 'ISO Code']).head(20))

# for 1958 Iraq Mr. Jomard see https://digitallibrary.un.org/record/380721
# for 1954 Phillipines Mr. Romulo see https://digitallibrary.un.org/record/380429

df_speakers_cleaned = (
    df_speakers[~(
        ((df_speakers['ISO Code'] == "IRQ") & (df_speakers['Year'] == 1958) & (df_speakers['Name of Person Speaking'] == "Mr. Jawad")) |
        ((df_speakers['ISO Code'] == "PHL") & (df_speakers['Year'] == 1954) & (df_speakers['Name of Person Speaking'] == "Mr. SERRANO"))
    )]
    .drop_duplicates(subset=['Year', 'ISO Code'], keep='first')
)
print(df_speakers_cleaned.duplicated(subset=['Year', 'ISO Code']).sum())  # should be 0


17
       Year  Session ISO Code                              Country  \
10506  1951        6      RUS  Union of Soviet Socialist Republics   
10552  1951        6      RUS  Union of Soviet Socialist Republics   
10381  1954        9      PHL                          Philippines   
10415  1954        9      PHL                          Philippines   
10261  1956       11      IRQ                                 Iraq   
10324  1956       11      IRQ                                 Iraq   
10323  1956       11      SYR                                Syria   
10326  1956       11      SYR                                Syria   
10205  1957       12      CSK                       Czechoslovakia   
10243  1957       12      CSK                       Czechoslovakia   
10159  1958       13      BGR                             Bulgaria   
10181  1958       13      BGR                             Bulgaria   
10127  1958       13      CSK                       Czechoslovakia   
10183  1958      

In [57]:
# Supplmentary xlsx-file from the UN Dataset provides information on the speaker and their position

####### CHECK IF THIS WOULD WORK IN A REPLICATION #####################################

# == Create variable speaker_name and position ==

# Merge new infrormation to dataframe
df_merged = df_raw.merge(
    df_speakers_cleaned[['Year', 'ISO Code', 'Name of Person Speaking', 'Post']],
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    how='left',
    indicator=True)

# Detect unmatched rows
unmatched = df_merged[df_merged['_merge'] == 'left_only']
unmatched_count = (df_merged['_merge'] == 'left_only').sum()

print(unmatched[['filename', 'year', 'country_code', 'country_name']])
print(f"{unmatched_count} rows could not be matched")

# Clean up 
df_merged = df_merged.drop(columns=['Year', 'ISO Code', '_merge']).rename(columns={
    'Name of Person Speaking': 'speaker_name',
    'Post': 'position'
})

# == Create gender dummy ==
df_merged['gender_dummy'] = df_merged['speaker_name'].apply(
    lambda name: 0 if pd.notnull(name) and re.search(r'^(?:Mr|Sir)\b', name, re.IGNORECASE)
    else 1 if pd.notnull(name) and re.search(r'^(?:Mrs|Ms)\b', name, re.IGNORECASE)
    else None
)

# Count all values including NaN
counts = df_merged['gender_dummy'].value_counts(dropna=False)

# Build summary using .get() to handle missing keys
gender_summary = pd.DataFrame({
    'gender_dummy': ['0 (male)', '1 (female)', 'NaN (unknown)'],
    'count': [
        counts.get(0, 0),
        counts.get(1, 0),
        counts.get(np.nan, 0)
    ]
})

print(gender_summary)

            filename  year country_code    country_name
123  YMD_26_1971.txt  1971          YMD     South Yemen
566  YMD_33_1978.txt  1978          YMD     South Yemen
607  YMD_32_1977.txt  1977          YMD     South Yemen
618   EU_68_2013.txt  2013           EU  European Union
653  YMD_38_1983.txt  1983          YMD     South Yemen
703  CYP_18_1963.txt  1963          CYP          Cyprus
759  YMD_23_1968.txt  1968          YMD     South Yemen
7 rows could not be matched
    gender_dummy  count
0       0 (male)    328
1     1 (female)     12
2  NaN (unknown)    421


Looking at the structure, highest position always seems to be mentioned first --> drop everything else if speaker has more than one position

In [59]:
dupes_speakers = df_merged[df_merged.duplicated(subset=['year', 'country_code'], keep=False)]
print(dupes_speakers.sort_values(['year', 'country_code']).head(20))

Empty DataFrame
Columns: [filename, speech, country_code, year, country_name, speech_length_words, english_official_language, security_council_permanent, speaker_name, position, gender_dummy]
Index: []


In [60]:
# == Adjust position variable
def normalize_position(pos):
    if pd.isna(pos):
        return pos

    pos = pos.strip()

    # --- Fix common typos and extra spaces ---
    pos = re.sub(r'\s+', ' ', pos)  # collapse multiple spaces
    pos_lower = pos.lower()

     # Turn all ministers that deal with foreign affairs and international relations to "Minister for Foreign Affairs
    foreign_affairs_variants = [
        'minister for foregn affairs',
        'minister responsible for foreign affairs',
        'minsiter for foreign and caricom affairs',
        'minister for external affairs',
        'minister of external relations',  # <-- added
        'foreign minister',
        'minister for international affairs and cooperation',
        'minister for external relations',
        'federal minister for european and international affairs',
        'international cooperation',
        'federal minister for foreign affairs',
        'minister for foreign and caricom affairs',
        'minister of foreign affairs and cooperation',
        'minister for international relations and cooperation',
        'ministry of external relations',
        'acting minister for foreign affairs and international cooperation',
        'ministry of foreign affairs',
        'minister for foreign and political affairs',
        'federal minister for europe, integration, and foreign affairs',
        'federal minister for europe, integration and foreign affairs',
        'minister of foreign and european affaris',
        'minister of foreign affairs',
        'minister for foreign',
        'minister of foreign and european affairs and minister of immigration and asylum',
        'minister for foreign affairs and senegalese living abroad',
        'minister for foreign affairs with responsibility for brexit',
        'minister for foreign affairs and investment promotion'
       
    ]
    if any(variant in pos_lower for variant in foreign_affairs_variants):
        return "Minister for Foreign Affairs"

    # --- Fix "rime minister" typo ---
    pos = re.sub(r'(?i)\brime[- ]?minister\b', 'Prime Minister', pos)

    # Normalize different versions of Head of Government, President, Prime Minsiter and Vice-President-
    exact_matches = {
        r'(?i)^president of (the )?government$': 'Head of Government',
        r'(?i)^acting president$': 'President',
        r'(?i)^interim president$': 'President',
        r'(?i)^constitutional president$': 'President',
        r'(?i)^first executive president$': 'President',
        r'(?i)^first prime[- ]?minister$': 'Prime Minister',
        r'(?i)^head of the goverment$': 'Head of Government',  # <-- catch typo + spaces
        r'(?i)^head\s+of\s+govern?ment$': 'Head of Government',
        r'(?i)^first vice[- ]?president$': 'Vice-President'
    }
    for pattern, replacement in exact_matches.items():
        if re.fullmatch(pattern, pos):
            return replacement

    # --- Normalize prefixes ---
    pos = re.sub(r'(?i)^first vice[- ]?president\b', 'Vice-President', pos)
    pos = re.sub(r'(?i)\bprime[- ]?minister\b', 'Prime Minister', pos)
    pos = re.sub(r'(?i)\bpresident\b', 'President', pos)
    pos = re.sub(r'(?i)\bvice[- ]?president\b', 'Vice-President', pos)

    # --- Collapse primary roles if they appear at start ---
    primary_roles = [
        (r'(?i)^prime[- ]?minister\b', 'Prime Minister'),
        (r'(?i)^deputy prime[- ]?minister\b', 'Deputy Prime Minister'),
        (r'(?i)^president\b', 'President'),
        (r'(?i)^vice[- ]?president\b', 'Vice-President'),
        (r'(?i)^head of state\b', 'Head of State'),
        (r'(?i)^(crown prince|prince|king|emir|amir)\b', 'Monarch'),
        (r'(?i)^(un representative|permanent representative|delegation|chair of (the )?delegation|chair of diplomatic representative)\b', 'Diplomatic Representative')
    ]
    for pattern, replacement in primary_roles:
        if re.match(pattern, pos):
            return replacement

    # --- Monarchs ---
    if re.search(r'(?i)\b(crown prince|prince|king|emir|amir)\b', pos):
        return "Monarch"

    # --- Head of State ---
    if re.search(r'(?i)head of state', pos):
        return "Head of State"
        
    # --- Diplomatic Representatives ---
    if re.search(r'(?i)(un representative|permanent representative|delegation|chair of (the )?delegation|chair of diplomatic representative)', pos):
        return "Diplomatic Representative"

    # --- Everything else ---
    print("Unmatched position:", pos)  # print before assigning Others
    return "Others"

# Apply
df_merged["position"] = df_merged["position"].apply(normalize_position)

Unmatched position: Cairman of the Presidency
Unmatched position: Emperor
Unmatched position: Chairman
Unmatched position: Secretary of State
Unmatched position: Coordinator of the Junta of the Government


In [61]:
def merge_positions(pos):
    if pd.isna(pos):
        return pos  # keep NaN
    
    if pos in ["Prime Minister", "Deputy Prime Minister"]:
        return "(Deputy) Prime Minister"
    
    if pos in ["President", "Vice-President"]:
        return "(Vice-) President"
        
    if pos in ["Minister for Foreign Affairs", "Deputy Minister for Foreign Affairs",
        "Deputy Minister Foreign Affairs",
        "Second Minister for Foreign Affairs",
        "Second Minister for Foreign Affairs and Trade",
        "Vice Minister for Foreign Affairs"]:
        return "(Deputy) Minister for Foreign Affairs"
    
    return pos

df_merged["position"] = df_merged["position"].apply(merge_positions)

In [62]:
# Pandas so einstellen, dass es alles ausgibt
pd.set_option("display.max_rows", None)

# Alle Positionen mit Häufigkeit
position_counts = df_merged['position'].value_counts(dropna=False)

print(position_counts)

NaN                                      320
(Deputy) Minister for Foreign Affairs    155
(Vice-) President                        149
(Deputy) Prime Minister                  103
Diplomatic Representative                 20
Others                                     5
Head of State                              4
Head of Government                         3
Monarch                                    2
Name: position, dtype: int64


In [63]:
# Started to document positions properly from 1986 on, before yearly sample size per year mostly less than 20 samples

yearly_counts = df_merged.groupby('year')['position'].agg(
    total_rows='size',
    missing=lambda x: x.isna().sum()
)

# Add not_missing column
yearly_counts['not_missing'] = yearly_counts['total_rows'] - yearly_counts['missing']


# Print the entire table
pd.set_option('display.max_rows', None)  # show all rows
print(yearly_counts)
pd.reset_option('display.max_rows')

      total_rows  missing  not_missing
year                                  
1946           3        3            0
1947           3        3            0
1948           4        4            0
1949           2        2            0
1950           3        3            0
1951           3        3            0
1952           3        3            0
1953           2        2            0
1954           2        2            0
1955           4        4            0
1956           4        4            0
1957           3        3            0
1958           2        2            0
1959           7        7            0
1960           6        4            2
1961           4        4            0
1962           4        4            0
1963           7        7            0
1964           9        9            0
1965           7        7            0
1966           6        6            0
1967           7        7            0
1968           8        8            0
1969           8        7

#### New Variable: Country (Year)

This variable is later needed to create clean description plots and tables

In [65]:
df_merged = df_merged.copy()
df_merged['speech_label'] = df_merged['country_name'] + " (" + df_merged['year'].astype(str) + ")"

#### Save dataframe with all new variables as un_corpus_merged

In [67]:
os.chdir(wd)

# Save df_merged as a pickle file for quick future loading
merged_pickle_path = r".\data\un_corpus_merged.pkl"
df_merged.to_pickle(merged_pickle_path)

# Export df as CSV 
merged_output_path = r".\data\un_corpus_merged.csv"
df_merged.to_csv(merged_output_path, index=False, sep=';', encoding='utf-8')

___

## Pre-processing

#### Cleaning

In [80]:
# == Clean text by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

def cleaning(content):
    if pd.isna(content):
        return ""
    
    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces
    content = ' '.join(content.split())

    # Ensure spacing after punctuation
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" → "international")
    content = re.sub(r'-\s', '', content)

     # Replace hyphen between letters with a space to prevent merging words (e.g. "russian-and" → "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)

    # Remove stray backslashes
    content = content.replace("\\", "")

    return content

# Apply cleaning to each speech
df_merged['speech'] = df_merged['speech'].astype(str)  # Ensure column is string type
df_clean = df_merged.copy()
df_clean['speech'] = df_clean['speech'].apply(cleaning)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [81]:
# == Split cleaned data into chunks and save as separate files ==

# Convert cleaned DataFrame to list of lists
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Split cleaned data into 4 equal chunks
data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

# Change directory to the temp folder
os.chdir(data_temp)  # make sure `data_temp` exists and is defined

# Save each chunk with joblib
joblib.dump(data_id1, 'clean_speeches_indexed1.pkl')
joblib.dump(data_id2, 'clean_speeches_indexed2.pkl')
joblib.dump(data_id3, 'clean_speeches_indexed3.pkl')
joblib.dump(data_id4, 'clean_speeches_indexed4.pkl')

# Store list of cleaned data chunk paths to feed into preprocessing function later
clean_files = [
    os.path.join(data_temp, 'clean_speeches_indexed1.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed2.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed3.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed4.pkl')
]

print(f"Saved clean speeches chunks in '{data_temp}'")

Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Advanced Text Pre-Processing

#### Extend stopwords list

In [92]:
# Full path to stopwords pickle
stopwords_path = os.path.join(data_c, "stopwords.pkl")

# Load stopwords
stopwords = joblib.load(stopwords_path)
# Sort alphabetically
stopwords = sorted(stopwords)

print(f"Loaded {len(stopwords)} stopwords (sorted alphabetically)")
print(stopwords[:500])  # show first 50 for a quick check

Loaded 18997 stopwords (sorted alphabetically)
['88', 'a', 'a.', 'aandahl', 'aaron', 'aaronsburg', 'aarp', 'abac', 'abbevill', 'abbitt', 'abbot', 'abbotsford', 'abbott', 'abbottstown', 'abbyvill', 'abdnor', 'abe', 'abel', 'abercrombi', 'aberdeen', 'abern', 'abernathi', 'abernethi', 'abi', 'abiel', 'abijah', 'abilen', 'abingdon', 'abington', 'abiquiu', 'abmp', 'abner', 'abourezk', 'about', 'abov', 'abraham', 'abram', 'absalom', 'absaraka', 'absaroke', 'absecon', 'abzug', 'acadia', 'acampo', 'accid', 'accokeek', 'accomac', 'accomack', 'accord', 'accovill', 'ace', 'acevedo-vilã¡', 'acheson', 'achill', 'acker', 'ackerman', 'ackermanvill', 'acklen', 'ackley', 'ackworth', 'acm', 'acosta', 'acra', 'acton', 'acushnet', 'acworth', 'ada', 'adah', 'adair', 'adairsvill', 'adairvill', 'adak', 'adam', 'adamsburg', 'adamson', 'adamstown', 'adamsvill', 'addabbo', 'addam', 'addi', 'addievill', 'addington', 'addison', 'addonizio', 'addyston', 'adel', 'adelanto', 'adelbert', 'adelphi', 'adelphia', 'adena

In [97]:
# Get SpaCy stopwords
SPACY_STOPWORDS = list(nlp.Defaults.stop_words)
# Save stopwords
#joblib.dump(SPACY_STOPWORDS, stopwords_path)

#print(f"Saved {len(SPACY_STOPWORDS)} stopwords to {stopwords_path}")

In [141]:
# == Define function to Tokenize, eliminate digits, remove stopwords, lemmatize, POS-Tagging ==

def pro1(lista):
    # Remove punctuation
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    # Tokenize and lowercase with gensim
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
    # Remove tokens that are only digits
        a = [[row[0], [w for w in row[1] if not w.isdigit()]] for row in lista]
        return a
    
def pro4(lista):
    # Drop short words
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]


def tags(lista):
    t = [[row[0], tagger.tag(row[1])] for row in lista]  # tag each tokenlist
    t = [[row[0], [i[0] for i in row[1] if i[1].startswith(('N', 'V', 'J'))]] for row in t]
    return t


#def tags_spacy(lista):
   # texts = [' '.join(row[1]) for row in lista]
   # docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
   # result = []
   # for i, doc in enumerate(docs):
     #   filtered_tokens = [token.text for token in doc if token.tag_.startswith(('N', 'V', 'J'))]
      #  result.append([lista[i][0], filtered_tokens])
   # return result


def pro5(lista):
    return [
        [row[0], [stemmer.stem(token) for token in row[1]]]
        for row in lista
    ]
    
def pro6(lista):
    # Remove stopwords using stopword list # No source how this list was created?
    return [[row[0], [w for w in row[1] if w not in stopwords]] for row in lista]
      
########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    # Drop empty speeches
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [145]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = pro5(data)
    # Apply stemming
    data = pro6(data)
    
    data = dropnull(data)

    # out_name = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '_temp.pkl')

    # Store preprocessed corupus (before stemming) for wordcloud
    #filename_wordcloud = data_name.replace('clean_speeches_', 'wordcloud_speeches_').replace('.pkl', '.pkl')
    #out_name_wordcloud = os.path.join(data_preprocessed, os.path.basename(filename_wordcloud))
    #joblib.dump(data, out_name_wordcloud)

    filename_preprocessed = data_name.replace('clean_speeches_', 'preprocessed_speeches_').replace('.pkl', '.pkl')
    out_preprocessed = os.path.join(data_preprocessed, os.path.basename(filename_preprocessed))
    joblib.dump(data, out_preprocessed
               )
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in clean_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Before tagging: 1.64s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] After tagging: 28.52s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Done. Total time: 188.58s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] Before tagging: 1.50s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] After tagging: 27.26s
[C:\Users\

In [147]:
# Store the pre-processed data
preprocessed_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4.pkl')
]

wordcloud_files = [
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed4.pkl')
]

In [149]:
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed


In [151]:
# Initialize a set to store all unique tokens
all_unique_tokens = set()

for dataname in preprocessed_files:
    data = joblib.load(dataname)  # load list of [speech_id, tokenlist]
    for _id, tokenlist in data:
        all_unique_tokens.update(tokenlist)  # add tokens to the set

print(f"Total unique tokens across all files: {len(all_unique_tokens)}")

Total unique tokens across all files: 12767


---

## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [156]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

#def remove_rare_words(filenames, freqs, min_count=10):
   # for fname in filenames:
       # data = joblib.load(fname)
       # filtered_data = []
        #for doc_id, tokens in data:
          #  filtered_tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
          #  filtered_data.append([doc_id, filtered_tokens])
       # joblib.dump(filtered_data, fname)  # overwrite or save as new file
       # print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
word_counts = count_frequencies(preprocessed_files)

#remove_rare_words(preprocessed_files, word_counts, min_count=10)

print("\n[Stemmed] Top 50 most common words:")
for word, count in word_counts.most_common(50):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 50 least common words:")
for word, count in word_counts.most_common()[-50:]:
    print(f"{word}: {count}")

# Save stemmed word counts
save_path = os.path.join(data_freq, 'word_counts.pkl')
joblib.dump(word_counts, save_path)

# === Count for wordcloud (unstemmed) speeches ===
#word_counts_wordcloud = count_frequencies(wordcloud_files)

#print("\n[Wordcloud] Top 50 most common words:")
#for word, count in word_counts_wordcloud.most_common(50):
    #print(f"{word}: {count}")

#print("\n[Wordcloud] Top 50 least common words:")
#for word, count in word_counts_wordcloud.most_common()[-50:]:
    #print(f"{word}: {count}")

# Save unstemmed word counts
#save_path_wordcloud = os.path.join(data_freq, 'word_counts_wordcloud.pkl')
#joblib.dump(word_counts_wordcloud, save_path_wordcloud)


100%|██████████| 4/4 [00:02<00:00,  1.61it/s]


[Stemmed] Top 50 most common words:
nation: 16625
countri: 12472
intern: 11073
develop: 9680
world: 9342
peopl: 8994
secur: 5873
general: 5420
govern: 5145
econom: 4935
year: 4769
right: 4709
assembl: 4252
problem: 4106
support: 3929
human: 3874
continu: 3756
polit: 3501
region: 3425
communiti: 3423
time: 3322
need: 2939
import: 2884
make: 2675
achiev: 2665
situat: 2614
conflict: 2566
resolut: 2558
principl: 2556
global: 2474
presid: 2470
relat: 2419
take: 2414
great: 2411
africa: 2368
solut: 2364
oper: 2335
nuclear: 2333
concern: 2306
order: 2304
action: 2293
made: 2267
confer: 2217
establish: 2207
commit: 2140
polici: 2115
part: 2102
respect: 2088
chang: 2088
interest: 2086

[Stemmed] Top 50 least common words:
keita: 1
roadmap: 1
politick: 1
popularis: 1
haemorrhag: 1
elysé: 1
depositor: 1
shortsel: 1
disclosur: 1
drawbridg: 1
rebuf: 1
refram: 1
macro: 1
midrand: 1
kwasniewski: 1
triad: 1
allur: 1
indigest: 1
lunchtim: 1
peror: 1
wafer: 1
oaij: 1
prima: 1
nwaliau: 1
yerevan: 1
bioe




['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\word_counts.pkl']

In [157]:
# Filter words with frequency >= 10
min_count = 10
frequent_words = {word: count for word, count in word_counts.items() if count >= min_count}

# Number of unique words appearing at least 10 times
num_unique_frequent_words = len(frequent_words)
print(f"Number of unique words with frequency >= {min_count}: {num_unique_frequent_words}")


Number of unique words with frequency >= 10: 4293


In [158]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [160]:
# == Count dictionary words

# Load dictionaries            ##### How did they come up with this dictionary? Why did they exclude words?

affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

print("Contents of affect dictionary:")
print(affect)
print("Number of words in affect dictionary:", len(affect))

print("\nContents of cognition dictionary:")
print(cognition)
print("Number of words in cognition dictionary:", len(cognition))

#with open(affect_path, 'rb') as f:
  #  affect_dict = pickle.load(f)
#print("Contents of affect dictionary:")
#print(affect_dict)
#print("Number of words in affect dictionary:", len(affect_dict))

#with open(cognition_path, 'rb') as f:
  #  cognition_dict = pickle.load(f)
#print("Contents of cognition dictionary:")
#print(cognition_dict)
#print("Number of words in cognition dictionary:", len(cognition_dict))

a = [[i, word_counts[i]] for i in affect if i in word_counts]
c = [[i, word_counts[i]] for i in cognition if i in word_counts]

a = sorted(a, key=lambda x: x[1], reverse=True)
c = sorted(c, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a]
c = [[i[0], f"({i[1]}),"] for i in c]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

os.makedirs(data_freq, exist_ok=True)  # ensure directory exists

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)


# == Calculate weighted frequencies for all words

# - downweights very common words by giving more importance to rare ones
#word_counts = joblib.load(os.path.join(data_freq, 'word_counts.pkl'))

l = sum(word_counts.values())

a = 0.001
word_counts_weighted = {k: a / (a + (v / l)) for k, v in word_counts.items()}

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

################################################################################ ISSUE##################
# To print top 100 by weighted values, sort the dictionary by value descending:
top_100_weighted = sorted(word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:100]

print("Top 100 words by weighted frequency:")
for word, weight in top_100_weighted:
    print(f"{word}: {weight}")

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

---

## Final Cleaning

In [163]:
os.chdir(data_c)
os.chdir(data_freq)

count = joblib.load('word_counts.pkl')  # load stemmed counts

# For each speech only keep tokens that appear at least 10x

def select(lista):
    for i in range(len(lista)):
        x = lista[i][0]
        y = lista[i][1]
        y = [w for w in y if count.get(w, 0) >= 10]
        lista[i] = [x, y]
    return lista

for data_path in preprocessed_files:
    data = joblib.load(data_path)
    data = select(data)
    cleaned_path = data_path.replace('.pkl', '_final.pkl')
    joblib.dump(data, cleaned_path)

# Initialize a set to store all unique tokens
all_unique_tokens = set()

for dataname in preprocessed_files:
    data = joblib.load(dataname)  # load list of [speech_id, tokenlist]
    for _id, tokenlist in data:
        all_unique_tokens.update(tokenlist)  # add tokens to the set

print(f"Total unique tokens across all files: {len(all_unique_tokens)}")

Total unique tokens across all files: 12767


In [168]:
import os
import joblib

os.chdir(data_preprocessed)

final_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3_final.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4_final.pkl')
]

# Initialize a set to store all unique tokens
all_unique_tokens = set()
total_entries = 0

for dataname in final_files:
    data = joblib.load(dataname)
    total_entries += len(data)
    for _, tokenlist in data:
        all_unique_tokens.update(tokenlist)

print(f"Total unique tokens across all files: {len(all_unique_tokens)}")
print(f"Total number of speeches processed: {total_entries}")


Total unique tokens across all files: 4293
Total number of speeches processed: 761
