# Emotion and Reason in Political Language: Examining the UN General speeches
## Script 1: Preprocessing & Token Frequencies
## Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure all required packages are installed. If not, set `InstallPackages = TRUE` (see code cells below).  
- Set your working directory appropriately.  
- The script will automatically create the required folder structure.  
- Later, you will be asked to download the data from:  
  https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/0TJX8Y  
  and store it **unzipped** inside the created folder *data_original*


### Description: 
- Extract documents from their original txt documents and store them as one csv
- Data Cleaning and Pre-Processing
- Count word frequencies and weight themh

___

## Setup, Installation and Verification of required Packages and Libraries

In [69]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys

    packages = [
        "pandas",
        "nltk",
        "spacy",
        "numpy",
        "gensim",
        "pycountry",
        "wordcloud",
        "matplotlib",
        "tqdm"
    ]

    for package in packages:
        !{sys.executable} -m pip install {package}


DownloadAdditions = False # Set this to True to download these additional resources
if DownloadAdditions:
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)

#########################
# Check if all packages are included
##########################

In [71]:
# == Import standard and third-party libraries for data processing, NLP, and visualization ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path

# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
stemmer = SnowballStemmer("english")

In [72]:
# === Set Working Directory and create folder structure ===

# Prompt user to enter working directory path
#wd = input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip()

# Change to the entered working directory
#try:
   # os.chdir(wd)
    #print(f"Working directory set to: {os.getcwd()}")
#except FileNotFoundError:
   # print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    #exit(1)

# Set your working directory (adjust this as needed)
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
os.chdir(wd)

# Set base path to current working directory
base_path = Path.cwd()
data_path = base_path / "data"

# List of subfolders to create inside 'data'
subfolders = ["data_original", "dictionaries", "freq", "preprocessed", "temp", "tokenized"]

# Create 'data' folder if it doesn't exist
data_path.mkdir(exist_ok=True)

# Create subfolders
for name in subfolders:
    (data_path / name).mkdir(exist_ok=True)

print("\nFolder structure created:")
print(f"- {data_path}")
for name in subfolders:
    print(f"  - {name}")

# Prompt user to place raw data files
#print(f"\nPlease place your raw data files (unzipped) into the folder:\n  {data_path / 'data_original'}")
#input("Press Enter after you have placed the files to continue...")

#print("Continuing with the script...")



Folder structure created:
- C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data
  - data_original
  - dictionaries
  - freq
  - preprocessed
  - temp
  - tokenized


In [75]:
# === Define Folder Paths ===

# If an error occurs, make sure that you actually have these folders in your working directory
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')
data_dict = os.path.join(data_c, 'dictionaries')
data_preprocessed = os.path.join(data_c, 'preprocessed')
data_tokenized = os.path.join(data_c, 'tokenized')
fig = os.path.join(wd, 'fig')

In [77]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


___

## Load and Prepare Corpus

### This chunk can be skipped at the moment
### Think of proper header

In [82]:
# == Load Sample from UN General Debate Corpus ==

# Set Folder path containing the original TXT files    
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

# Collect txt-files
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

print(f"Total speeches found: {len(all_txt_files)}")

# Randomly pick 800 files from the full collection   ###################################################### REMOVE AT LATER POINT
sampled_files = random.sample(all_txt_files,1000)

# Read the selected files into a list
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

df_raw = pd.DataFrame(raw_data)

# == Store as csv & pkl ==

df_raw = df_raw[df_raw['filename'] != '.DS_Store-to-UTF-8.txt'].copy()

raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")


Total speeches found: 10761

 Saved raw data with 1000 speeches to '.\data\un_corpus_raw.csv'


In [83]:
# == Load data & drop empty speeches ==

df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# Drop empty speeches
df_raw['speech'] = df_raw['speech'].astype(str)
df_raw = df_raw[df_raw['speech'].str.strip() != ''].copy()

df_raw.head()         


Unnamed: 0,filename,speech
0,SYR_04_1949.txt,Fayez EL-KHOURI Bey explained that although hi...
1,PRK_57_2002.txt,"﻿I would like to\ncongratulate you, Sir, on yo..."
2,LVA_76_2021.txt,Let me begin by congratulating the Minister fo...
3,RWA_29_1974.txt,"Mr. President, your unanimous election to the..."
4,BFA_26_1971.txt,"56.\t Mr. President, allow me to congratulate ..."


___

## Create new variables

#### New Variables: Year, Country Code and Country Name

In [87]:
# == Create variable: country code & year

# Create contry_code and year variable
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

print("Min year:", df_raw['year'].min())
print("Max year:", df_raw['year'].max())
# Speeches range from 1946 to 2023

# == Create variable: country_name by matching ISO country code 
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and legacy codes
custom_names = {
    "BOL": "Bolivia",
    "COD": "The Democratic Republic of the Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YMD": "South Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
}

code_to_name.update(custom_names)
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Check missing mappings
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

# == Check structure ==

df_raw.head() 

Min year: 1946
Max year: 2023
Missing codes: []


Unnamed: 0,filename,speech,country_code,year,country_name
0,SYR_04_1949.txt,Fayez EL-KHOURI Bey explained that although hi...,SYR,1949,Syria
1,PRK_57_2002.txt,"﻿I would like to\ncongratulate you, Sir, on yo...",PRK,2002,North Korea
2,LVA_76_2021.txt,Let me begin by congratulating the Minister fo...,LVA,2021,Latvia
3,RWA_29_1974.txt,"Mr. President, your unanimous election to the...",RWA,1974,Rwanda
4,BFA_26_1971.txt,"56.\t Mr. President, allow me to congratulate ...",BFA,1971,Burkina Faso


In [88]:
# == Check country names 

pd.set_option('display.max_rows', None)
print(df_raw[['country_code', 'country_name']].drop_duplicates().sort_values('country_code').reset_index(drop=True))
# Reset to default afterward
pd.reset_option('display.max_rows')

    country_code                          country_name
0            AFG                           Afghanistan
1            AGO                                Angola
2            ALB                               Albania
3            AND                               Andorra
4            ARE                  United Arab Emirates
5            ARG                             Argentina
6            ARM                               Armenia
7            ATG                   Antigua and Barbuda
8            AUS                             Australia
9            AUT                               Austria
10           AZE                            Azerbaijan
11           BDI                               Burundi
12           BEL                               Belgium
13           BEN                                 Benin
14           BFA                          Burkina Faso
15           BGD                            Bangladesh
16           BGR                              Bulgaria
17        

#### New Variable: Length of speeches

In [90]:
# Add a new column: speech length in words
df_raw['speech_length_words'] = df_raw['speech'].apply(lambda x: len(str(x).split()))

# Calculate average length
avg_length = df_raw['speech_length_words'].mean()
print("Average speech length (words):", round(avg_length, 2))

# 20 shortest & longest speeches
print("20 shortest speeches:")
print(df_raw.nsmallest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

print("\n20 longest speeches:")
print(df_raw.nlargest(20, 'speech_length_words')[['filename', 'country_name', 'year', 'speech_length_words']])

Average speech length (words): 2943.8
20 shortest speeches:
            filename           country_name  year  speech_length_words
592  RWA_70_2015.txt                 Rwanda  2015                  539
151  SAU_01_1946.txt           Saudi Arabia  1946                  555
695  BRN_65_2010.txt      Brunei Darussalam  2010                  639
232  RWA_71_2016.txt                 Rwanda  2016                  682
525  CZE_70_2015.txt                Czechia  2015                  694
438  BRN_53_1998.txt      Brunei Darussalam  1998                  735
179  STP_36_1981.txt  Sao Tome and Principe  1981                  796
960  BTN_58_2003.txt                 Bhutan  2003                  821
135  UKR_69_2014.txt                Ukraine  2014                  834
710  ERI_70_2015.txt                Eritrea  2015                  835
871  LBR_04_1949.txt                Liberia  1949                  885
714  RWA_68_2013.txt                 Rwanda  2013                  913
132  ITA_73_2018.

#### New variable: English as Official Language

In [92]:
# Source for english as official language : https://gradschool.utk.edu/future-students/office-of-graduate-admissions/applying-to-graduate-school/admission-requirements/testing-requirements/countries-with-english-as-official-language/
# They are quoting: https://www.cia.gov/the-world-factbook/field/languages/

english_countries = [
    "Anguilla", "Antigua and Barbuda", "Bahamas", "Barbados", "Belize", "Belgium",
    "Bermuda", "Botswana", "British Virgin Islands", "Burundi", "Cameroon", "Canada",
    "Cayman Islands", "Christmas Island", "Cook Islands", "Dominica", "Fiji", "Gambia",
    "Ghana", "Grenada", "Guyana", "Hong Kong", "India", "Ireland", "Jersey", "Kenya",
    "Liberia", "Malawi", "Malta", "Marshall Islands", "Micronesia",
    "Namibia", "New Zealand", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
    "Pakistan", "Palau", "Papua New Guinea", "Philippines", "Pitcairn Islands", "Rwanda",
    "Saint Kitts and Nevis", "Saint Lucia", "Samoa", "Seychelles", "Sierra Leone", "Singapore",
    "Sint Maarten", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Sudan",
    "Eswatini", "Tanzania", "Tonga", "Trinidad and Tobago", "Turks and Caicos Islands",
    "Tuvalu", "Uganda", "Zambia", "Zimbabwe"
]

# Create dummy column
df_raw['english_official_language'] = df_raw['country_name'].apply(
    lambda x: 1 if x in english_countries else 0
)

# Detect unmatched countries 
matched = set(df_raw['country_name'])
unmatched = [country for country in english_countries if country not in matched]

print("Countries not matched in df_raw['country_name']:")
for country in unmatched:
    print(country)

# All of these countries are either British Overseas Territories, Australian Territories, self-governing island territories or Special Administrative Regions
    # None of the unmatched regions are UN Members

# Check df with new variable english_official_language
df_raw.head()


Countries not matched in df_raw['country_name']:
Anguilla
Bermuda
British Virgin Islands
Cayman Islands
Christmas Island
Cook Islands
Hong Kong
Jersey
Niue
Norfolk Island
Northern Mariana Islands
Pitcairn Islands
Sint Maarten
Turks and Caicos Islands


Unnamed: 0,filename,speech,country_code,year,country_name,speech_length_words,english_official_language
0,SYR_04_1949.txt,Fayez EL-KHOURI Bey explained that although hi...,SYR,1949,Syria,2930,0
1,PRK_57_2002.txt,"﻿I would like to\ncongratulate you, Sir, on yo...",PRK,2002,North Korea,2011,0
2,LVA_76_2021.txt,Let me begin by congratulating the Minister fo...,LVA,2021,Latvia,1693,0
3,RWA_29_1974.txt,"Mr. President, your unanimous election to the...",RWA,1974,Rwanda,4963,1
4,BFA_26_1971.txt,"56.\t Mr. President, allow me to congratulate ...",BFA,1971,Burkina Faso,4537,0


#### New variable: Permanent member security council

In [94]:
# Define permanent members of the UN Security Council and create dummy
permanent_members = ['RUS', 'USA', 'FRA', 'GBR', 'CHN']

df_raw['security_council_permanent'] = df_raw['country_code'].isin(permanent_members).astype(int)

print(df_raw[df_raw['country_code'].isin(permanent_members)][
    ['country_code', 'country_name', 'security_council_permanent', 'year']
])

    country_code    country_name  security_council_permanent  year
32           GBR  United Kingdom                           1  1998
45           FRA          France                           1  2010
95           USA   United States                           1  1947
101          GBR  United Kingdom                           1  2009
122          GBR  United Kingdom                           1  2018
216          GBR  United Kingdom                           1  2003
261          CHN           China                           1  1947
275          FRA          France                           1  1982
286          RUS          Russia                           1  1952
356          USA   United States                           1  2011
376          RUS          Russia                           1  2006
384          FRA          France                           1  1994
448          FRA          France                           1  1986
472          CHN           China                           1  

#### New variables: Speaker, Position & Gender

In [96]:
# Supplmentary xlsx-file from the UN Dataset provides information on the speaker and their position

# == Create variable speaker_name and position ==
df_speakers = pd.read_excel(os.path.join(data_c, "data_original", "UN General Debate Corpus", "Speakers_by_session.xlsx"))

df_speakers.head()

# Merge new infrormation to dataframe
df_merged = df_raw.merge(
    df_speakers[['Year', 'ISO Code', 'Name of Person Speaking', 'Post']],
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    how='left',
    indicator=True)

# Detect unmatched rows
unmatched = df_merged[df_merged['_merge'] == 'left_only']
unmatched_count = (df_merged['_merge'] == 'left_only').sum()

print(unmatched[['filename', 'year', 'country_code', 'country_name']])
print(f"{unmatched_count} rows could not be matched")

# Clean up 
df_merged = df_merged.drop(columns=['Year', 'ISO Code', '_merge']).rename(columns={
    'Name of Person Speaking': 'speaker_name',
    'Post': 'position'
})

# == Create gender dummy ==
df_merged['gender_dummy'] = df_merged['speaker_name'].apply(
    lambda name: 0 if pd.notnull(name) and re.search(r'^(?:Mr|Sir)\b', name, re.IGNORECASE)
    else 1 if pd.notnull(name) and re.search(r'^(?:Mrs|Ms)\b', name, re.IGNORECASE)
    else None
)

# Count all values including NaN
counts = df_merged['gender_dummy'].value_counts(dropna=False)

# Build summary using .get() to handle missing keys
gender_summary = pd.DataFrame({
    'gender_dummy': ['0 (male)', '1 (female)', 'NaN (unknown)'],
    'count': [
        counts.get(0, 0),
        counts.get(1, 0),
        counts.get(np.nan, 0)
    ]
})

print(gender_summary)

# == Adjust position variable



            filename  year country_code  country_name
66   YMD_30_1975.txt  1975          YMD   South Yemen
164  YMD_24_1969.txt  1969          YMD   South Yemen
297  GAB_18_1963.txt  1963          GAB         Gabon
307  YMD_43_1988.txt  1988          YMD   South Yemen
384  ZAF_39_1984.txt  1984          ZAF  South Africa
389  YMD_29_1974.txt  1974          YMD   South Yemen
470  MDG_18_1963.txt  1963          MDG    Madagascar
855  YMD_40_1985.txt  1985          YMD   South Yemen
943  YMD_34_1979.txt  1979          YMD   South Yemen
9 rows could not be matched
    gender_dummy  count
0       0 (male)    415
1     1 (female)     16
2  NaN (unknown)    574


Looking at the structure, highest position always seems to be mentioned first --> drop everything else if speaker has more than one position

In [98]:
def normalize_position(pos):
    if pd.isna(pos):
        return pos

    pos = pos.strip()

    # --- Fix common typos and extra spaces ---
    pos = re.sub(r'\s+', ' ', pos)  # collapse multiple spaces
    pos_lower = pos.lower()

     # Turn all ministers that deal with foreign affairs and international relations to "Minister for Foreign Affairs
    foreign_affairs_variants = [
        'minister for foregn affairs',
        'minister responsible for foreign affairs',
        'minsiter for foreign and caricom affairs',
        'minister for external affairs',
        'minister of external relations',  # <-- added
        'foreign minister',
        'minister for international affairs and cooperation',
        'minister for external relations',
        'federal minister for european and international affairs',
        'international cooperation',
        'federal minister for foreign affairs',
        'minister for foreign and caricom affairs',
        'minister of foreign affairs and cooperation',
        'minister for international relations and cooperation',
        'ministry of external relations',
        'acting minister for foreign affairs and international cooperation',
        'ministry of foreign affairs',
        'minister for foreign and political affairs',
        'federal minister for europe, integration, and foreign affairs',
        'federal minister for europe, integration and foreign affairs',
        'minister of foreign and european affaris',
        'minister of foreign affairs',
        'minister for foreign',
        'minister of foreign and european affairs and minister of immigration and asylum',
        'minister for foreign affairs and senegalese living abroad',
        'minister for foreign affairs with responsibility for brexit',
        'minister for foreign affairs and investment promotion'
       
    ]
    if any(variant in pos_lower for variant in foreign_affairs_variants):
        return "Minister for Foreign Affairs"

    # --- Fix "rime minister" typo ---
    pos = re.sub(r'(?i)\brime[- ]?minister\b', 'Prime Minister', pos)

    # Normalize different versions of Head of Government, President, Prime Minsiter and Vice-President-
    exact_matches = {
        r'(?i)^president of (the )?government$': 'Head of Government',
        r'(?i)^acting president$': 'President',
        r'(?i)^interim president$': 'President',
        r'(?i)^constitutional president$': 'President',
        r'(?i)^first executive president$': 'President',
        r'(?i)^first prime[- ]?minister$': 'Prime Minister',
        r'(?i)^head of the goverment$': 'Head of Government',  # <-- catch typo + spaces
        r'(?i)^head\s+of\s+govern?ment$': 'Head of Government',
        r'(?i)^first vice[- ]?president$': 'Vice-President'
    }
    for pattern, replacement in exact_matches.items():
        if re.fullmatch(pattern, pos):
            return replacement

    # --- Normalize prefixes ---
    pos = re.sub(r'(?i)^first vice[- ]?president\b', 'Vice-President', pos)
    pos = re.sub(r'(?i)\bprime[- ]?minister\b', 'Prime Minister', pos)
    pos = re.sub(r'(?i)\bpresident\b', 'President', pos)
    pos = re.sub(r'(?i)\bvice[- ]?president\b', 'Vice-President', pos)

    # --- Collapse primary roles if they appear at start ---
    primary_roles = [
        (r'(?i)^prime[- ]?minister\b', 'Prime Minister'),
        (r'(?i)^deputy prime[- ]?minister\b', 'Deputy Prime Minister'),
        (r'(?i)^president\b', 'President'),
        (r'(?i)^vice[- ]?president\b', 'Vice-President'),
        (r'(?i)^head of state\b', 'Head of State'),
        (r'(?i)^(crown prince|prince|king|emir|amir)\b', 'Monarch'),
        (r'(?i)^(un representative|permanent representative|delegation|chair of (the )?delegation|chair of diplomatic representative)\b', 'Diplomatic Representative')
    ]
    for pattern, replacement in primary_roles:
        if re.match(pattern, pos):
            return replacement

    # --- Monarchs ---
    if re.search(r'(?i)\b(crown prince|prince|king|emir|amir)\b', pos):
        return "Monarch"

    # --- Head of State ---
    if re.search(r'(?i)head of state', pos):
        return "Head of State"
        
    # --- Diplomatic Representatives ---
    if re.search(r'(?i)(un representative|permanent representative|delegation|chair of (the )?delegation|chair of diplomatic representative)', pos):
        return "Diplomatic Representative"

    # --- Everything else ---
    print("Unmatched position:", pos)  # print before assigning Others
    return "Others"

# Apply
df_merged["position"] = df_merged["position"].apply(normalize_position)

Unmatched position: Chief Executive Officer
Unmatched position: Chairman of the Council of Ministers
Unmatched position: Chairman of the Presidential Council
Unmatched position: Minister of State
Unmatched position: Emperor
Unmatched position: Member of the Presidency
Unmatched position: Chairman


In [99]:
def merge_positions(pos):
    if pd.isna(pos):
        return pos  # keep NaN
    
    if pos in ["Prime Minister", "Deputy Prime Minister"]:
        return "(Deputy) Prime Minister"
    
    if pos in ["President", "Vice-President"]:
        return "(Vice-) President"
        
    if pos in ["Minister for Foreign Affairs", "Deputy Minister for Foreign Affairs",
        "Deputy Minister Foreign Affairs",
        "Second Minister for Foreign Affairs",
        "Second Minister for Foreign Affairs and Trade",
        "Vice Minister for Foreign Affairs"]:
        return "(Deputy) Minister for Foreign Affairs"
    
    return pos

df_merged["position"] = df_merged["position"].apply(merge_positions)

In [100]:
# Pandas so einstellen, dass es alles ausgibt
pd.set_option("display.max_rows", None)

# Alle Positionen mit Häufigkeit
position_counts = df_merged['position'].value_counts(dropna=False)

print(position_counts)

NaN                                      420
(Deputy) Minister for Foreign Affairs    221
(Vice-) President                        178
(Deputy) Prime Minister                  128
Diplomatic Representative                 42
Others                                     7
Head of State                              5
Monarch                                    2
Head of Government                         2
Name: position, dtype: int64


In [101]:
# Started to document positions properly from 1986 on, before yearly sample size per year mostly less than 20 samples
# 
yearly_counts = df_merged.groupby('year')['position'].agg(
    total_rows='size',
    missing=lambda x: x.isna().sum()
)

# Add not_missing column
yearly_counts['not_missing'] = yearly_counts['total_rows'] - yearly_counts['missing']


# Print the entire table
pd.set_option('display.max_rows', None)  # show all rows
print(yearly_counts)
pd.reset_option('display.max_rows')

      total_rows  missing  not_missing
year                                  
1946           2        2            0
1947           4        4            0
1948           2        2            0
1949           6        6            0
1950           3        3            0
1951           6        6            0
1952           2        2            0
1953           3        3            0
1954           5        5            0
1955           4        4            0
1956           8        8            0
1957           6        6            0
1958           9        9            0
1959          11       10            1
1960          11        7            4
1961           4        4            0
1962           3        3            0
1963           5        4            1
1964           8        8            0
1965          12       12            0
1966          14       14            0
1967           7        7            0
1968           5        5            0
1969          11       10

#### New Variable: Country (Year)

This variable is later needed to create clean description plots and tables

In [103]:
df_merged = df_merged.copy()
df_merged['speech_label'] = df_merged['country_name'] + " (" + df_merged['year'].astype(str) + ")"

#### Save dataframe with all new variables as ?????

In [105]:
os.chdir(wd)

# Save df_merged as a pickle file for quick future loading
merged_pickle_path = r".\data\un_corpus_merged.pkl"
df_merged.to_pickle(merged_pickle_path)

# Export df as CSV 
merged_output_path = r".\data\un_corpus_merged.csv"
df_merged.to_csv(merged_output_path, index=False, sep=';', encoding='utf-8')

___

## Pre-processing

#### Cleaning

In [109]:
# == Clean text by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

# Define cleaning function
def clean_text(content):
    if pd.isna(content):
        return ""
    
    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces
    content = ' '.join(content.split())

    # Ensure spacing after punctuation
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" → "international")
    content = re.sub(r'-\s', '', content)

     # Replace hyphen between letters with a space to prevent merging words (e.g. "russian-and" → "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)

    # Remove stray backslashes
    content = content.replace("\\", "")

    return content

# Apply cleaning to each speech
df_merged['speech'] = df_merged['speech'].astype(str)  # Ensure column is string type
df_clean = df_merged.copy()
df_clean['speech'] = df_clean['speech'].apply(clean_text)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [110]:
# == Split cleaned data into chunks and save as separate files ==

# Convert cleaned DataFrame to list of lists
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Split cleaned data into 4 equal chunks
data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

# Change directory to the temp folder
os.chdir(data_temp)  # make sure `data_temp` exists and is defined

# Save each chunk with joblib
joblib.dump(data_id1, 'clean_speeches_indexed1.pkl')
joblib.dump(data_id2, 'clean_speeches_indexed2.pkl')
joblib.dump(data_id3, 'clean_speeches_indexed3.pkl')
joblib.dump(data_id4, 'clean_speeches_indexed4.pkl')

# Store list of cleaned data chunk paths to feed into preprocessing function later
clean_files = [
    os.path.join(data_temp, 'clean_speeches_indexed1.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed2.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed3.pkl'),
    os.path.join(data_temp, 'clean_speeches_indexed4.pkl')
]

print(f"✅ Saved clean speeches chunks in '{data_temp}'")

✅ Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Advanced Text Pre-Processing

#### Extend stopwords list

In [113]:
# Get SpaCy stopwords
SPACY_STOPWORDS = list(nlp.Defaults.stop_words)

# Path to save
stopwords_path = os.path.join(data_c, "stopwords.pkl")

# Save stopwords
joblib.dump(SPACY_STOPWORDS, stopwords_path)

print(f"Saved {len(SPACY_STOPWORDS)} stopwords to {stopwords_path}")

Saved 326 stopwords to C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\stopwords.pkl


In [114]:
# == Define function to Tokenize, eliminate digits, remove stopwords, lemmatize, POS-Tagging ==

def pro1(lista):
    # Remove punctuation
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    # Tokenize and lowercase with gensim
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
    # Remove tokens that are only digits
        a = [[row[0], [w for w in row[1] if not w.isdigit()]] for row in lista]
        return a
    
def pro4(lista):
    # Drop short words
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]

def tags_spacy(lista):
    texts = [' '.join(row[1]) for row in lista]
    docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
    result = []
    for i, doc in enumerate(docs):
        filtered_tokens = [token.text for token in doc if token.tag_.startswith(('N', 'V', 'J'))]
        result.append([lista[i][0], filtered_tokens])
    return result


def pro5(lista):
    # Remove stopwords using SpaCy stopword list
    return [[row[0], [w for w in row[1] if w not in SPACY_STOPWORDS]] for row in lista]

def pro6(lista):
      return [
        [row[0], [stemmer.stem(token) for token in row[1]]]
        for row in lista
    ]
########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    # Drop empty speeches
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [115]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags_spacy(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = pro5(data)
   # data = pro6(data)
    data = dropnull(data)

    # out_name = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '_temp.pkl')

    # Store preprocessed corupus (before stemming) for wordcloud
    filename_wordcloud = data_name.replace('clean_speeches_', 'wordcloud_speeches_').replace('.pkl', '.pkl')
    out_name_wordcloud = os.path.join(data_preprocessed, os.path.basename(filename_wordcloud))
    joblib.dump(data, out_name_wordcloud)

    # Apply stemming
    data_stemmed = pro6(data)

    filename_preprocessed = data_name.replace('clean_speeches_', 'preprocessed_speeches_').replace('.pkl', '.pkl')
    out_preprocessed = os.path.join(data_preprocessed, os.path.basename(filename_preprocessed))
    joblib.dump(data_stemmed, out_preprocessed
               )
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in clean_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Before tagging: 1.71s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] After tagging: 71.08s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl] Done. Total time: 82.56s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] Before tagging: 2.01s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl] After tagging: 70.45s
[C:\Users\s

In [116]:
# Store the pre-processed data
preprocessed_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4.pkl')
]

wordcloud_files = [
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed4.pkl')
]

In [117]:
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


---

## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [142]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

def remove_rare_words(filenames, freqs, min_count=10):
    for fname in filenames:
        data = joblib.load(fname)
        filtered_data = []
        for doc_id, tokens in data:
            filtered_tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
            filtered_data.append([doc_id, filtered_tokens])
        joblib.dump(filtered_data, fname)  # overwrite or save as new file
        print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
word_counts_stemmed = count_frequencies(preprocessed_files)

remove_rare_words(preprocessed_files, word_counts_stemmed, min_count=10)

print("\n[Stemmed] Top 50 most common words:")
for word, count in word_counts_stemmed.most_common(50):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 50 least common words:")
for word, count in word_counts_stemmed.most_common()[-50:]:
    print(f"{word}: {count}")

# Save stemmed word counts
save_path_stemmed = os.path.join(data_freq, 'word_counts_stemmed.pkl')
joblib.dump(word_counts_stemmed, save_path_stemmed)

# === Count for wordcloud (unstemmed) speeches ===
word_counts_wordcloud = count_frequencies(wordcloud_files)

print("\n[Wordcloud] Top 50 most common words:")
for word, count in word_counts_wordcloud.most_common(50):
    print(f"{word}: {count}")

print("\n[Wordcloud] Top 50 least common words:")
for word, count in word_counts_wordcloud.most_common()[-50:]:
    print(f"{word}: {count}")

# Save unstemmed word counts
save_path_wordcloud = os.path.join(data_freq, 'word_counts_wordcloud.pkl')
joblib.dump(word_counts_wordcloud, save_path_wordcloud)


100%|██████████| 4/4 [00:03<00:00,  1.06it/s]


Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed2.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed3.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed4.pkl: removed words with freq < 10

[Stemmed] Top 50 most common words:
nation: 21947
unit: 18917
countri: 16487
intern: 15244
develop: 13528
state: 12937
peac: 12471
world: 12267
peopl: 12013
secur: 8200
general: 7574
govern: 7386
econom: 6870
organ: 6551
year: 6163
right: 6139
assembl: 5964
new: 5573
effort: 5522
problem: 5257
support: 5198
human: 5178
continu: 5149
communiti: 4752
region: 4503
time: 4475
polit: 4447
africa: 4264
member: 4178
session: 4094
council: 406

100%|██████████| 4/4 [00:03<00:00,  1.10it/s]



[Wordcloud] Top 50 most common words:
united: 18635
nations: 17662
international: 14599
world: 12040
countries: 11120
peace: 10283
states: 9261
development: 7906
security: 7687
people: 7417
general: 7375
economic: 6835
assembly: 5817
new: 5573
government: 5550
country: 5171
organization: 4897
peoples: 4592
efforts: 4570
human: 4540
community: 4382
political: 4230
africa: 4094
rights: 4011
council: 3973
session: 3905
support: 3862
time: 3674
war: 3426
republic: 3385
south: 3286
state: 3269
years: 3144
problems: 3132
great: 3091
national: 3067
situation: 3036
nuclear: 3025
order: 3020
year: 3010
developing: 2909
global: 2863
work: 2860
social: 2809
president: 2751
conference: 2680
hope: 2636
african: 2620
charter: 2589
important: 2559

[Wordcloud] Top 50 least common words:
savages: 1
godmothers: 1
raiding: 1
implication: 1
vacated: 1
mischievous: 1
brewer: 1
tickling: 1
bangs: 1
desk: 1
rung: 1
stab: 1
warily: 1
envier: 1
gamblers: 1
hraoui: 1
goulide: 1
aptidon: 1
realises: 1
mujahidi

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\word_counts_wordcloud.pkl']

In [143]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [145]:
# == Count dictionary words

# Load dictionaries            ##### How did they come up with this dictionary? Why did they exclude words?

affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

with open(affect_path, 'rb') as f:
    affect_dict = pickle.load(f)
print("Contents of affect dictionary:")
print(affect_dict)
print("Number of words in affect dictionary:", len(affect_dict))

with open(cognition_path, 'rb') as f:
    cognition_dict = pickle.load(f)
print("Contents of cognition dictionary:")
print(cognition_dict)
print("Number of words in cognition dictionary:", len(cognition_dict))

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

a = [[i, word_counts_stemmed[i]] for i in affect if i in word_counts_stemmed]
c = [[i, word_counts_stemmed[i]] for i in cognition if i in word_counts_stemmed]

a = sorted(a, key=lambda x: x[1], reverse=True)
c = sorted(c, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a]
c = [[i[0], f"({i[1]}),"] for i in c]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

os.makedirs(data_freq, exist_ok=True)  # ensure directory exists

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

In [146]:
# == Calculate weighted frequencies for all words

# - downweights very common words by giving more importance to rare ones
word_counts_stemmed = joblib.load(os.path.join(data_freq, 'word_counts_stemmed.pkl'))

l = sum(word_counts_stemmed.values())

a = 0.001
word_counts_weighted = {k: a / (a + (v / l)) for k, v in word_counts_stemmed.items()}
#for key in word_counts.keys():
 #   word_counts[key] = a / (a + (word_counts[key] / l))

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

################################################################################ ISSUE##################
# To print top 100 by weighted values, sort the dictionary by value descending:
top_100_weighted = sorted(word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:100]

print("Top 100 words by weighted frequency:")
for word, weight in top_100_weighted:
    print(f"{word}: {weight}")


Top 100 words by weighted frequency:
fari: 0.999263471113337
napoleon: 0.999263471113337
kaiser: 0.999263471113337
koryo: 0.999263471113337
dètent: 0.999263471113337
junichiro: 0.999263471113337
riga: 0.999263471113337
techgirl: 0.999263471113337
tigray: 0.999263471113337
stumblingblock: 0.999263471113337
uncloud: 0.999263471113337
unexploit: 0.999263471113337
theoretician: 0.999263471113337
fecund: 0.999263471113337
durafour: 0.999263471113337
daid: 0.999263471113337
ttnite: 0.999263471113337
nauon: 0.999263471113337
shod: 0.999263471113337
irreduc: 0.999263471113337
egoist: 0.999263471113337
notat: 0.999263471113337
gunship: 0.999263471113337
numb: 0.999263471113337
dame: 0.999263471113337
nita: 0.999263471113337
douglass: 0.999263471113337
реор: 0.999263471113337
chlorin: 0.999263471113337
saddest: 0.999263471113337
recit: 0.999263471113337
hearth: 0.999263471113337
headed: 0.999263471113337
alleluia: 0.999263471113337
spoliat: 0.999263471113337
wegabon: 0.999263471113337
flour: 0.9

---

## Final Cleaning

In [149]:
os.chdir(data_c)
os.chdir(data_freq)

count = joblib.load('word_counts_stemmed.pkl')  # load stemmed counts

# For each speech only keep tokens that appear at least 10x

def select(lista):
    for i in range(len(lista)):
        x = lista[i][0]
        y = lista[i][1]
        y = [w for w in y if count.get(w, 0) >= 10]
        lista[i] = [x, y]
    return lista

for data_path in preprocessed_files:
    data = joblib.load(data_path)
    data = select(data)
    cleaned_path = data_path.replace('.pkl', '_final.pkl')
    joblib.dump(data, cleaned_path)