# Emotion and Reason in Political Language: Examining the UN General Debate speeches
## Script 1: Preprocessing & Token Frequencies
## Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure all required packages are installed. If not, set `InstallPackages = True` (see code cells below).  
- Set your working directory appropriately.  
- The script will automatically create the required folder structure.  
- Later, you will be asked to download the data from:  
  https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/0TJX8Y  
  and store it **unzipped** inside the created folder *data_original*


### Description: 
- Extract documents from their original txt documents and store them as one csv
- Data Cleaning and Pre-Processing
- Count word frequencies and weight them

## Setup, Installation and Verification of required Packages and Libraries

In [155]:
InstallPackages = False  # Set this to True to install the following packages

if InstallPackages:
    import subprocess
    import sys

    # Third-party packages used by this script.
    # joblib is imported directly below; openpyxl is the engine pandas needs
    # to read the .xlsx speaker file.
    packages = [
        "pandas",
        "nltk",
        "spacy",
        "numpy",
        "gensim",
        "pycountry",
        "wordcloud",
        "matplotlib",
        "tqdm",
        "joblib",
        "openpyxl",
    ]

    # Install into the interpreter that is running this notebook/script.
    # subprocess is used instead of the IPython "!" shell escape so that the
    # cell is valid Python and a failed installation raises an error.
    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])


DownloadAdditions = False  # Set this to True to download these additional resources
if DownloadAdditions:
    # Imported here because the main import cell only runs after this one
    import nltk
    import spacy.cli

    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)
    spacy.cli.download('en_core_web_sm')         # Small model: the one loaded via spacy.load() below

#########################
# Check if all packages are included
##########################

In [157]:
# == Import standard and third-party libraries for data processing, NLP, and visualization ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
#from matplotlib.colors import ListedColormap
from multiprocessing import Pool, freeze_support
# from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path

# === Initialize NLP Tools ===

# Translation table that deletes every character in string.punctuation; applied in pro1()
translator = str.maketrans('', '', punctuation)

# NLTK perceptron POS tagger.
# NOTE(review): not referenced in the visible part of this script (tagging is done with spaCy in tags_spacy()) - confirm it is still needed
tagger = nltk.perceptron.PerceptronTagger()

# Load the small English spaCy model with the NER and parser components disabled
# (tags_spacy() only reads the POS tags from the pipeline)
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# English Snowball stemmer; applied in pro6()
stemmer = SnowballStemmer("english")

In [159]:
# === Set Working Directory and create folder structure ===

# Optional interactive alternative to the hard-coded path below (currently disabled):
# wd = input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip()
# try:
#     os.chdir(wd)
#     print(f"Working directory set to: {os.getcwd()}")
# except FileNotFoundError:
#     print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
#     exit(1)

# Project working directory (adjust this as needed)
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
os.chdir(wd)

# Everything is stored below <working directory>/data
base_path = Path.cwd()
data_path = base_path / "data"

# Subfolders expected inside 'data'
subfolders = ["data_original", "dictionaries", "freq", "preprocessed", "temp", "tokenized"]

# Create 'data' first, then each subfolder; folders that already exist are left untouched
for folder in (data_path, *(data_path / name for name in subfolders)):
    folder.mkdir(exist_ok=True)

# Report the resulting layout
print("\nFolder structure created:")
print(f"- {data_path}")
print("\n".join(f"  - {name}" for name in subfolders))

# Optional pause so the user can copy the raw data into place (currently disabled):
# print(f"\nPlease place your raw data files (unzipped) into the folder:\n  {data_path / 'data_original'}")
# input("Press Enter after you have placed the files to continue...")
# print("Continuing with the script...")



Folder structure created:
- C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data
  - data_original
  - dictionaries
  - freq
  - preprocessed
  - temp
  - tokenized


In [161]:
# === Define Folder Paths ===

# String paths to the project folders, all derived from the working directory `wd`.
# NOTE: the 'fig' folder is not among the folders created by the setup cell above.
data_c = os.path.join(wd, 'data')
data_temp, data_freq, data_dict, data_preprocessed, data_tokenized = (
    os.path.join(data_c, subfolder)
    for subfolder in ('temp', 'freq', 'dictionaries', 'preprocessed', 'tokenized')
)
fig = os.path.join(wd, 'fig')

In [163]:
# Confirm that the working directory was set as intended
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


## Load and Prepare Corpus

### This chunk can be skipped at the moment
### Think of proper header

In [167]:
# == Load and Save Sample from UN General Debate Corpus ==

# Folder containing the original TXT files (relative to the working directory)
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

# Gather all speech files. Skipped: macOS resource forks ('._*') and the
# converted .DS_Store artefact, so that neither can end up in the sample.
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if (file.endswith('.txt')
                and not file.startswith('._')
                and file != '.DS_Store-to-UTF-8.txt'):
            all_txt_files.append(os.path.join(root, file))

# Fixed order, so that the seeded sample below does not depend on the order
# in which the file system lists the files
all_txt_files.sort()

if not all_txt_files:
    raise FileNotFoundError(
        f"No speech files found below '{base_folder}'. "
        "Download the corpus and store it unzipped inside data/data_original."
    )

print(f"🧾 Total speeches found: {len(all_txt_files)}")

# Randomly pick up to 800 files from the full collection   ###################################################### REMOVE AT LATER POINT
# A dedicated, seeded generator makes the sample reproducible across runs;
# min() avoids a ValueError when fewer than 800 files are available.
SAMPLE_SIZE = 800
SAMPLE_SEED = 42
sampled_files = random.Random(SAMPLE_SEED).sample(
    all_txt_files, min(SAMPLE_SIZE, len(all_txt_files))
)

# Read the selected files into a list of {'filename', 'speech'} records.
# 'utf-8-sig' decodes like UTF-8 but drops a leading byte-order mark, which
# otherwise ends up as the first character of the speech.
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        raw_data.append({'filename': os.path.basename(filepath), 'speech': f.read()})

# Create DataFrame from the collected speeches
df_raw = pd.DataFrame(raw_data)

# Save df_raw as a pickle file for quick future loading
raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

# Export df as CSV
raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n✅ Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")


🧾 Total speeches found: 10761

✅ Saved raw data with 800 speeches to '.\data\un_corpus_raw.csv'


In [168]:
# == Check if everything worked & drop empty speeches ==

# Reload the sample written by the previous cell
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# Cast the text to str, then keep only speeches that contain more than whitespace
df_raw['speech'] = df_raw['speech'].astype(str)
has_text = df_raw['speech'].str.strip().ne('')
df_raw = df_raw.loc[has_text].copy()

# Show the first rows to check the structure
df_raw.head()


Unnamed: 0,filename,speech
0,WSM_72_2017.txt,Samoa warmly welcomes the assumption of Mr. Mi...
1,KGZ_76_2021.txt,"Mr. President, Mr. Secretary-General, ladies a..."
2,MCO_70_2015.txt,The successful outcome of the United Nations S...
3,BGD_53_1998.txt,"May I convey to you, Sir,\non behalf of my del..."
4,ISL_28_1973.txt,"ï»¿111.\t Mr. President, at the outset permit me..."


### Length of raw speeches

In [232]:
# New column: number of whitespace-separated words in each speech
df_raw['speech_length_words'] = [len(str(text).split()) for text in df_raw['speech']]


In [236]:
# Count words in each speech (str() keeps this robust against non-string entries)
df_raw['speech_length_words'] = df_raw['speech'].apply(lambda x: len(str(x).split()))

# Calculate average length
avg_length = df_raw['speech_length_words'].mean()

# Print it
print("Average speech length (words):", round(avg_length, 2))

# Columns to show in the overview. 'country_name' and 'year' are only created
# further down in this script, so they are included only if they already
# exist; this avoids a KeyError when the cells are run from top to bottom.
overview_columns = [
    col for col in ('filename', 'country_name', 'year', 'speech_length_words')
    if col in df_raw.columns
]

# 20 shortest speeches
print("20 shortest speeches:")
print(df_raw.nsmallest(20, 'speech_length_words')[overview_columns])

# 20 longest speeches
print("\n20 longest speeches:")
print(df_raw.nlargest(20, 'speech_length_words')[overview_columns])

20 shortest speeches:
            filename                          country_name  year  \
731  GNB_76_2021.txt                         Guinea-Bissau  2021   
13   SYC_52_1997.txt                            Seychelles  1997   
391  UGA_69_2014.txt                                Uganda  2014   
382  GHA_60_2005.txt                                 Ghana  2005   
379  BRN_62_2007.txt                     Brunei Darussalam  2007   
738  RWA_78_2023.txt                                Rwanda  2023   
270  CZE_71_2016.txt                               Czechia  2016   
477  UZB_71_2016.txt                            Uzbekistan  2016   
93   LVA_55_2000.txt                                Latvia  2000   
130  YEM_28_1973.txt                                 Yemen  1973   
468  COD_67_2012.txt  The Democratic Republic of the Congo  2012   
634  LVA_68_2013.txt                                Latvia  2013   
542  UZB_69_2014.txt                            Uzbekistan  2014   
153  USA_75_2020.txt      

Average speech length (words): 2922.5


### Create new variables: year, country_code and country_name

In [172]:
# Derive the country code (leading 2-3 capital letters) and the year
# (the 4 digits right before '.txt') from the file name
filenames = df_raw['filename']
df_raw['country_code'] = filenames.str.extract(r'^([A-Z]{2,3})', expand=False)
df_raw['year'] = filenames.str.extract(r'_(\d{4})\.txt$', expand=False).astype(int)

print("Min year:", df_raw['year'].min())
print("Max year:", df_raw['year'].max())

# Short names and legacy / non-ISO codes; these take precedence over the
# names provided by pycountry
custom_names = {
    "BOL": "Bolivia",
    "COD": "The Democratic Republic of the Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YMD": "South Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    # "SUN": "Soviet Union",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea",
}

# alpha-3 code -> country name from pycountry, overridden by the custom entries
code_to_name = {
    **{country.alpha_3: country.name for country in pycountry.countries},
    **custom_names,
}

# Translate every code into a name (codes without an entry become NaN)
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Report codes that could not be translated
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

# Check structure of the df
# NOTE: in a notebook only the last expression of a cell is displayed, so this call shows nothing here
df_raw.head()

# Store the extended df_raw
save_path = os.path.join(data_c, 'un_corpus_raw.pkl')
df_raw.to_pickle(save_path)
print(f"df_raw saved to {save_path}")

Min year: 1948
Max year: 2023
Missing codes: []
df_raw saved to C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\un_corpus_raw.pkl


In [174]:
# == Check the country names

# One row per distinct (code, name) pair, ordered by country code
country_overview = (
    df_raw[['country_code', 'country_name']]
    .drop_duplicates()
    .sort_values('country_code')
    .reset_index(drop=True)
)

# Lift the row limit so that the complete table is printed, then restore the default
pd.set_option('display.max_rows', None)
print(country_overview)
pd.reset_option('display.max_rows')

    country_code                          country_name
0            AFG                           Afghanistan
1            AGO                                Angola
2            ALB                               Albania
3            AND                               Andorra
4            ARE                  United Arab Emirates
5            ARG                             Argentina
6            ARM                               Armenia
7            ATG                   Antigua and Barbuda
8            AUS                             Australia
9            AUT                               Austria
10           AZE                            Azerbaijan
11           BDI                               Burundi
12           BEL                               Belgium
13           BEN                                 Benin
14           BFA                          Burkina Faso
15           BGD                            Bangladesh
16           BGR                              Bulgaria
17        

### Create variable speaker & position

In [177]:
# NOTE: this changes the process-wide working directory to the data folder. It is only
# switched back to `wd` in the cell that saves df_merged, so relative paths used in
# between resolve against `data_c`.
os.chdir(data_c)
# Load the speaker spreadsheet that ships with the corpus
# (presumably requires an Excel engine such as openpyxl - confirm it is installed)
df_speakers = pd.read_excel(r"data_original\UN General Debate Corpus\Speakers_by_session.xlsx")

In [178]:
# Preview the first rows of the speaker metadata
df_speakers.head()

Unnamed: 0,Year,Session,ISO Code,Country,Name of Person Speaking,Post,Unnamed: 6
0,2023,78,BRA,Brazil,Luiz Inacio Lula da Silva,President,
1,2023,78,USA,United States of America,Joseph R. Biden,President,
2,2023,78,COL,Colombia,Gustavo Petro Urrego,President,
3,2023,78,JOR,Jordan,Abdullah II ibn Al Hussein,King,
4,2023,78,POL,Poland,Andrzej Duda,President,


In [179]:
# Preview df_raw; 'year' and 'country_code' are the keys used for the merge below
df_raw.head()

Unnamed: 0,filename,speech,country_code,year,country_name
0,WSM_72_2017.txt,Samoa warmly welcomes the assumption of Mr. Mi...,WSM,2017,Samoa
1,KGZ_76_2021.txt,"Mr. President, Mr. Secretary-General, ladies a...",KGZ,2021,Kyrgyzstan
2,MCO_70_2015.txt,The successful outcome of the United Nations S...,MCO,2015,Monaco
3,BGD_53_1998.txt,"May I convey to you, Sir,\non behalf of my del...",BGD,1998,Bangladesh
4,ISL_28_1973.txt,"ï»¿111.\t Mr. President, at the outset permit me...",ISL,1973,Iceland


In [183]:
# Spot check: is there a speech for Mexico (MEX) in 1982 in df_raw?
print(df_raw[(df_raw['country_code'] == 'MEX') & (df_raw['year'] == 1982)])


Empty DataFrame
Columns: [filename, speech, country_code, year, country_name]
Index: []


In [185]:
# Attach speaker name and post to every speech, matching on (year, country code).
# Speeches without a match are kept (left join); '_merge' records the match status.
speaker_columns = ['Year', 'ISO Code', 'Name of Person Speaking', 'Post']
df_merged = pd.merge(
    df_raw,
    df_speakers[speaker_columns],
    how='left',
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    indicator=True,
)

df_merged.head()


Unnamed: 0,filename,speech,country_code,year,country_name,Year,ISO Code,Name of Person Speaking,Post,_merge
0,WSM_72_2017.txt,Samoa warmly welcomes the assumption of Mr. Mi...,WSM,2017,Samoa,2017.0,WSM,Mr. Tuilaepa Sailele Malielegaoi,Prime Minister and Minister for Foreign Affair...,both
1,KGZ_76_2021.txt,"Mr. President, Mr. Secretary-General, ladies a...",KGZ,2021,Kyrgyzstan,2021.0,KGZ,Sadyr Japarov,President,both
2,MCO_70_2015.txt,The successful outcome of the United Nations S...,MCO,2015,Monaco,2015.0,MCO,Mr. Gilles Tonelli,Minister for Foreign Affairs,both
3,BGD_53_1998.txt,"May I convey to you, Sir,\non behalf of my del...",BGD,1998,Bangladesh,1998.0,BGD,Abdus Samad Azad,Minister for Foreign Affairs,both
4,ISL_28_1973.txt,"ï»¿111.\t Mr. President, at the outset permit me...",ISL,1973,Iceland,1973.0,ISL,Agustsson,,both


In [187]:
# Spot check: look up the Sierra Leone speech of 1962
df_raw[(df_raw['year'] == 1962) & (df_raw['country_name'] == "Sierra Leone")]


Unnamed: 0,filename,speech,country_code,year,country_name
164,SLE_17_1962.txt,34. Almost exactly a year ago today my Prime M...,SLE,1962,Sierra Leone


In [189]:
# Left-join the speaker metadata on (year, country code). Speeches without a
# match keep NA in the speaker columns; '_merge' records the match status.
# NOTE(review): if the speaker file holds several rows for one country-year,
# the join duplicates that speech - confirm the keys are unique.
df_merged = pd.merge(
    df_raw,
    df_speakers[['Year', 'ISO Code', 'Name of Person Speaking', 'Post']],
    how='left',
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    indicator=True,
)

# Speeches that have no counterpart in df_speakers
no_match = df_merged['_merge'] == 'left_only'
unmatched = df_merged[no_match]

# List them for manual follow-up
print(unmatched[['filename', 'year', 'country_code', 'country_name']])


# Remove the helper columns (duplicated join keys and the merge indicator)
# and give the speaker fields snake_case names
df_merged = (
    df_merged
    .drop(columns=['Year', 'ISO Code', '_merge'])
    .rename(columns={'Name of Person Speaking': 'speaker_name', 'Post': 'position'})
)

            filename  year country_code              country_name
11   GAB_49_1994.txt  1994          GAB                     Gabon
147  CAF_18_1963.txt  1963          CAF  Central African Republic
428  MOZ_49_1994.txt  1994          MOZ                Mozambique
538  DOM_18_1963.txt  1963          DOM        Dominican Republic
592  ECU_18_1963.txt  1963          ECU                   Ecuador
663  UKR_18_1963.txt  1963          UKR                   Ukraine
712  YMD_39_1984.txt  1984          YMD                Soth Yemen


In [191]:
# Manually fill in speakers for speeches that are missing from the speaker file

# European Union, 2013: address by European Council President Van Rompuy
# Source: https://www.prnewswire.com/news-releases/eu-newsbrief-address-by-european-council-president-van-rompuy-to-the-un-general-assembly-225266212.html
df_merged.loc[
    (df_merged['country_name'] == 'European Union') & (df_merged['year'] == 2013),
    'speaker_name'
] = 'Mr. Herman Van Rompuy'

# Sierra Leone, 1962: no reliable source for the speaker was found,
# so the name is explicitly set to missing
df_merged.loc[
    (df_merged['country_name'] == 'Sierra Leone') & (df_merged['year'] == 1962),
    'speaker_name'
] = np.nan


In [193]:
# Preview the merged data including the 'speaker_name' and 'position' columns
df_merged.head()

Unnamed: 0,filename,speech,country_code,year,country_name,speaker_name,position
0,WSM_72_2017.txt,Samoa warmly welcomes the assumption of Mr. Mi...,WSM,2017,Samoa,Mr. Tuilaepa Sailele Malielegaoi,Prime Minister and Minister for Foreign Affair...
1,KGZ_76_2021.txt,"Mr. President, Mr. Secretary-General, ladies a...",KGZ,2021,Kyrgyzstan,Sadyr Japarov,President
2,MCO_70_2015.txt,The successful outcome of the United Nations S...,MCO,2015,Monaco,Mr. Gilles Tonelli,Minister for Foreign Affairs
3,BGD_53_1998.txt,"May I convey to you, Sir,\non behalf of my del...",BGD,1998,Bangladesh,Abdus Samad Azad,Minister for Foreign Affairs
4,ISL_28_1973.txt,"ï»¿111.\t Mr. President, at the outset permit me...",ISL,1973,Iceland,Agustsson,


### Create variable 'Amtssprache' (official language: English)

In [196]:
# Source for english as official language : https://gradschool.utk.edu/future-students/office-of-graduate-admissions/applying-to-graduate-school/admission-requirements/testing-requirements/countries-with-english-as-official-language/
# They are quoting: https://www.cia.gov/the-world-factbook/field/languages/
# NOTE(review): the list is taken over as-is and should be double-checked against the
# source - e.g. "Belgium" is included, while "Australia", "United Kingdom" and
# "United States" are not.

english_countries = [
    "Anguilla", "Antigua and Barbuda", "Bahamas", "Barbados", "Belize", "Belgium",
    "Bermuda", "Botswana", "British Virgin Islands", "Burundi", "Cameroon", "Canada",
    "Cayman Islands", "Christmas Island", "Cook Islands", "Dominica", "Fiji", "Gambia",
    "Ghana", "Grenada", "Guyana", "Hong Kong", "India", "Ireland", "Jersey", "Kenya",
    "Liberia", "Malawi", "Malta", "Marshall Islands", "Micronesia",
    "Namibia", "New Zealand", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
    "Pakistan", "Palau", "Papua New Guinea", "Philippines", "Pitcairn Islands", "Rwanda",
    "Saint Kitts and Nevis", "Saint Lucia", "Samoa", "Seychelles", "Sierra Leone", "Singapore",
    "Sint Maarten", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Sudan",
    "Eswatini", "Tanzania", "Tonga", "Trinidad and Tobago", "Turks and Caicos Islands",
    "Tuvalu", "Uganda", "Zambia", "Zimbabwe"
]

# 2. Create dummy column: 1 if the country is in the list above, otherwise 0
#    (this includes rows without a country name).
#    The dummy is written to df_raw AND to df_merged, because df_merged is the
#    data set that gets saved together with the other metadata variables.
df_raw['englisch_official_language'] = df_raw['country_name'].isin(english_countries).astype(int)
df_merged['englisch_official_language'] = df_merged['country_name'].isin(english_countries).astype(int)

# 3. Find countries in the list that did not match any entry in df_raw
matched = set(df_raw['country_name'])
unmatched = [country for country in english_countries if country not in matched]

# 4. Print unmatched country names
print("Countries not matched in df_raw['country_name']:")
for country in unmatched:
    print(country)

# Most unmatched entries are British Overseas Territories, Australian Territories,
# self-governing island territories or Special Administrative Regions, i.e. not UN members.
# UN members can appear here as well when none of their speeches is part of the
# current df_raw (e.g. Dominica or South Sudan in a random sample).


Countries not matched in df_raw['country_name']:
Anguilla
Bermuda
British Virgin Islands
Cayman Islands
Christmas Island
Cook Islands
Dominica
Hong Kong
Jersey
Niue
Norfolk Island
Northern Mariana Islands
Pitcairn Islands
Sint Maarten
South Sudan
Turks and Caicos Islands


In [198]:
# Preview df_raw including the new 'englisch_official_language' dummy
df_raw.head()

Unnamed: 0,filename,speech,country_code,year,country_name,englisch_official_language
0,WSM_72_2017.txt,Samoa warmly welcomes the assumption of Mr. Mi...,WSM,2017,Samoa,1
1,KGZ_76_2021.txt,"Mr. President, Mr. Secretary-General, ladies a...",KGZ,2021,Kyrgyzstan,0
2,MCO_70_2015.txt,The successful outcome of the United Nations S...,MCO,2015,Monaco,0
3,BGD_53_1998.txt,"May I convey to you, Sir,\non behalf of my del...",BGD,1998,Bangladesh,0
4,ISL_28_1973.txt,"ï»¿111.\t Mr. President, at the outset permit me...",ISL,1973,Iceland,0


## Create variable for permanent membership of the UN Security Council

In [201]:
# Country codes of the five permanent members of the UN Security Council
# NOTE(review): membership is keyed on the code only, for all years - confirm that this
# matches how the corpus codes the Soviet Union and China before 1971
permanent_members = ['RUS', 'USA', 'FRA', 'GBR', 'CHN']

# Create dummy variable: 1 if the speech's country code is in the list above, otherwise 0
df_merged['security_council_permanent'] = df_merged['country_code'].isin(permanent_members).astype(int)


In [203]:
# Sanity check: list all speeches given by permanent members together with the dummy
is_permanent = df_merged['country_code'].isin(permanent_members)
check_columns = ['country_code', 'country_name', 'security_council_permanent', 'year']
print(df_merged.loc[is_permanent, check_columns])

    country_code    country_name  security_council_permanent  year
28           FRA          France                           1  1948
48           CHN           China                           1  2000
76           FRA          France                           1  1970
86           FRA          France                           1  1952
119          FRA          France                           1  1949
153          USA   United States                           1  2020
178          USA   United States                           1  1949
184          USA   United States                           1  1987
205          GBR  United Kingdom                           1  1993
233          USA   United States                           1  1982
249          RUS          Russia                           1  1953
253          USA   United States                           1  2017
257          CHN           China                           1  1949
314          RUS          Russia                           1  

In [230]:
# Back to the project root so that the relative paths below resolve correctly
os.chdir(wd)

# Save df_merged as a pickle file for quick future loading
merged_pickle_path = r".\data\un_corpus_merged.pkl"
df_merged.to_pickle(merged_pickle_path)

# Export df as CSV under its own file name.
# (The previous target 'un_corpus_raw.csv' overwrote the export of the raw corpus.)
merged_output_path = r".\data\un_corpus_merged.csv"
df_merged.to_csv(merged_output_path, index=False, sep=';', encoding='utf-8')

## Pre-processing

### Cleaning

In [207]:
# == Clean text by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

# Define cleaning function
def clean_text(content):
    """Normalise the raw text of one speech.

    Steps, in this order: remove byte-order marks, turn line breaks into
    spaces, collapse whitespace, add a space after '.'/',' when it is directly
    followed by a non-space character, re-join words hyphenated across line
    breaks, split remaining hyphenated compounds, remove backslashes, and
    double every double quote.

    Parameters
    ----------
    content : str or missing value
        Raw speech text.

    Returns
    -------
    str
        The cleaned text; "" if `content` is missing (None / NaN).
    """
    if pd.isna(content):
        return ""

    # Remove byte-order marks (U+FEFF). Files read as plain UTF-8 keep the BOM
    # as their first character, and str.split() does not treat it as whitespace.
    content = content.replace('\ufeff', '')

    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces
    content = ' '.join(content.split())

    # Fix punctuation spacing (e.g. "word,another" -> "word, another")
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" -> "international")
    content = re.sub(r'-\s', '', content)

    # Replace hyphen between letters with a space to prevent merging words
    # once punctuation is stripped (e.g. "russian-and" -> "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)

    # Remove stray backslashes
    content = content.replace("\\", "")

    # Escape double quotes for CSV safety
    content = content.replace('"', '""')

    return content

# Clean every speech on a copy, so that df_raw keeps the uncleaned text
df_raw['speech'] = df_raw['speech'].astype(str)  # clean_text expects string input
df_clean = df_raw.assign(speech=df_raw['speech'].map(clean_text))

# Keep only rows whose cleaned speech still contains text and renumber the index
still_has_text = df_clean['speech'].str.strip().astype(bool)
df_clean = df_clean.loc[still_has_text].reset_index(drop=True)


In [208]:
# == Split cleaned data into chunks and save as separate files ==

# Rows of the cleaned data as [filename, speech] lists
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Quarter boundaries (rounded down): 0, n/4, 2n/4, 3n/4, n
n_docs = len(clean_data)
cuts = [0, int(n_docs/4), int(2*n_docs/4), int(3*n_docs/4), n_docs]
data_id1, data_id2, data_id3, data_id4 = (
    clean_data[start:stop] for start, stop in zip(cuts, cuts[1:])
)

# Work inside the temp folder (note: this changes the working directory for all later cells)
os.chdir(data_temp)  # make sure `data_temp` exists and is defined

# Write one joblib file per chunk
chunk_names = [f'cleanspeeches_indexed{i}.pkl' for i in range(1, 5)]
for chunk, chunk_name in zip((data_id1, data_id2, data_id3, data_id4), chunk_names):
    joblib.dump(chunk, chunk_name)

# Absolute paths of the chunk files; input for the preprocessing function later
data_files = [os.path.join(data_temp, chunk_name) for chunk_name in chunk_names]

print(f"âœ… Saved clean speeches chunks in '{data_temp}'")

✅ Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Advanced Text Pre-Processing

In [212]:
# == Define function to Tokenize, eliminate digits, remove stopwords, lemmatize, POS-Tagging ==

def pro1(lista):
    """Delete punctuation from the text of every [doc_id, text] row.

    Uses the module-level translation table `translator`.
    """
    stripped = []
    for row in lista:
        stripped.append([row[0], row[1].translate(translator)])
    return stripped

def pro2(lista):
    """Turn the text of every [doc_id, text] row into a token list.

    Tokenisation is delegated to gensim.utils.simple_preprocess with its
    default settings.
    """
    tokenized = []
    for row in lista:
        tokenized.append([row[0], gensim.utils.simple_preprocess(row[1])])
    return tokenized

def pro3(lista):
    """Drop every token that contains at least one digit."""
    result = []
    for row in lista:
        digit_free = [w for w in row[1] if not any(map(str.isdigit, w))]
        result.append([row[0], digit_free])
    return result

def pro4(lista):
    """Keep only tokens that are at least three characters long."""
    min_length = 3
    result = []
    for row in lista:
        long_enough = [w for w in row[1] if len(w) >= min_length]
        result.append([row[0], long_enough])
    return result

def tags_spacy(lista):
    """Keep only tokens whose fine-grained spaCy tag starts with 'N', 'V' or 'J'.

    The tokens of each row are joined back into one string, run through the
    module-level spaCy pipeline `nlp`, and filtered by `token.tag_`
    (presumably nouns, verbs and adjectives - confirm against the model's tag set).
    Returns a list of [doc_id, kept_tokens] rows in the input order.
    """
    kept_tag_prefixes = ('N', 'V', 'J')
    texts = [' '.join(row[1]) for row in lista]
    tagged = []
    for row, doc in zip(lista, nlp.pipe(texts, batch_size=20, n_process=1)):
        kept = [token.text for token in doc if token.tag_.startswith(kept_tag_prefixes)]
        tagged.append([row[0], kept])
    return tagged


def pro5(lista):
    """Remove every token that is contained in the spaCy stop word list."""
    result = []
    for row in lista:
        content_words = [w for w in row[1] if w not in SPACY_STOPWORDS]
        result.append([row[0], content_words])
    return result

def pro6(lista):
    """Stem every token with the module-level Snowball `stemmer`."""
    stemmed = []
    for row in lista:
        stemmed.append([row[0], [stemmer.stem(token) for token in row[1]]])
    return stemmed
    # Disabled alternative: lemmatize with spaCy instead of stemming
    # texts = [' '.join(row[1]) for row in lista]
    # docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
    # result = []
    # for i, doc in enumerate(docs):
    #     lemmatized = [token.lemma_ for token in doc]
    #     result.append([lista[i][0], lemmatized])
    # return result

########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    """Drop rows whose tokens, joined with spaces, give an empty string."""
    kept = []
    for row in lista:
        if ' '.join(row[1]):
            kept.append(row)
    return kept

In [214]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    """Run the complete preprocessing pipeline on one chunk file.

    Loads the chunk stored at `data_name`, applies pro1-pro4, spaCy tag
    filtering, stop word removal and dropnull, and writes two files into
    `data_preprocessed`:
    'wordcloud_speeches_*' (tokens before stemming) and
    'preprocessed_speeches_*' (stemmed tokens).
    Progress and timings are printed; nothing is returned.
    """
    started = time.time()
    print(f"Starting preprocessing for {data_name}...")

    # Punctuation removal, tokenisation, digit and short-word filtering
    docs = joblib.load(data_name)
    for step in (pro1, pro2, pro3, pro4):
        docs = step(docs)

    print(f"[{data_name}] Before tagging: {time.time() - started:.2f}s")
    docs = tags_spacy(docs)
    print(f"[{data_name}] After tagging: {time.time() - started:.2f}s")

    # Stop word removal, then drop speeches that ended up empty
    docs = dropnull(pro5(docs))

    chunk_file = os.path.basename(data_name)

    # Store preprocessed corpus (before stemming) for wordcloud
    wordcloud_file = chunk_file.replace('cleanspeeches_', 'wordcloud_speeches_')
    joblib.dump(docs, os.path.join(data_preprocessed, wordcloud_file))

    # Stem and store the final version
    stemmed_docs = pro6(docs)
    out_preprocessed = os.path.join(
        data_preprocessed,
        chunk_file.replace('cleanspeeches_', 'preprocessed_speeches_'),
    )
    joblib.dump(stemmed_docs, out_preprocessed)
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - started:.2f}s\n")

def main():
    """Preprocess every cleaned chunk listed in `data_files`, one after the other."""
    for fname in data_files:
        preprocessing(fname)

# Only start the pipeline when this file is executed directly (also true inside a notebook cell)
if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Before tagging: 1.42s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] After tagging: 30.83s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Done. Total time: 36.52s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] Before tagging: 1.49s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] After tagging: 29.44s
[C:\Users\sarah\One

In [215]:
# Paths of the files written by preprocessing(), one per chunk (1-4)

# Stemmed tokens
preprocessed_files = [
    os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{i}.pkl')
    for i in range(1, 5)
]

# Tokens before stemming (for the wordcloud)
wordcloud_files = [
    os.path.join(data_preprocessed, f'wordcloud_speeches_indexed{i}.pkl')
    for i in range(1, 5)
]

In [216]:
# Show the current working directory (the chunking cell switched it to the temp folder)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [219]:
#== Count token frequencies ==

def count_frequencies(filenames):
    """Count how often each token occurs across all given files.

    Each file is loaded with joblib and is expected to hold rows whose second
    element is the token list; rows where it is not a list are skipped.
    Returns a Counter mapping token -> total count. A tqdm progress bar is
    shown over the files.
    """
    total_freq = Counter()
    for fname in tqdm(filenames):
        for row in joblib.load(fname):
            if isinstance(row[1], list):
                total_freq.update(row[1])
    return total_freq

def remove_rare_words(filenames, freqs, min_count=10):
    """Remove infrequent tokens from every document and rewrite each file.

    Each file is loaded with joblib, filtered, and written back to the SAME
    path, so the unfiltered version is lost after this call.

    Args:
        filenames: Iterable of paths to joblib files holding
            ``(doc_id, tokens)`` pairs.
        freqs: Mapping token -> frequency (e.g. the ``Counter`` returned by
            ``count_frequencies``). Tokens missing from it count as 0.
        min_count: Tokens with a frequency below this value are dropped.

    Returns:
        None. The result is the rewritten files plus one status line per file.
    """
    for fname in filenames:
        data = joblib.load(fname)
        filtered_data = []
        for doc_id, tokens in data:
            # Only real token lists are filtered. count_frequencies skips
            # non-list entries when counting, so they are passed through
            # unchanged here instead of raising (None/NaN) or being filtered
            # character by character (str).
            if isinstance(tokens, list):
                tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
            filtered_data.append([doc_id, tokens])
        joblib.dump(filtered_data, fname)  # overwrites the input file
        print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
# The counts are taken BEFORE remove_rare_words rewrites the files, so
# word_counts_stemmed still contains the words that are dropped from the
# documents below.
word_counts_stemmed = count_frequencies(preprocessed_files)

remove_rare_words(preprocessed_files, word_counts_stemmed, min_count=10)

print("\n[Stemmed] Top 100 most common words:")
for word, count in word_counts_stemmed.most_common(100):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 300 least common words:")
# most_common() is sorted descending; the reversed slice walks it from the
# rarest entry upwards and stops after 300 items.
for word, count in word_counts_stemmed.most_common()[:-301:-1]:
    print(f"{word}: {count}")

# Make sure the output folder exists before the first file is written to it.
os.makedirs(data_freq, exist_ok=True)

# Save stemmed word counts
save_path_stemmed = os.path.join(data_freq, 'word_counts_stemmed.pkl')
joblib.dump(word_counts_stemmed, save_path_stemmed)

# === Count for wordcloud (unstemmed) speeches ===
word_counts_wordcloud = count_frequencies(wordcloud_files)

print("\n[Wordcloud] Top 100 most common words:")
for word, count in word_counts_wordcloud.most_common(100):
    print(f"{word}: {count}")

print("\n[Wordcloud] Top 300 least common words:")
for word, count in word_counts_wordcloud.most_common()[:-301:-1]:
    print(f"{word}: {count}")

# Save unstemmed word counts
save_path_wordcloud = os.path.join(data_freq, 'word_counts_wordcloud.pkl')
joblib.dump(word_counts_wordcloud, save_path_wordcloud)


100%|██████████| 4/4 [00:01<00:00,  2.11it/s]


Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed2.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed3.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed4.pkl: removed words with freq < 10

[Stemmed] Top 100 most common words:
nation: 16885
unit: 13698
countri: 12968
intern: 11888
develop: 10683
world: 9676
state: 9653
peac: 9587
peopl: 9054
secur: 6133
general: 5818
govern: 5510
econom: 5508
year: 5039
organ: 4989
assembl: 4716
right: 4668
effort: 4304
new: 4287
problem: 4280
human: 4114
support: 4098
continu: 4012
polit: 3712
communiti: 3702
region: 3601
time: 3418
africa: 3418
member: 3180
session: 3117
import: 3034
wo

100%|██████████| 4/4 [00:01<00:00,  2.02it/s]


[Wordcloud] Top 100 most common words:
nations: 13558
united: 13502
international: 11341
world: 9473
countries: 8560
peace: 7860
states: 6757
development: 6213
people: 5820
general: 5698
security: 5673
economic: 5485
assembly: 4628
new: 4287
country: 4266
government: 4196
organization: 3677
human: 3573
political: 3559
efforts: 3495
community: 3417
africa: 3277
peoples: 3230
rights: 3135
support: 3035
session: 2970
council: 2970
time: 2807
south: 2777
years: 2585
republic: 2561
war: 2542
state: 2538
problems: 2532
year: 2442
great: 2434
order: 2405
national: 2336
global: 2330
developing: 2324
situation: 2277
social: 2274
work: 2246
president: 2170
nuclear: 2169
conference: 2152
hope: 2100
charter: 2024
continue: 1970
african: 1947
important: 1931
region: 1899
today: 1869
secretary: 1852
relations: 1846
need: 1840
east: 1838
progress: 1833
respect: 1831
problem: 1748
policy: 1727
peaceful: 1726
future: 1720
weapons: 1710
process: 1706
disarmament: 1701
principles: 1700
action: 1697
solu




['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\word_counts_wordcloud.pkl']

In [220]:
# Switch the working directory to data_c and print it to confirm the change.
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [222]:
# == Count dictionary words

# Load dictionaries
# TODO(review): How did they come up with this dictionary? Why did they exclude words?

affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

with open(affect_path, 'rb') as f:
    affect_dict = pickle.load(f)
print("Contents of affect dictionary:")
print(affect_dict)
print("Number of words in affect dictionary:", len(affect_dict))

with open(cognition_path, 'rb') as f:
    cognition_dict = pickle.load(f)
print("Contents of cognition dictionary:")
print(cognition_dict)
print("Number of words in cognition dictionary:", len(cognition_dict))

# Both dictionaries are already in memory; reuse them instead of
# deserialising the same two files a second time through joblib.
affect = affect_dict
cognition = cognition_dict

# [word, count] pairs for the dictionary words that occur in the stemmed
# corpus counts, ordered from most to least frequent.
a = sorted(
    ([w, word_counts_stemmed[w]] for w in affect if w in word_counts_stemmed),
    key=lambda pair: pair[1],
    reverse=True,
)
c = sorted(
    ([w, word_counts_stemmed[w]] for w in cognition if w in word_counts_stemmed),
    key=lambda pair: pair[1],
    reverse=True,
)

# Render each pair as "word (count)," for the plain-text report.
a = [[w, f"({n}),"] for w, n in a]
c = [[w, f"({n}),"] for w, n in c]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

os.makedirs(data_freq, exist_ok=True)  # ensure directory exists

# Explicit encoding so the report does not depend on the platform default
# (e.g. cp1252 on Windows).
with open(affect_out_path, "w", encoding="utf-8") as output:
    output.write(a1)

with open(cog_out_path, "w", encoding="utf-8") as output:
    output.write(c1)

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

In [224]:
# == Calculate weighted frequencies for all words

# STEMMED OR NOT?
# - downweights very common words by giving more importance to rare ones
word_counts_stemmed = joblib.load(os.path.join(data_freq, 'word_counts_stemmed.pkl'))

# Total number of counted tokens; count / l is a word's relative frequency.
l = sum(word_counts_stemmed.values())

# weight(word) = a / (a + relative frequency of word)
a = 0.001
word_counts_weighted = {}
for stem, n_occurrences in word_counts_stemmed.items():
    word_counts_weighted[stem] = a / (a + (n_occurrences / l))

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

# ISSUE: the weight shrinks as the count grows, so ranking by weight in
# descending order simply lists the rarest words, and all words sharing the
# lowest count tie at the same weight.
# NOTE(review): the loaded counts may still include words below the
# rare-word threshold applied to the documents -- confirm whether those
# should be excluded before weighting.
ranked_words = sorted(word_counts_weighted, key=word_counts_weighted.get, reverse=True)
top_100_weighted = [(w, word_counts_weighted[w]) for w in ranked_words[:100]]

print("Top 100 words by weighted frequency:")
for word, weight in top_100_weighted:
    print(f"{word}: {weight}")


Top 100 words by weighted frequency:
eb: 0.9990577868732752
nudg: 0.9990577868732752
bounc: 0.9990577868732752
twentythird: 0.9990577868732752
thse: 0.9990577868732752
biketawa: 0.9990577868732752
epidemiolog: 0.9990577868732752
karachi: 0.9990577868732752
bandar: 0.9990577868732752
multimod: 0.9990577868732752
antivir: 0.9990577868732752
palmyra: 0.9990577868732752
monegasqu: 0.9990577868732752
oceanograph: 0.9990577868732752
omnibus: 0.9990577868732752
brace: 0.9990577868732752
minster: 0.9990577868732752
minsterÃ¢: 0.9990577868732752
womenÃ¢: 0.9990577868732752
communiquÃ£: 0.9990577868732752
bangladeshÃ¢: 0.9990577868732752
waje: 0.9990577868732752
sunset: 0.9990577868732752
movementÃ¢: 0.9990577868732752
liv: 0.9990577868732752
trawl: 0.9990577868732752
licenc: 0.9990577868732752
toussaint: 0.9990577868732752
louvertur: 0.9990577868732752
plautus: 0.9990577868732752
solon: 0.9990577868732752
pittacus: 0.9990577868732752
ephialt: 0.9990577868732752
cleisthen: 0.9990577868732752
sub