# Emotion and Reason in Political Language: Examining the UN General speeches
## Script 1: Preprocessing & Token Frequencies
## Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure all required packages are installed. If not, set `InstallPackages = TRUE` (see code cells below).  
- Set your working directory appropriately.  
- The script will automatically create the required folder structure.  
- Later, you will be asked to download the data from:  
  https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/0TJX8Y  
  and store it **unzipped** inside the created folder *data_original*


### Description: 
- Extract documents from their original txt documents and store them as one csv
- Data Cleaning and Pre-Processing
- Count word frequencies and weight themhemnal

## Setup, Installation and Verification of required Packages and Libraries

In [75]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys

    packages = [
        "pandas",
        "nltk",
        "spacy",
        "numpy",
        "gensim",
        "pycountry",
        "wordcloud",
        "matplotlib",
        "tqdm"
    ]

    for package in packages:
        !{sys.executable} -m pip install {package}


DownloadAdditions = False # Set this to True to download these additional resources
if DownloadAdditions:
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)

#########################
# Check if all packages are included
##########################

In [77]:
# == Import standard and third-party libraries for data processing, NLP, and visualization ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle
import numpy as np

from collections import Counter
from itertools import chain
#from matplotlib.colors import ListedColormap
from multiprocessing import Pool, freeze_support
# from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from pathlib import Path

# === Initialize NLP Tools ===

# Translator to remove punctuation
translator = str.maketrans('', '', punctuation)

# POS tagger (not used by SpaCy, but optionally available via NLTK)
tagger = nltk.perceptron.PerceptronTagger()

# Load SpaCy English model with unnecessary components disabled
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

stemmer = SnowballStemmer("english")

In [78]:
# === Set Working Directory and create folder structure ===

# Prompt user to enter working directory path
wd = input("Please enter your working directory path (e.g., C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit): ").strip()

# Change to the entered working directory
try:
    os.chdir(wd)
    print(f"Working directory set to: {os.getcwd()}")
except FileNotFoundError:
    print("ERROR: The directory you entered does not exist. Please restart and enter a valid path.")
    exit(1)

# Set your working directory (adjust this as needed)
#wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
#os.chdir(wd)

# Set base path to current working directory
base_path = Path.cwd()
data_path = base_path / "data"

# List of subfolders to create inside 'data'
subfolders = ["data_original", "dictionaries", "freq", "preprocessed", "temp", "tokenized"]

# Create 'data' folder if it doesn't exist
data_path.mkdir(exist_ok=True)

# Create subfolders
for name in subfolders:
    (data_path / name).mkdir(exist_ok=True)

print("\nFolder structure created:")
print(f"- {data_path}")
for name in subfolders:
    print(f"  - {name}")

# Prompt user to place raw data files
print(f"\nPlease place your raw data files (unzipped) into the folder:\n  {data_path / 'data_original'}")
input("Press Enter after you have placed the files to continue...")

print("Continuing with the script...")


Please enter your working directory path (e.g., C:\Users\sarah\OneDrive\Dokumente\Masterarbeit):   C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


Working directory set to: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit

Folder structure created:
- C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data
  - data_original
  - dictionaries
  - freq
  - preprocessed
  - temp
  - tokenized

Please place your raw data files (unzipped) into the folder:
  C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\data_original


Press Enter after you have placed the files to continue... 


Continuing with the script...


In [87]:
# === Define Folder Paths ===

# Make sure that you have these folders in your working directory
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')
data_dict = os.path.join(data_c, 'dictionaries')
data_preprocessed = os.path.join(data_c, 'preprocessed')
data_tokenized = os.path.join(data_c, 'tokenized')
fig = os.path.join(wd, 'fig')

In [89]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


## Load and Prepare Corpus

### This chunk can be skipped at the moment
### Think of proper header

In [93]:
# == Load and Save Sample from UN General Debate Corpus ==

# Set Folder path containing the original TXT files    
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

#  Gather all relevant txt-files
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

print(f"üßæ Total speeches found: {len(all_txt_files)}")

# Randomly pick 800 files from the full collection   ###################################################### REMOVE AT LATER POINT
sampled_files = random.sample(all_txt_files,2000)

# Read the selected files into a list
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

# Create DataFrame from the collected speeches
df_raw = pd.DataFrame(raw_data)

df_raw = df_raw[df_raw['filename'] != '.DS_Store-to-UTF-8.txt'].copy()

# Save df_raw as a pickle file for quick future loading
raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

# Export df as CSV 
raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n‚úÖ Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")


üßæ Total speeches found: 10761

‚úÖ Saved raw data with 1999 speeches to '.\data\un_corpus_raw.csv'


In [94]:
# == Check if everything worked ==

# Load df_raw
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# View df to check structure
df_raw.head()         


Unnamed: 0,filename,speech
0,USA_13_1958.txt,"16. Mr. President, I wish first of all to cong..."
1,YUG_38_1983.txt,"Ôªø76.\t Mr. President, for me and for my countr..."
2,YEM_23_1968.txt,"38. Mr. President, allow me first to extend th..."
3,ATG_70_2015.txt,"Democracy, peace, security, rule of law and re..."
4,SGP_21_1966.txt,"\n94. Mr. President, first of all, allow m..."


### Create new variables: year, country_code and country_name

In [96]:
# Extract country code (first 3 letters) and year (last 4 digits before .txt)
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

print("Min year:", df_raw['year'].min())
print("Max year:", df_raw['year'].max())

# Match country codes to country names
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and legacy codes
custom_names = {
    "BOL": "Bolivia",
    "COD": "The Democratic Republic of the Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YMD": "Soth Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "SUN": "Soviet Union",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
    
}

# Update the main mapping with custom names
code_to_name.update(custom_names)

# Map with updated dictionary
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Check missing mappings
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

# Check structure of the df
df_raw.head() 

save_path = os.path.join(data_c, 'un_corpus_raw.pkl')
df_raw.to_pickle(save_path)
print(f"df_raw saved to {save_path}")

Min year: 1946
Max year: 2023
Missing codes: []
df_raw saved to C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\un_corpus_raw.pkl


In [97]:
# == Check the country names 

pd.set_option('display.max_rows', None)
print(df_raw[['country_code', 'country_name']].drop_duplicates().sort_values('country_code').reset_index(drop=True))
# Reset to default afterward
pd.reset_option('display.max_rows')

    country_code                          country_name
0            AFG                           Afghanistan
1            AGO                                Angola
2            ALB                               Albania
3            AND                               Andorra
4            ARE                  United Arab Emirates
5            ARG                             Argentina
6            ARM                               Armenia
7            ATG                   Antigua and Barbuda
8            AUS                             Australia
9            AUT                               Austria
10           AZE                            Azerbaijan
11           BDI                               Burundi
12           BEL                               Belgium
13           BEN                                 Benin
14           BFA                          Burkina Faso
15           BGD                            Bangladesh
16           BGR                              Bulgaria
17        

### Create variable speaker & position

In [99]:
os.chdir(data_c)
df_speakers = pd.read_excel(r"data_original\UN General Debate Corpus\Speakers_by_session.xlsx")

In [100]:
df_speakers.head()

Unnamed: 0,Year,Session,ISO Code,Country,Name of Person Speaking,Post,Unnamed: 6
0,2023,78,BRA,Brazil,Luiz Inacio Lula da Silva,President,
1,2023,78,USA,United States of America,Joseph R. Biden,President,
2,2023,78,COL,Colombia,Gustavo Petro Urrego,President,
3,2023,78,JOR,Jordan,Abdullah II ibn Al Hussein,King,
4,2023,78,POL,Poland,Andrzej Duda,President,


In [101]:
df_raw.head()

Unnamed: 0,filename,speech,country_code,year,country_name
0,USA_13_1958.txt,"16. Mr. President, I wish first of all to cong...",USA,1958,United States
1,YUG_38_1983.txt,"Ôªø76.\t Mr. President, for me and for my countr...",YUG,1983,Yugoslavia
2,YEM_23_1968.txt,"38. Mr. President, allow me first to extend th...",YEM,1968,Yemen
3,ATG_70_2015.txt,"Democracy, peace, security, rule of law and re...",ATG,2015,Antigua and Barbuda
4,SGP_21_1966.txt,"\n94. Mr. President, first of all, allow m...",SGP,1966,Singapore


In [102]:
print(df_raw[(df_raw['country_code'] == 'MEX') & (df_raw['year'] == 1982)])


Empty DataFrame
Columns: [filename, speech, country_code, year, country_name]
Index: []


In [103]:
df_merged = df_raw.merge(
    df_speakers[['Year', 'ISO Code', 'Name of Person Speaking', 'Post']],
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    how='left',
    indicator=True)

df_merged.head()


Unnamed: 0,filename,speech,country_code,year,country_name,Year,ISO Code,Name of Person Speaking,Post,_merge
0,USA_13_1958.txt,"16. Mr. President, I wish first of all to cong...",USA,1958,United States,1958.0,USA,Mr. Dulles,,both
1,YUG_38_1983.txt,"Ôªø76.\t Mr. President, for me and for my countr...",YUG,1983,Yugoslavia,1983.0,YUG,SPILJAK,President,both
2,YEM_23_1968.txt,"38. Mr. President, allow me first to extend th...",YEM,1968,Yemen,1968.0,YEM,Mr. GEGHMAN,,both
3,ATG_70_2015.txt,"Democracy, peace, security, rule of law and re...",ATG,2015,Antigua and Barbuda,2015.0,ATG,Mr. Gaston Alphonso Browne,Prime minister,both
4,SGP_21_1966.txt,"\n94. Mr. President, first of all, allow m...",SGP,1966,Singapore,1966.0,SGP,Mr. COOMARASWAMY,,both


In [104]:
df_raw[(df_raw['year'] == 1962) & (df_raw['country_name'] == "Sierra Leone")]


Unnamed: 0,filename,speech,country_code,year,country_name


In [105]:
# Merge with indicator and set unmatched rows to NA
df_merged = df_raw.merge(
    df_speakers[['Year', 'ISO Code', 'Name of Person Speaking', 'Post']],
    left_on=['year', 'country_code'],
    right_on=['Year', 'ISO Code'],
    how='left',
    indicator=True)

# Get rows with no match in df_speakers
unmatched = df_merged[df_merged['_merge'] == 'left_only']

# Print unmatched rows with selected columns
print(unmatched[['filename', 'year', 'country_code', 'country_name']])

# Drop the '_merge' column from merged df
df_merged = df_merged.drop(columns=['Year', 'ISO Code', '_merge'])

# Rename columns
df_merged = df_merged.rename(columns={
    'Name of Person Speaking': 'speaker_name',
    'Post': 'position'})

             filename  year country_code    country_name
223   MDG_18_1963.txt  1963          MDG      Madagascar
357   IND_18_1963.txt  1963          IND           India
388   GAB_49_1994.txt  1994          GAB           Gabon
389   PRT_75_2020.txt  2020          PRT        Portugal
731   YMD_33_1978.txt  1978          YMD      Soth Yemen
770   EGY_18_1963.txt  1963          EGY           Egypt
803   GIN_34_1979.txt  1979          GIN          Guinea
934    EU_68_2013.txt  2013           EU  European Union
1065  RUS_37_1982.txt  1982          RUS          Russia
1115  YMD_40_1985.txt  1985          YMD      Soth Yemen
1259  YMD_34_1979.txt  1979          YMD      Soth Yemen
1371  EGY_17_1962.txt  1962          EGY           Egypt
1544  CUB_49_1994.txt  1994          CUB            Cuba
1567  MLI_18_1963.txt  1963          MLI            Mali
1591  YMD_28_1973.txt  1973          YMD      Soth Yemen
1823  AUS_49_1994.txt  1994          AUS       Australia


In [106]:
# Manually search for the speakers of the Missings

# Assign speaker names (or NA) based on country and year
df_merged.loc[
    (df_merged['country_name'] == 'European Union') & (df_merged['year'] == 2013),
    'speaker_name'
] = 'Mr. Herman Van Rompuy'

df_merged.loc[
    (df_merged['country_name'] == 'Sierra Leone') & (df_merged['year'] == 1962),
    'speaker_name'
] = np.nan


# No reliable resource found for Sierra Leone
# https://www.prnewswire.com/news-releases/eu-newsbrief-address-by-european-council-president-van-rompuy-to-the-un-general-assembly-225266212.html


In [107]:
df_merged.head()

Unnamed: 0,filename,speech,country_code,year,country_name,speaker_name,position
0,USA_13_1958.txt,"16. Mr. President, I wish first of all to cong...",USA,1958,United States,Mr. Dulles,
1,YUG_38_1983.txt,"Ôªø76.\t Mr. President, for me and for my countr...",YUG,1983,Yugoslavia,SPILJAK,President
2,YEM_23_1968.txt,"38. Mr. President, allow me first to extend th...",YEM,1968,Yemen,Mr. GEGHMAN,
3,ATG_70_2015.txt,"Democracy, peace, security, rule of law and re...",ATG,2015,Antigua and Barbuda,Mr. Gaston Alphonso Browne,Prime minister
4,SGP_21_1966.txt,"\n94. Mr. President, first of all, allow m...",SGP,1966,Singapore,Mr. COOMARASWAMY,


### Create variable 'Amtssprache'

In [109]:
# Source for english as official language : https://gradschool.utk.edu/future-students/office-of-graduate-admissions/applying-to-graduate-school/admission-requirements/testing-requirements/countries-with-english-as-official-language/
# They are quoting: https://www.cia.gov/the-world-factbook/field/languages/

english_countries = [
    "Anguilla", "Antigua and Barbuda", "Bahamas", "Barbados", "Belize", "Belgium",
    "Bermuda", "Botswana", "British Virgin Islands", "Burundi", "Cameroon", "Canada",
    "Cayman Islands", "Christmas Island", "Cook Islands", "Dominica", "Fiji", "Gambia",
    "Ghana", "Grenada", "Guyana", "Hong Kong", "India", "Ireland", "Jersey", "Kenya",
    "Liberia", "Malawi", "Malta", "Marshall Islands", "Micronesia",
    "Namibia", "New Zealand", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
    "Pakistan", "Palau", "Papua New Guinea", "Philippines", "Pitcairn Islands", "Rwanda",
    "Saint Kitts and Nevis", "Saint Lucia", "Samoa", "Seychelles", "Sierra Leone", "Singapore",
    "Sint Maarten", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Sudan",
    "Eswatini", "Tanzania", "Tonga", "Trinidad and Tobago", "Turks and Caicos Islands",
    "Tuvalu", "Uganda", "Zambia", "Zimbabwe"
]

# 2. Create dummy column
df_raw['englisch_official_language'] = df_raw['country_name'].apply(
    lambda x: 1 if x in english_countries else 0
)

# 3. Find countries in the list that did not match any entry in df_raw
matched = set(df_raw['country_name'])
unmatched = [country for country in english_countries if country not in matched]

# 4. Print unmatched country names
print("Countries not matched in df_raw['country_name']:")
for country in unmatched:
    print(country)

# All of these countries are either British Overseas Territories, Australian Territories, self-governing island territories or Special Administrative Regions
    # None of the unmatched regions are UN Members


Countries not matched in df_raw['country_name']:
Anguilla
Bermuda
British Virgin Islands
Cayman Islands
Christmas Island
Cook Islands
Hong Kong
Jersey
Niue
Norfolk Island
Northern Mariana Islands
Pitcairn Islands
Sint Maarten
Turks and Caicos Islands


In [110]:
df_raw.head()

Unnamed: 0,filename,speech,country_code,year,country_name,englisch_official_language
0,USA_13_1958.txt,"16. Mr. President, I wish first of all to cong...",USA,1958,United States,0
1,YUG_38_1983.txt,"Ôªø76.\t Mr. President, for me and for my countr...",YUG,1983,Yugoslavia,0
2,YEM_23_1968.txt,"38. Mr. President, allow me first to extend th...",YEM,1968,Yemen,0
3,ATG_70_2015.txt,"Democracy, peace, security, rule of law and re...",ATG,2015,Antigua and Barbuda,1
4,SGP_21_1966.txt,"\n94. Mr. President, first of all, allow m...",SGP,1966,Singapore,1


## Create variable for permanent member security council

In [135]:
pd.set_option('display.max_rows', None)  # show all rows
print(df_merged[df_merged['country_code'].isin(['SUN', 'RUS'])])


             filename                                             speech  \
38    RUS_09_1954.txt  In accordance with its practice, the General A...   
110   RUS_13_1958.txt  1.\tSessions of the United Nations General Ass...   
111   RUS_13_1958.txt  1.\tSessions of the United Nations General Ass...   
412   RUS_23_1968.txt  56. Allow me, Mr. President, on behalf of the ...   
598   RUS_43_1988.txt  Ôªø\nAs my first duty, permit me, Sir, to congra...   
630   RUS_62_2007.txt  Traditionally, the General Assembly sums up \n...   
839   RUS_61_2006.txt  Traditionally, the General Assembly session \n...   
879   RUS_69_2014.txt  There is growing evidence today of a contradic...   
1065  RUS_37_1982.txt  Mr. Hollai, allow me to congratulate you on th...   
1096  RUS_48_1993.txt  First of all, I should like to congratulate yo...   
1135  RUS_73_2018.txt  The statements delivered during this session‚Äôs...   
1318  RUS_16_1961.txt  I take this opportunity, Mr. President, to con...   
1353  RU

In [113]:
pd.set_option('display.max_rows', None)  # show all rows
print(df_raw)
pd.reset_option('display.max_rows')     # reset to default after

             filename                                             speech  \
0     USA_13_1958.txt  16. Mr. President, I wish first of all to cong...   
1     YUG_38_1983.txt  Ôªø76.\t Mr. President, for me and for my countr...   
2     YEM_23_1968.txt  38. Mr. President, allow me first to extend th...   
3     ATG_70_2015.txt  Democracy, peace, security, rule of law and re...   
4     SGP_21_1966.txt  \n94.     Mr. President, first of all, allow m...   
5     MRT_53_1998.txt  Allow me first of all, on behalf of my country...   
6     COG_57_2002.txt  ÔªøThis\nsession of the General Assembly is begi...   
7     GBR_24_1969.txt  64. Madam President,\nit gives me great pleasu...   
8     AUT_33_1978.txt  Ôªø153.\tIt gives me great pleasure to congratul...   
9     DOM_02_1947.txt  The voice of the Dominican Republic, which is ...   
10    SAU_32_1977.txt  Ôªø \n\n1.\tMr. President, on behalf of the dele...   
11    SWE_58_2003.txt  ÔªøIn the early morning of\n11 September 2003, o...   
12

## Pre-processing

### Cleaning

In [151]:
# == Clean text by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

# Define cleaning function
def clean_text(content):
    if pd.isna(content):
        return ""
    
    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces
    content = ' '.join(content.split())

    # Fix punctuation spacing (e.g. "word,another" ‚Üí "word, another")
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" ‚Üí "international")
    content = re.sub(r'-\s', '', content)

    ############NEW

     # Replace hyphen between letters with a space to prevent merging words (e.g. "russian-and" ‚Üí "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)
    #################NEW

    # Remove stray backslashes
    content = content.replace("\\", "")

    # Escape double quotes for CSV safety
    content = content.replace('"', '""')

    return content

# Apply cleaning to each speech
df_raw['speech'] = df_raw['speech'].astype(str)  # Ensure column is string type
df_clean = df_raw.copy()
df_clean['speech'] = df_clean['speech'].apply(clean_text)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [152]:
# == Split cleaned data into chunks and save as separate files ==

# Convert cleaned DataFrame to list of lists
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Split cleaned data into 4 equal chunks
data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

# Change directory to the temp folder
os.chdir(data_temp)  # make sure `data_temp` exists and is defined

# Save each chunk with joblib
joblib.dump(data_id1, 'cleanspeeches_indexed1.pkl')
joblib.dump(data_id2, 'cleanspeeches_indexed2.pkl')
joblib.dump(data_id3, 'cleanspeeches_indexed3.pkl')
joblib.dump(data_id4, 'cleanspeeches_indexed4.pkl')

# Store list of cleaned data chunk paths to feed into preprocessing function later
data_files = [
    os.path.join(data_temp, 'cleanspeeches_indexed1.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed2.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed3.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed4.pkl')
]

print(f"‚úÖ Saved clean speeches chunks in '{data_temp}'")

‚úÖ Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Advanced Text Pre-Processing

In [154]:
# == Define function to Tokenize, eliminate digits, remove stopwords, lemmatize, POS-Tagging ==

def pro1(lista):
    # Remove punctuation
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    # Tokenize and lowercase with gensim
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
    # Remove digits
        return [[row[0], [w for w in row[1] if not any(char.isdigit() for char in w)]] for row in lista]

def pro4(lista):
    # Drop short words
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]

def tags_spacy(lista):
    texts = [' '.join(row[1]) for row in lista]
    docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
    result = []
    for i, doc in enumerate(docs):
        filtered_tokens = [token.text for token in doc if token.tag_.startswith(('N', 'V', 'J'))]
        result.append([lista[i][0], filtered_tokens])
    return result


def pro5(lista):
    # Remove stopwords using SpaCy stopword list
    return [[row[0], [w for w in row[1] if w not in SPACY_STOPWORDS]] for row in lista]

def pro6(lista):
      return [
        [row[0], [stemmer.stem(token) for token in row[1]]]
        for row in lista
    ]
   # texts = [' '.join(row[1]) for row in lista]
   # docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
   # result = []
   # for i, doc in enumerate(docs):
    # lemmatized = [token.lemma_ for token in doc]
     #    result.append([lista[i][0], lemmatized])
  #  return result

########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    # Drop empty speeches
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [155]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags_spacy(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = pro5(data)
   # data = pro6(data)
    data = dropnull(data)

    # out_name = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '_temp.pkl')

    # original filename replacement
    filename_wordcloud = data_name.replace('cleanspeeches_', 'wordcloud_speeches_').replace('.pkl', '.pkl')

    # full path in data_preprocessed folder
    out_name_wordcloud = os.path.join(data_preprocessed, os.path.basename(filename_wordcloud))

  #  joblib.dump(data, out_name)
    joblib.dump(data, out_name_wordcloud)

    data_stemmed = pro6(data)

    filename_preprocessed = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '.pkl')
    out_preprocessed = os.path.join(data_preprocessed, os.path.basename(filename_preprocessed))
    joblib.dump(data_stemmed, out_preprocessed)
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in data_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Before tagging: 24.05s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] After tagging: 465.79s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Done. Total time: 546.51s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] Before tagging: 21.03s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] After tagging: 445.62s
[C:\Users\sara

In [156]:
# Store the pre-processed data
preprocessed_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4.pkl')
]

wordcloud_files = [
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed4.pkl')
]

In [157]:
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [160]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

def remove_rare_words(filenames, freqs, min_count=10):
    for fname in filenames:
        data = joblib.load(fname)
        filtered_data = []
        for doc_id, tokens in data:
            filtered_tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
            filtered_data.append([doc_id, filtered_tokens])
        joblib.dump(filtered_data, fname)  # overwrite or save as new file
        print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
word_counts_stemmed = count_frequencies(preprocessed_files)

remove_rare_words(preprocessed_files, word_counts_stemmed, min_count=10)

print("\n[Stemmed] Top 100 most common words:")
for word, count in word_counts_stemmed.most_common(100):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 300 least common words:")
for word, count in word_counts_stemmed.most_common()[:-301:-1]:
    print(f"{word}: {count}")

# Save stemmed word counts
save_path_stemmed = os.path.join(data_freq, 'word_counts_stemmed.pkl')
joblib.dump(word_counts_stemmed, save_path_stemmed)

# === Count for wordcloud (unstemmed) speeches ===
word_counts_wordcloud = count_frequencies(wordcloud_files)

print("\n[Wordcloud] Top 100 most common words:")
for word, count in word_counts_wordcloud.most_common(100):
    print(f"{word}: {count}")

print("\n[Wordcloud] Top 300 least common words:")
for word, count in word_counts_wordcloud.most_common()[:-301:-1]:
    print(f"{word}: {count}")

# Save unstemmed word counts
save_path_wordcloud = os.path.join(data_freq, 'word_counts_wordcloud.pkl')
joblib.dump(word_counts_wordcloud, save_path_wordcloud)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:29<00:00,  7.28s/it]


Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed2.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed3.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed4.pkl: removed words with freq < 10

[Stemmed] Top 100 most common words:
nation: 232332
unit: 186036
countri: 176251
intern: 159425
develop: 145364
peac: 133382
world: 131630
state: 128681
peopl: 126148
secur: 84579
general: 76542
govern: 74954
econom: 72977
organ: 68141
right: 65874
year: 65829
assembl: 63469
new: 58990
effort: 56971
problem: 56698
human: 56072
support: 54788
continu: 53001
communiti: 48791
region: 48258
polit: 48075
time: 47283
member: 42574
africa: 42

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:29<00:00,  7.34s/it]



[Wordcloud] Top 100 most common words:
nations: 185541
united: 183294
international: 152545
world: 128865
countries: 117093
peace: 110332
states: 90068
development: 85218
people: 80324
security: 78421
general: 74810
economic: 72599
assembly: 62091
new: 58990
country: 57215
government: 56448
organization: 50483
human: 48889
efforts: 46335
political: 45844
peoples: 45773
community: 44912
rights: 44034
support: 40749
africa: 40620
council: 39845
session: 39784
time: 38904
war: 36554
state: 34415
south: 34165
great: 33644
problems: 33302
republic: 33286
years: 33171
national: 33042
year: 32566
order: 32067
nuclear: 32003
developing: 31781
situation: 31337
global: 30966
work: 30014
social: 29765
conference: 28324
hope: 27770
charter: 27624
president: 27511
today: 26394
continue: 26269
important: 26160
need: 25962
region: 25912
african: 25701
relations: 25428
progress: 25116
principles: 24764
east: 24565
action: 24210
respect: 23983
weapons: 23796
problem: 23396
future: 23232
secretary: 232

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\word_counts_wordcloud.pkl']

In [161]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [163]:
# == Count dictionary words

# Load dictionaries            ##### How did they come up with this dictionary? Why did they exclude words?

affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

with open(affect_path, 'rb') as f:
    affect_dict = pickle.load(f)
print("Contents of affect dictionary:")
print(affect_dict)
print("Number of words in affect dictionary:", len(affect_dict))

with open(cognition_path, 'rb') as f:
    cognition_dict = pickle.load(f)
print("Contents of cognition dictionary:")
print(cognition_dict)
print("Number of words in cognition dictionary:", len(cognition_dict))

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

a = [[i, word_counts_stemmed[i]] for i in affect if i in word_counts_stemmed]
c = [[i, word_counts_stemmed[i]] for i in cognition if i in word_counts_stemmed]

a = sorted(a, key=lambda x: x[1], reverse=True)
c = sorted(c, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a]
c = [[i[0], f"({i[1]}),"] for i in c]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

os.makedirs(data_freq, exist_ok=True)  # ensure directory exists

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

In [167]:
# == Calculate weighted frequencies for all words

 # STEMMED OR NOT?
# - downweights very common words by giving more importance to rare ones
word_counts_stemmed = joblib.load(os.path.join(data_freq, 'word_counts_stemmed.pkl'))

l = sum(word_counts_stemmed.values())

a = 0.001
word_counts_weighted = {k: a / (a + (v / l)) for k, v in word_counts.items()}
#for key in word_counts.keys():
 #   word_counts[key] = a / (a + (word_counts[key] / l))

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

################################################################################ ISSUE##################
# To print top 100 by weighted values, sort the dictionary by value descending:
top_100_weighted = sorted(word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:100]

print("Top 100 words by weighted frequency:")
for word, weight in top_100_weighted:
    print(f"{word}: {weight}")


Top 100 words by weighted frequency:
disloy: 0.9993014671495019
prick: 0.9993014671495019
litr: 0.9993014671495019
chen: 0.9993014671495019
dualism: 0.9993014671495019
praia: 0.9993014671495019
lexicon: 0.9993014671495019
insuffer: 0.9993014671495019
siad: 0.9993014671495019
nothing: 0.9993014671495019
neat: 0.9993014671495019
sojourn: 0.9993014671495019
therefor: 0.9993014671495019
cento: 0.9993014671495019
lish: 0.9993014671495019
pyrrhic: 0.9993014671495019
tenth: 0.9993014671495019
mondlan: 0.9993014671495019
resolu: 0.9993014671495019
tive: 0.9993014671495019
kiloton: 0.9993014671495019
atyp: 0.9993014671495019
dan: 0.9993014671495019
midrand: 0.9993014671495019
mahamadou: 0.9993014671495019
shrank: 0.9993014671495019
unorthodox: 0.9993014671495019
foci: 0.9993014671495019
finer: 0.9993014671495019
gust: 0.9993014671495019
underworld: 0.9993014671495019
bluster: 0.9993014671495019
sneer: 0.9993014671495019
shirt: 0.9993014671495019
stealth: 0.9993014671495019
brigand: 0.9993014671