# Emotion and Reason in Political Language: Examining the UN General Debate Speeches
## Script 1: Preprocessing & Word Frequencies
### by Sarah Franzen

### Description: 
#### - Extract the speeches from their original txt files and store them as one CSV
#### - Clean and pre-process the speeches and count word frequencies


## Setup, Installation and Verification of required Packages and Libraries

In [264]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys
    !{sys.executable} -m pip install pandas
    !{sys.executable} -m pip install nltk
    !{sys.executable} -m pip install spacy
    !{sys.executable} -m pip install numpy
    !{sys.executable} -m pip install gensim
    !{sys.executable} -m pip install pycountry
    !{sys.executable} -m pip install wordcloud matplotlib

#########################
# Check if all packages are included
##########################

In [266]:
# == Import standard and third-party libraries for data processing, NLP, and visualization ==

import gensim
import joblib
import matplotlib.pyplot as plt
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time

from collections import Counter
from itertools import chain
from multiprocessing import Pool, freeze_support
# from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm
from wordcloud import WordCloud

# === Initialize NLP Tools ===

# Translation table that deletes every character contained in
# string.punctuation (applied to the speech texts in pro1)
translator = str.maketrans('', '', punctuation)

# POS tagger (not used by SpaCy, but optionally available via NLTK)
# NOTE(review): `tagger` is not referenced by any other code visible in this
# notebook — confirm whether it is still needed.
tagger = nltk.perceptron.PerceptronTagger()

# Load the small English SpaCy model without the named-entity recognizer and
# the dependency parser; `nlp` is used by tags_spacy() and pro6()
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])


# === Set Working Directory ===

# Set your working directory (adjust this as needed).
# Relative paths such as r".\data\..." in later cells are resolved against it.
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
os.chdir(wd)

# === Define Folder Paths ===

# Make sure that you have these folders in your working directory
# (they are only named here, not created)
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')

In [267]:
# Show the directory against which relative paths are currently resolved
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


In [268]:
# Set DownloadAdditions to True if you need to download these additional resources.
# NOTE(review): this downloads 'en_core_web_lg', whereas the notebook loads
# 'en_core_web_sm' via spacy.load() — confirm which model is intended.

DownloadAdditions = False
if DownloadAdditions:
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)

## Load and Prepare Corpus

This chunk can be skipped if `un_corpus_raw.pkl` already exists in the `data` folder; the cell after it reloads the corpus from that file.

In [271]:
# == Load and Save Sample from UN General Debate Corpus ==             ######################## ADJUST LATER

# Folder containing the original TXT files (relative to the working directory)
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

# Number of speeches drawn for the working sample   ############################ REMOVE AT LATER POINT
SAMPLE_SIZE = 1500

# Gather all relevant txt-files; files whose name starts with '._' are skipped
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

# Fail early with a clear message instead of producing an empty DataFrame
# that breaks the later cells with a less obvious error.
if not all_txt_files:
    raise FileNotFoundError(f"No .txt files found below '{base_folder}'")

# Randomly pick up to SAMPLE_SIZE files from the full collection.
# random.sample() raises ValueError when asked for more items than exist,
# so the request is capped at the number of available files.
sampled_files = random.sample(all_txt_files, min(SAMPLE_SIZE, len(all_txt_files)))

# Read the selected files into a list
raw_data = []
for filepath in sampled_files:
    # 'utf-8-sig' reads plain UTF-8 and additionally drops a leading
    # byte-order mark, so it does not end up as the first character of the speech.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        content = f.read()
    raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

# Create DataFrame from the collected speeches (columns: filename, speech)
df_raw = pd.DataFrame(raw_data)

# Save df_raw as a pickle file for quick future loading
raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

# Export df as CSV (semicolon-separated)
raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n✅ Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")



✅ Saved raw data with 1500 speeches to '.\data\un_corpus_raw.csv'


In [272]:
# == Check if everything worked ==

# Load df_raw from the pickle written by the sampling cell
# (the path is relative to the current working directory)
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# View df to check structure (first rows are displayed as the cell output)
df_raw.head()         


Unnamed: 0,filename,speech
0,MWI_57_2002.txt,﻿On behalf of the\nGovernment of the Republic ...
1,IND_31_1976.txt,Let me begin by extending to the President my ...
2,BLR_23_1968.txt,24. The delegation of the Byelorussian Soviet ...
3,VUT_56_2001.txt,"﻿At the outset, I wish to\nextend to Mr. Han S..."
4,BHR_40_1985.txt,"Mr. President, in the name of His Highness Sha..."


In [273]:
# == Create new variables: year, country_code and country_name ==

# Extract country code (the 2-3 uppercase letters at the start of the filename)
# and year (the 4 digits directly before ".txt", preceded by an underscore).
# astype(int) fails if a filename does not match the year pattern.
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

# Match country codes to country names (alpha-3 code -> name as listed by pycountry)
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and legacy codes
# NOTE(review): "YD" and "YMD" are both labelled "Yemen"; presumably this is the
# same label pycountry gives to YEM, which would merge these entities in any
# grouping by country_name — confirm this is intended.
custom_names = {
    "BOL": "Bolivia",
    "COD": "The Democratic Republic of the Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YD":  "Yemen",
    "YMD": "Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "SUN": "Soviet Union",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
    
}

# Update the main mapping with custom names
# (custom entries replace the pycountry name for codes present in both)
code_to_name.update(custom_names)

# Map with updated dictionary; codes without an entry become NaN
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Check missing mappings (an empty list means every code was resolved)
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

# Check structure of the df
df_raw.head() 


Missing codes: []


Unnamed: 0,filename,speech,country_code,year,country_name
0,MWI_57_2002.txt,﻿On behalf of the\nGovernment of the Republic ...,MWI,2002,Malawi
1,IND_31_1976.txt,Let me begin by extending to the President my ...,IND,1976,India
2,BLR_23_1968.txt,24. The delegation of the Byelorussian Soviet ...,BLR,1968,Belarus
3,VUT_56_2001.txt,"﻿At the outset, I wish to\nextend to Mr. Han S...",VUT,2001,Vanuatu
4,BHR_40_1985.txt,"Mr. President, in the name of His Highness Sha...",BHR,1985,Bahrain


In [274]:
# == Check the country names ==

# Print every distinct (country_code, country_name) pair, sorted by code.
# option_context lifts the row limit only for this print and restores the
# previous 'display.max_rows' value afterwards, even if printing fails.
with pd.option_context('display.max_rows', None):
    print(
        df_raw[['country_code', 'country_name']]
        .drop_duplicates()
        .sort_values('country_code')
        .reset_index(drop=True)
    )

    country_code                          country_name
0            AFG                           Afghanistan
1            AGO                                Angola
2            ALB                               Albania
3            AND                               Andorra
4            ARE                  United Arab Emirates
5            ARG                             Argentina
6            ARM                               Armenia
7            ATG                   Antigua and Barbuda
8            AUS                             Australia
9            AUT                               Austria
10           AZE                            Azerbaijan
11           BDI                               Burundi
12           BEL                               Belgium
13           BEN                                 Benin
14           BFA                          Burkina Faso
15           BGD                            Bangladesh
16           BGR                              Bulgaria
17        

## Pre-processing

### Cleaning

In [280]:
# == Clean text by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

# Define cleaning function
def clean_text(content):
    """Normalize the raw text of a single speech.

    The steps run in this order: drop byte-order marks, turn line breaks
    into spaces, collapse whitespace runs, insert a space after '.' or ','
    when a non-space character follows directly, delete every hyphen that
    is followed by whitespace (re-joining words split at a line end),
    remove backslashes, and double every double quote.

    Args:
        content: Raw speech text, or a missing value (None / NaN).

    Returns:
        The cleaned text, or "" when ``content`` is missing.
    """
    if pd.isna(content):
        return ""

    # Drop Unicode byte-order marks (U+FEFF). They are not whitespace for
    # str.split(), so they would otherwise stay glued to the first word.
    content = content.replace('\ufeff', '')

    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces (also trims leading/trailing whitespace)
    content = ' '.join(content.split())

    # Fix punctuation spacing (e.g. "word,another" → "word, another")
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" → "international").
    # Runs after the line breaks became spaces, hence the "-<space>" pattern.
    content = re.sub(r'-\s', '', content)

    # Remove stray backslashes
    content = content.replace("\\", "")

    # Escape double quotes for CSV safety
    # NOTE(review): pandas' to_csv already quotes fields itself, so text
    # cleaned here would be escaped twice when written with it — confirm
    # this manual doubling is still wanted.
    content = content.replace('"', '""')

    return content

# Apply cleaning to each speech (on a copy, so df_raw itself stays untouched)
df_clean = df_raw.copy()

# Cast to str only where a value is present. A blanket astype(str) would turn
# missing values into the literal text "nan", which clean_text() can no longer
# recognise as missing and which would survive the empty-speech filter below.
df_clean['speech'] = df_clean['speech'].apply(
    lambda speech: clean_text(speech if pd.isna(speech) else str(speech))
)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [281]:
# == Split cleaned data into chunks and save as separate files ==

# Flatten the cleaned DataFrame into a list of [filename, speech] rows
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Quarter boundaries (fractions are truncated); the last chunk takes the remainder
total_rows = len(clean_data)
first_cut = int(total_rows / 4)
second_cut = int(2 * total_rows / 4)
third_cut = int(3 * total_rows / 4)

data_id1 = clean_data[:first_cut]
data_id2 = clean_data[first_cut:second_cut]
data_id3 = clean_data[second_cut:third_cut]
data_id4 = clean_data[third_cut:]

# Switch to the temp folder so the bare file names below are written there
# (`data_temp` has to exist already)
os.chdir(data_temp)

# One pickle per chunk, written with joblib
chunk_names = [
    'cleanspeeches_indexed1_n.pkl',
    'cleanspeeches_indexed2_n.pkl',
    'cleanspeeches_indexed3_n.pkl',
    'cleanspeeches_indexed4_n.pkl',
]
for chunk, chunk_name in zip((data_id1, data_id2, data_id3, data_id4), chunk_names):
    joblib.dump(chunk, chunk_name)

# Full paths of the chunk files, used later as input of the preprocessing function
data_files = [os.path.join(data_temp, chunk_name) for chunk_name in chunk_names]

print(f"✅ Saved clean speeches chunks in '{data_temp}'")

✅ Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Advanced Text Pre-Processing

In [283]:
# == Define function to Tokenize, eliminate digits, remove stopwords, lemmatize, POS-Tagging ==

# Define sets for extra removals
# Ordinal numbers: "1st"-"10th" and "first"-"tenth"
# (extend up to "1000th" if needed, or cover further common ordinals)
ORDINALS = {
    "1st", "2nd", "3rd", "4th", "5th",
    "6th", "7th", "8th", "9th", "10th",
    "first", "second", "third", "fourth", "fifth",
    "sixth", "seventh", "eighth", "ninth", "tenth",
}

# Cardinal number words: zero-nineteen, the tens, "hundred" and "thousand"
# (extend as needed)
CARDINALS = {
    "zero", "one", "two", "three", "four", "five",
    "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
    "twenty", "thirty", "forty", "fifty", "sixty",
    "seventy", "eighty", "ninety", "hundred", "thousand",
}

# Lowercase English month names
MONTHS = {
    "january", "february", "march", "april",
    "may", "june", "july", "august",
    "september", "october", "november", "december",
}

# Lowercase English weekday names
WEEKDAYS = {
    "monday", "tuesday", "wednesday", "thursday",
    "friday", "saturday", "sunday",
}

def pro1(lista):
    """Delete punctuation from the text (second element) of every row.

    Uses the module-level `translator` table; the first element of each
    row is passed through unchanged.
    """
    stripped = []
    for row in lista:
        stripped.append([row[0], row[1].translate(translator)])
    return stripped

def pro2(lista):
    """Replace each row's text by the token list from gensim's simple_preprocess."""
    tokenized = []
    for row in lista:
        tokens = gensim.utils.simple_preprocess(row[1])
        tokenized.append([row[0], tokens])
    return tokenized

def pro3(lista):
    """Drop every token that contains at least one digit character."""

    def has_digit(token):
        # True as soon as one character of the token is a digit
        for char in token:
            if char.isdigit():
                return True
        return False

    return [
        [row[0], [token for token in row[1] if not has_digit(token)]]
        for row in lista
    ]

def pro4(lista):
    """Keep only tokens that are at least three characters long."""
    MIN_LENGTH = 3
    filtered = []
    for row in lista:
        long_tokens = [token for token in row[1] if len(token) >= MIN_LENGTH]
        filtered.append([row[0], long_tokens])
    return filtered

def tags_spacy(lista):
    """Keep only tokens whose spaCy tag starts with 'N', 'V' or 'J'.

    Each row's tokens are re-joined with spaces, run through the module-level
    `nlp` pipeline, and replaced by the text of the spaCy tokens that pass
    the tag filter. (Presumably these prefixes select nouns, verbs and
    adjectives — confirm against the tag set of the loaded model.)

    Args:
        lista: List of [identifier, list_of_tokens] rows.

    Returns:
        A new list of [identifier, filtered_tokens] rows in the same order.
    """
    texts = [' '.join(row[1]) for row in lista]
    result = []
    # nlp.pipe yields the Docs in input order. Zipping consumes them one at
    # a time, so the Doc objects of the whole chunk are never held in memory
    # together, unlike list(nlp.pipe(...)).
    for row, doc in zip(lista, nlp.pipe(texts, batch_size=20, n_process=1)):
        filtered_tokens = [
            token.text for token in doc if token.tag_.startswith(('N', 'V', 'J'))
        ]
        result.append([row[0], filtered_tokens])
    return result

# Function to remove these from token lists
def remove_extra_stopwords(lista):
    """Drop ordinals, cardinals, month names and weekday names from each row.

    Tokens are lowercased for the comparison only; kept tokens are returned
    as they were. Words that share a spelling with an entry (e.g. "may",
    "march", "second") are removed as well.
    """
    combined_remove = ORDINALS.union(CARDINALS, MONTHS, WEEKDAYS)
    cleaned = []
    for row in lista:
        kept = [token for token in row[1] if token.lower() not in combined_remove]
        cleaned.append([row[0], kept])
    return cleaned

def pro5(lista):
    """Drop every token that appears in the SpaCy stop word list."""
    without_stopwords = []
    for row in lista:
        kept = [token for token in row[1] if token not in SPACY_STOPWORDS]
        without_stopwords.append([row[0], kept])
    return without_stopwords

def pro6(lista):
    """Lemmatize the tokens of every row with spaCy.

    Each row's tokens are re-joined with spaces and run through the
    module-level `nlp` pipeline; the row's token list is replaced by one
    lemma (`token.lemma_`) per spaCy token of that re-joined text.

    Args:
        lista: List of [identifier, list_of_tokens] rows.

    Returns:
        A new list of [identifier, list_of_lemmas] rows in the same order.
    """
    texts = [' '.join(row[1]) for row in lista]
    result = []
    # nlp.pipe yields the Docs in input order. Zipping consumes them one at
    # a time instead of first collecting every Doc of the chunk in a list.
    for row, doc in zip(lista, nlp.pipe(texts, batch_size=20, n_process=1)):
        lemmatized = [token.lemma_ for token in doc]
        result.append([row[0], lemmatized])
    return result

########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    """Drop rows whose tokens, joined with single spaces, give an empty string."""
    kept_rows = []
    for row in lista:
        joined = ' '.join(row[1])
        if joined:
            kept_rows.append(row)
    return kept_rows

In [None]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    """Run the full pre-processing pipeline on one chunk file.

    Loads the joblib file `data_name`, applies pro1-pro4, tags_spacy,
    remove_extra_stopwords, pro5, pro6 and dropnull in that order, and
    writes the result to the same path with 'cleanspeeches_' replaced by
    'preprocessed_speeches_'. Progress and timings are printed; nothing
    is returned.
    """
    started = time.time()
    print(f"Starting preprocessing for {data_name}...")

    speeches = joblib.load(data_name)

    # Punctuation, tokenization, digit and short-word removal
    for step in (pro1, pro2, pro3, pro4):
        speeches = step(speeches)

    print(f"[{data_name}] Before tagging: {time.time() - started:.2f}s")
    speeches = tags_spacy(speeches)
    print(f"[{data_name}] After tagging: {time.time() - started:.2f}s")

    # Extra stop words, SpaCy stop words, lemmatization, empty-row removal
    for step in (remove_extra_stopwords, pro5, pro6, dropnull):
        speeches = step(speeches)

    target_path = data_name.replace('cleanspeeches_', 'preprocessed_speeches_')
    joblib.dump(speeches, target_path)

    print(f"[{data_name}] Done. Total time: {time.time() - started:.2f}s\n")

def main():
    """Pre-process every chunk file listed in `data_files`, one after another."""
    for chunk_path in data_files:
        preprocessing(chunk_path)


# Start the pipeline only when this code runs as the main module
if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1_n.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1_n.pkl] Before tagging: 4.77s


In [None]:
# Paths of the pre-processed chunk files written by preprocessing().
# They are built from data_temp so that loading them does not depend on the
# current working directory still being the temp folder.
preprocessed_files = [
    os.path.join(data_temp, f'preprocessed_speeches_indexed{i}_n.pkl')
    for i in range(1, 5)
]

## Word-Frequencies

In [None]:
#== Count token frequencies ==

def count_frequencies(filenames):
    """Count how often each token occurs across the given joblib files.

    Every file is expected to hold rows whose second element is a token
    list; rows where it is not a list are skipped. A tqdm progress bar
    advances once per file.

    Returns:
        A collections.Counter mapping token -> total count.
    """
    total_freq = Counter()
    for path in tqdm(filenames):
        for row in joblib.load(path):
            if isinstance(row[1], list):
                total_freq.update(row[1])
    return total_freq

# Count tokens over all pre-processed chunks
word_counts = count_frequencies(preprocessed_files)

# Persist the counts in the frequency folder. The original call dumped the
# undefined name `freqs` to a placeholder path and raised a NameError.
joblib.dump(word_counts, os.path.join(data_freq, 'word_counts.pkl'))

# Print the top 100 most common words
print("Top 100 most common words:")
for word, count in word_counts.most_common(100):
    print(f"{word}: {count}")

In [None]:
# == Wordcloud with the most common words

# Appearance of the word cloud
cloud_settings = {
    'width': 1200,
    'height': 800,
    'background_color': 'white',
    'colormap': 'viridis',  # try 'plasma', 'cool', 'spring', etc.
    'max_words': 200,
    'contour_color': 'steelblue',
    'contour_width': 2,
}

# Build the cloud directly from the token counts
wordcloud = WordCloud(**cloud_settings).generate_from_frequencies(word_counts)

# Show the word cloud without axes
plt.figure(figsize=(14, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Most Frequent Words", fontsize=20)
plt.show()
