# Emotion and Reason in Political Language: Examining the UN General speeches
## Script 1: Preprocessing & Token Frequencies
### by Sarah Franzen

### Description: 
#### - Extract documents from their original txt documents and store them as one csv
#### - Data Cleaning and Pre-Processing
#### - Count word frequencies and weight them


## Setup, Installation and Verification of required Packages and Libraries

In [51]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys
    !{sys.executable} -m pip install pandas
    !{sys.executable} -m pip install nltk
    !{sys.executable} -m pip install spacy
    !{sys.executable} -m pip install numpy
    !{sys.executable} -m pip install gensim
    !{sys.executable} -m pip install pycountry
    !{sys.executable} -m pip install wordcloud matplotlib

#########################
# Check if all packages are included
##########################

In [53]:
# == Import standard and third-party libraries for data processing, NLP, and visualization ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time

from collections import Counter
from itertools import chain
#from matplotlib.colors import ListedColormap
from multiprocessing import Pool, freeze_support
# from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm

# === Initialize NLP Tools ===

# Translator to remove punctuation
translator = str.maketrans('', '', punctuation)

# POS tagger (not used by SpaCy, but optionally available via NLTK)
tagger = nltk.perceptron.PerceptronTagger()

# Load SpaCy English model with unnecessary components disabled
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])


# === Set Working Directory ===

# Set your working directory (adjust this as needed)
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
os.chdir(wd)

# === Define Folder Paths ===

# Make sure that you have these folders in your working directory
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')
data_dict = os.path.join(data_c, 'dictionaries')
data_preprocessed = os.path.join(data_c, 'preprocessed')
fig = os.path.join(wd, 'Code/0_data_preparation_descriptives/fig')

In [54]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


In [55]:
# Set DownloadAdditions to True if you need to download these additional resources.

DownloadAdditions = False
if DownloadAdditions:
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)

## Load and Prepare Corpus

This chunk can be skipped at the moment

In [58]:
# == Load and Save Sample from UN General Debate Corpus ==             ######################## ADJUST LATER

# Set Folder path containing the original TXT files    
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

#  Gather all relevant txt-files
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

# Randomly pick 800 files from the full collection   ###################################################### REMOVE AT LATER POINT
sampled_files = random.sample(all_txt_files, 500)

# Read the selected files into a list
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

# Create DataFrame from the collected speeches
df_raw = pd.DataFrame(raw_data)

# Save df_raw as a pickle file for quick future loading
raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

# Export df as CSV 
raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n✅ Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")



✅ Saved raw data with 500 speeches to '.\data\un_corpus_raw.csv'


In [59]:
# == Check if everything worked ==

# Load df_raw
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# View df to check structure
df_raw.head()         


Unnamed: 0,filename,speech
0,PAN_66_2011.txt,It is an honour for me to address the \nintern...
1,CMR_78_2023.txt,The President of the Republic of Cameroon. His...
2,MYS_38_1983.txt,"﻿100.\t At the outset, may I, on behalf of the..."
3,MRT_54_1999.txt,"At the outset, I wish, on behalf of the Islami..."
4,NER_41_1986.txt,"On 24 October 1985, the community of nations c..."


In [97]:
# == Create new variables: year, country_code and country_name ==

# Extract country code (first 3 letters) and year (last 4 digits before .txt)
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

# Match country codes to country names
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and legacy codes
custom_names = {
    "BOL": "Bolivia",
    "COD": "The Democratic Republic of the Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    "YD":  "Yemen",
    "YMD": "Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "SUN": "Soviet Union",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
    
}

# Update the main mapping with custom names
code_to_name.update(custom_names)

# Map with updated dictionary
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Check missing mappings
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

# Check structure of the df
df_raw.head() 

save_path = os.path.join(data_c, 'un_corpus_raw.pkl')
df_raw.to_pickle(save_path)
print(f"df_raw saved to {save_path}")

Missing codes: []
df_raw saved to C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\un_corpus_raw.pkl


In [99]:
# == Check the country names 

pd.set_option('display.max_rows', None)
print(df_raw[['country_code', 'country_name']].drop_duplicates().sort_values('country_code').reset_index(drop=True))
# Reset to default afterward
pd.reset_option('display.max_rows')

    country_code                          country_name
0            AFG                           Afghanistan
1            AGO                                Angola
2            ALB                               Albania
3            ARE                  United Arab Emirates
4            ARG                             Argentina
5            ARM                               Armenia
6            ATG                   Antigua and Barbuda
7            AUS                             Australia
8            AUT                               Austria
9            AZE                            Azerbaijan
10           BDI                               Burundi
11           BEL                               Belgium
12           BEN                                 Benin
13           BFA                          Burkina Faso
14           BGD                            Bangladesh
15           BGR                              Bulgaria
16           BHR                               Bahrain
17        

## Pre-processing

### Cleaning

In [67]:
# == Clean text by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

# Define cleaning function
def clean_text(content):
    if pd.isna(content):
        return ""
    
    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces
    content = ' '.join(content.split())

    # Fix punctuation spacing (e.g. "word,another" → "word, another")
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" → "international")
    content = re.sub(r'-\s', '', content)

    ############NEW

     # Replace hyphen between letters with a space to prevent merging words (e.g. "russian-and" → "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)
    #################NEW

    # Remove stray backslashes
    content = content.replace("\\", "")

    # Escape double quotes for CSV safety
    content = content.replace('"', '""')

    return content

# Apply cleaning to each speech
df_raw['speech'] = df_raw['speech'].astype(str)  # Ensure column is string type
df_clean = df_raw.copy()
df_clean['speech'] = df_clean['speech'].apply(clean_text)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [68]:
# == Split cleaned data into chunks and save as separate files ==

# Convert cleaned DataFrame to list of lists
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Split cleaned data into 4 equal chunks
data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

# Change directory to the temp folder
os.chdir(data_temp)  # make sure `data_temp` exists and is defined

# Save each chunk with joblib
joblib.dump(data_id1, 'cleanspeeches_indexed1.pkl')
joblib.dump(data_id2, 'cleanspeeches_indexed2.pkl')
joblib.dump(data_id3, 'cleanspeeches_indexed3.pkl')
joblib.dump(data_id4, 'cleanspeeches_indexed4.pkl')

# Store list of cleaned data chunk paths to feed into preprocessing function later
data_files = [
    os.path.join(data_temp, 'cleanspeeches_indexed1.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed2.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed3.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed4.pkl')
]

print(f"✅ Saved clean speeches chunks in '{data_temp}'")

✅ Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Advanced Text Pre-Processing

In [70]:
# == Define function to Tokenize, eliminate digits, remove stopwords, lemmatize, POS-Tagging ==

# Define sets for extra removals
ORDINALS = set([
    "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th",
    # ... fill up to "1000th" if needed or cover common ordinals
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth"
])

CARDINALS = set([
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    # Extend as needed up to "thousand"
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand"
])

MONTHS = set([
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december"
])

WEEKDAYS = set([
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"
])

def pro1(lista):
    # Remove punctuation
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    # Tokenize and lowercase with gensim
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
    # Remove digits
        return [[row[0], [w for w in row[1] if not any(char.isdigit() for char in w)]] for row in lista]

def pro4(lista):
    # Drop short words
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]

def tags_spacy(lista):
    texts = [' '.join(row[1]) for row in lista]
    docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
    result = []
    for i, doc in enumerate(docs):
        filtered_tokens = [token.text for token in doc if token.tag_.startswith(('N', 'V', 'J'))]
        result.append([lista[i][0], filtered_tokens])
    return result

# Function to remove these from token lists
def remove_extra_stopwords(lista):
    combined_remove = ORDINALS | CARDINALS | MONTHS | WEEKDAYS
    return [
        [row[0], [w for w in row[1] if w.lower() not in combined_remove]]
        for row in lista
    ]

def pro5(lista):
    # Remove stopwords using SpaCy stopword list
    return [[row[0], [w for w in row[1] if w not in SPACY_STOPWORDS]] for row in lista]

def pro6(lista):
    texts = [' '.join(row[1]) for row in lista]
    docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
    result = []
    for i, doc in enumerate(docs):
        lemmatized = [token.lemma_ for token in doc]
        result.append([lista[i][0], lemmatized])
    return result

########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    # Drop empty speeches
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [71]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags_spacy(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = remove_extra_stopwords(data)
    data = pro5(data)
    data = pro6(data)
    data = dropnull(data)

    # out_name = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '_temp.pkl')

    # original filename replacement
    filename = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '_temp.pkl')

    # full path in data_preprocessed folder
    out_name = os.path.join(data_preprocessed, os.path.basename(filename))

    #joblib.dump(data, out_name)

  #  joblib.dump(data, out_name)
    joblib.dump(data, out_name)

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in data_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Before tagging: 1.62s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] After tagging: 40.30s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Done. Total time: 60.38s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] Before tagging: 1.53s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] After tagging: 31.77s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] Done. Total time: 49.01s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed3.pkl...
[C

In [72]:
# Store the pre-processed data
preprocessed_files = [
    'preprocessed_speeches_indexed1_temp.pkl',
    'preprocessed_speeches_indexed2_temp.pkl',
    'preprocessed_speeches_indexed3_temp.pkl',
    'preprocessed_speeches_indexed4_temp.pkl'
]

## Word-Frequencies

### Count frequencies of all tokens and display the most common words in a word cloud

In [95]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

word_counts = count_frequencies(preprocessed_files)

# Print the top 100 most common words
print("Top 100 most common words:")
for word, count in word_counts.most_common(100):
    print(f"{word}: {count}")

# Print the top 300 least common words
print("Top 300 least common words:")
for word, count in word_counts.most_common()[:-301:-1]:
    print(f"{word}: {count}")

# Remove words if they occur less than 10 times to rule out typos and other rare occurences
word_counts = Counter({word: count for word, count in word_counts.items() if count >= 10})

save_path = os.path.join(data_freq, 'word_counts.pkl')
joblib.dump(word_counts, save_path)




100%|██████████| 4/4 [00:04<00:00,  1.12s/it]

Top 100 most common words:
united: 16566
country: 15660
international: 13989
nations: 13765
world: 11762
people: 11348
peace: 9869
development: 8279
state: 8060
security: 7251
general: 6962
economic: 6570
government: 6480
year: 5927
right: 5831
assembly: 5693
organization: 5555
new: 5362
support: 5233
problem: 5143
effort: 5115
develop: 4840
nation: 4443
continue: 4430
great: 4392
human: 4368
community: 4283
political: 4136
session: 3859
time: 3855
africa: 3784
need: 3695
council: 3660
war: 3571
member: 3545
states: 3507
work: 3465
conflict: 3332
republic: 3315
principle: 3313
hope: 3302
south: 3216
power: 3200
national: 3168
situation: 3157
force: 3020
nuclear: 2995
resolution: 2980
region: 2964
action: 2944
order: 2851
solution: 2840
global: 2792
take: 2783
policy: 2775
conference: 2762
achieve: 2750
president: 2671
african: 2647
social: 2642
charter: 2591
weapon: 2585
concern: 2576
change: 2519
issue: 2507
way: 2503
operation: 2481
respect: 2450
system: 2441
interest: 2437
relation:




In [None]:
# Check again least common words after removing words that occur less than 10 times

# Print the top 300 least common words
print("Top 300 least common words:")
for word, count in word_counts.most_common()[:-301:-1]:
    print(f"{word}: {count}")

In [76]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [78]:
# == Count dictionary words

# Load dictionaries            ##### How did they come up with this dictionary? Why did they exclude words?
# CHECK REPLICATION PACKAGE
affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

a = [[i, word_counts[i]] for i in affect if i in word_counts]
c = [[i, word_counts[i]] for i in cognition if i in word_counts]

a = sorted(a, key=lambda x: x[1], reverse=True)
c = sorted(c, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a]
c = [[i[0], f"({i[1]}),"] for i in c]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

os.makedirs(data_freq, exist_ok=True)  # ensure directory exists

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)

In [79]:
# == Calculate weighted frequencies for all words
word_counts = joblib.load(os.path.join(data_freq, 'word_counts.pkl'))

l = sum(word_counts.values())

a = 0.001
word_counts_weighted = {k: a / (a + (v / l)) for k, v in word_counts.items()}
#for key in word_counts.keys():
 #   word_counts[key] = a / (a + (word_counts[key] / l))

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

################################################################################ ISSUE##################
# To print top 100 by weighted values, sort the dictionary by value descending:
top_100_weighted = sorted(word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:100]

print("Top 100 words by weighted frequency:")
for word, weight in top_100_weighted:
    print(f"{word}: {weight}")


Top 100 words by weighted frequency:
sheikha: 0.992205348533943
fields: 0.992205348533943
affirmative: 0.992205348533943
taste: 0.992205348533943
default: 0.992205348533943
abominable: 0.992205348533943
reprehensible: 0.992205348533943
shocking: 0.992205348533943
sequence: 0.992205348533943
doctor: 0.992205348533943
economist: 0.992205348533943
electronic: 0.992205348533943
pluralist: 0.992205348533943
inclined: 0.992205348533943
globalized: 0.992205348533943
laboratory: 0.992205348533943
amenable: 0.992205348533943
specious: 0.992205348533943
spurious: 0.992205348533943
tentative: 0.992205348533943
refinement: 0.992205348533943
fragment: 0.992205348533943
overlap: 0.992205348533943
restart: 0.992205348533943
miguel: 0.992205348533943
evacuate: 0.992205348533943
forcible: 0.992205348533943
bell: 0.992205348533943
indians: 0.992205348533943
uppermost: 0.992205348533943
unbalanced: 0.992205348533943
dignify: 0.992205348533943
interoceanic: 0.992205348533943
vindicate: 0.992205348533943
p

In [93]:
#== Final cleaning of preprocessed speeches ==

os.chdir(data_freq)
count = joblib.load('word_counts.pkl')  # load word frequency dictionary

os.chdir(data_temp)

# Filter words by minimum frequency
def select(lista):
    for i in range(len(lista)):
        x = lista[i][0]
        y = lista[i][1]
        y = [w for w in y if count[w] >= 10]
        lista[i] = [x, y]
    return lista

# Clean and save each file
def final_cleaning(dataname):
    data = joblib.load(dataname)
    data = select(data)
    lab = dataname.replace('_temp', '')  # remove '_temp' in filename
    out_path = os.path.join(data_preprocessed, lab)
    joblib.dump(data, out_path)

# List of temp files to clean
preprocessed_files = [
    'preprocessed_speeches_indexed1_temp.pkl',
    'preprocessed_speeches_indexed2_temp.pkl',
    'preprocessed_speeches_indexed3_temp.pkl',
    'preprocessed_speeches_indexed4_temp.pkl'
]

# Apply cleaning
for file in preprocessed_files:
    final_cleaning(file)