In [1]:
# Emotion and Reason in Political Language: Examining the UN General speeches
### Sarah Franzen

# Description: 
# - Extract documents from their original txt and store them as one csv
# - 


## Setup, Installation and Verification of required Packages and Libraries

In [5]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys
    !{sys.executable} -m pip install pandas
    !{sys.executable} -m pip install nltk
    !{sys.executable} -m pip install spacy
    !{sys.executable} -m pip install numpy
    !{sys.executable} -m pip install gensim

#########################
# Check if all packages are included
##########################

In [7]:
# Import specific functions and classes from libraries

import os
import re
import random # for shorter samples
from string import punctuation

import pandas as pd
import joblib
import gensim
import nltk
import spacy
from nltk.corpus import stopwords
# from nltk.stem.snowball import SnowballStemmer
from multiprocessing import Pool, freeze_support
from collections import Counter
from itertools import chain
from tqdm import tqdm

# Initialize punctuation translator for removal, POS tagger and Snowball Stemmer
translator = str.maketrans('', '', punctuation)
tagger = nltk.perceptron.PerceptronTagger()
# stemmer = SnowballStemmer("english")

# Set working directory (please adjust)

wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
os.chdir(wd)

# Define other folder paths

data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')

data_freq = os.path.join(data_c, 'freq')

In [9]:
# Set DownloadAdditions to True if you need to download these additional resources.

DownloadAdditions = False
if DownloadAdditions:
    nltk.download('stopwords')                   # Download NLTK stopwords corpus
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)
    nltk.download('punkt')                       # Download the NLTK tokenizer models

# Ensure the NLTK stopword, en_core_web_lg and NLTK tokenizer models are available

try:
    stopwords.words('english')
    print("NLTK stopwords are available.")
except LookupError:
    print("NLTK stopwords are not available.")

# Ensure the spaCy model is available
try:
    nlp = spacy.load('en_core_web_lg')
    print("spaCy model is available.")
except OSError:
    print("spaCy model is not available.")

# Ensure the NLTK 'punkt' tokenizer is available
try:
    nltk.data.find('tokenizers/punkt')
    print("NLTK 'punkt' tokenizer is available.")
except LookupError:
    print("NLTK 'punkt' tokenizer is not available.")

NLTK stopwords are available.
spaCy model is available.
NLTK 'punkt' tokenizer is available.


In [10]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


## Load and Prepare Corpus for Preprocessing

In [12]:
# Load data from txt-files into one csv-file 

# Define the folder path for the original data      
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

# STEP 1: First gather all matching file paths
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

# STEP 2: Sample 300 randomly
sampled_files = random.sample(all_txt_files, 300)

# STEP 3: Now use your preferred file-reading structure
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})
        
'''
# For entire data
# Create a list to hold the data
raw_data = []

# Walk through all subfolders and process each .txt file
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            filepath = os.path.join(root, file)
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

                raw_data.append({'filename': file, 'speech': content})

'''

# Create DataFrame from the collected data
df_raw = pd.DataFrame(raw_data)

# Save df_raw so it doesn't have to be created every time again
raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

# Save DataFrame to CSV with semicolon separator so Excel opens it correctly
raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\n✅ Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")



✅ Saved raw data with 300 speeches to '.\data\un_corpus_raw.csv'


In [13]:
# Load df_raw
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# View df to understand structure
df_raw.head()         # Shows the first 5 rows

Unnamed: 0,filename,speech
0,LSO_28_1973.txt,﻿210.\t It is a source of great pleasure for m...
1,COD_42_1987.txt,"﻿Mr, President, at this forty-second session, ..."
2,ROU_42_1987.txt,"﻿Your election, Sir, to the high post of Presi..."
3,ISR_29_1974.txt,"At the outset of my remarks, I wish to associa..."
4,FRA_01_1946.txt,Before I speak of anything else I would like f...


## Pre-processing

### Cleaning

In [18]:
# Clean speeches

def clean_text(content):
    if pd.isna(content):
        return ""
    
    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces
    content = ' '.join(content.split())

    # Fix punctuation spacing (e.g. "word,another" → "word, another")
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" → "international")
    content = re.sub(r'-\s', '', content)

    # Remove stray backslashes
    content = content.replace("\\", "")

    # Escape double quotes for CSV safety
    content = content.replace('"', '""')

    return content


# Apply cleaning to each speech
df_raw['speech'] = df_raw['speech'].astype(str)  # Ensure column is string type
df_clean = df_raw.copy()
df_clean['speech'] = df_clean['speech'].apply(clean_text)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [19]:
# Convert cleaned DataFrame to list of lists
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Split cleaned data into 4 equal chunks
data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

# Change directory to the temp folder
os.chdir(data_temp)  # make sure `data_temp` exists and is defined

# Save each chunk with joblib
joblib.dump(data_id1, 'cleanspeeches_indexed1_n.pkl')
joblib.dump(data_id2, 'cleanspeeches_indexed2_n.pkl')
joblib.dump(data_id3, 'cleanspeeches_indexed3_n.pkl')
joblib.dump(data_id4, 'cleanspeeches_indexed4_n.pkl')

print(f"✅ Saved clean speeches chunks in '{data_temp}'")

✅ Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


In [20]:
# List the chunk filenames
data_files = [
    'cleanspeeches_indexed1_n.pkl',
    'cleanspeeches_indexed2_n.pkl',
    'cleanspeeches_indexed3_n.pkl',
    'cleanspeeches_indexed4_n.pkl'
]

In [21]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


### Tokenize, Eliminate digits and stopwords, POS-Tagging

In [None]:
###################################
#   Functions                   ###
###################################

os.chdir(data_c)

def pro1(lista):
    a = [[row[0], row[1].translate(translator)] for row in lista]
    return a

# Tokenize etc
def pro2(lista):
    a = [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]
    return a

# Eliminate digits
def pro3(lista):
    a = [[row[0], [w for w in row[1] if not w.isdigit()]] for row in lista]
    return a

# Drop words that are too short
def pro4(lista):
    a = [[row[0], [w for w in row[1] if len(w)>2]] for row in lista]
    return a

# Tag parts of speech and keep only nouns, verbs and adjectives
def tags(lista):
    t = [[row[0], tagger.tag(row[1])] for row in lista]
    t = [[row[0], [i[0] for i in row[1] if i[1].startswith(('N', 'V', 'J'))]] for row in t]
    return t

# Remove stopwords
stop_words = set(stopwords.words('english'))

def pro5(lista):
    a = [[row[0], [w for w in row[1] if w not in stop_words]] for row in lista]
    return a

# Stem
#def pro5(lista):
  #  a = [[row[0], [stemmer.stem(word) for word in row[1]]] for row in lista]
   # return a

# Drop empty speeches
def dropnull(lista):
    a = [row for row in lista if len(' '.join(row[1]))>0]
    return a


###################################
#   Main                       ###
###################################

def preprocessing(data_name):
    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)
    data = tags(data)
    data = pro5(data)
    data = dropnull(data)
    lab = data_name.replace('.pkl', '') + '_temp.pkl'
    joblib.dump(data, lab)


###################################
#      Multiprocessing          ###
###################################

os.chdir(data_c)

def main():
    with Pool(4) as pool:
        pool.starmap(preprocessing, data_files)

if __name__ == "__main__":
    freeze_support()
    main()


In [None]:
def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data)
        total_freq.update(tokens)
    return total_freq

freqs = count_frequencies(data_files)

joblib.dump(freqs, r"path_to_save\word_counts.pkl")