In [None]:
# Load packages

import os
import re
from string import punctuation

import pandas as pd
import joblib
import gensim
import nltk
from nltk.stem.snowball import SnowballStemmer
from multiprocessing import Pool, freeze_support

# Shared text-processing helpers used by the pipeline functions further down.
# Snowball stemmer configured for English
stemmer = SnowballStemmer("english")
# NLTK averaged-perceptron part-of-speech tagger
tagger = nltk.perceptron.PerceptronTagger()
# Translation table that deletes every character listed in string.punctuation
translator = str.maketrans('', '', punctuation)

# Project root (please adjust to your machine)
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"

# Folder names, written relative to the project root:
# data_c holds the input files, data_temp receives intermediate chunk files.
data_c = r".\data"
data_temp = r".\data\temp"

# Make the project root the working directory so the relative paths resolve
os.chdir(wd)

# Load df_raw (the code below reads its 'filename' and 'speech' columns)
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

def clean_text(content):
    """Normalise the raw text of one speech.

    Steps, in order: line breaks and carriage returns become spaces,
    whitespace runs are collapsed to a single space, a space is inserted
    after every '.' or ',' that is directly followed by a non-space
    character, every hyphen followed by whitespace is removed together
    with that whitespace (re-joining words hyphenated at a line break),
    backslashes are dropped and double quotes are doubled (CSV-style
    escaping).

    Parameters
    ----------
    content : str or missing value
        Raw speech text. Anything ``pd.isna`` reports as missing
        (``None``, ``NaN``) is treated as an empty speech.

    Returns
    -------
    str
        The cleaned text, or ``""`` for a missing value.
    """
    # Guard clause: missing values become an empty string
    if pd.isna(content):
        return ""

    # Flatten line breaks, then squeeze all whitespace runs to single spaces
    flattened = content.replace('\n', ' ').replace('\r', ' ')
    text = ' '.join(flattened.split())

    # Regex passes, applied in this order:
    #   1. punctuation spacing ("word,another" -> "word, another")
    #   2. hyphenation left over from line breaks ("inter- national" -> "international")
    regex_passes = (
        (r'(?<=[.,])(?=[^\s])', r' '),
        (r'-\s', ''),
    )
    for pattern, replacement in regex_passes:
        text = re.sub(pattern, replacement, text)

    # Literal passes: drop stray backslashes, then double every double quote
    literal_passes = (
        ("\\", ""),
        ('"', '""'),
    )
    for old, new in literal_passes:
        text = text.replace(old, new)

    return text


# Clean every speech, split the result into four chunks and store the chunks
# in the temp folder.
#
# The __main__ guard keeps this one-off preparation out of the worker
# processes: with the "spawn" start method (the default on Windows) every
# worker re-imports this file, and without the guard each worker would repeat
# the cleaning and rewrite the chunk files while other workers read them.
if __name__ == "__main__":
    # Missing speeches must become "" *before* the cast to str; otherwise a
    # NaN can be stringified to the literal text "nan", which would then
    # survive the empty-speech filter below.
    df_raw['speech'] = df_raw['speech'].fillna('').astype(str)
    df_clean = df_raw.copy()
    df_clean['speech'] = df_clean['speech'].apply(clean_text)

    # Drop rows with empty speeches after cleaning
    df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)

    # Convert cleaned DataFrame to a list of [filename, speech] rows
    clean_data = df_clean[['filename', 'speech']].values.tolist()

    # Split cleaned data into 4 contiguous, (nearly) equal chunks.
    # bounds holds the 5 cut positions 0, n/4, 2n/4, 3n/4, n (floored).
    n_chunks = 4
    bounds = [i * len(clean_data) // n_chunks for i in range(n_chunks + 1)]
    data_id1, data_id2, data_id3, data_id4 = (
        clean_data[lo:hi] for lo, hi in zip(bounds, bounds[1:])
    )

    # Write the chunks with explicit paths instead of changing the working
    # directory: a chdir here would make every later relative path (such as
    # the chdir into the data folder further down) resolve inside the temp
    # folder.
    temp_dir = os.path.normpath(os.path.join(wd, data_temp))
    os.makedirs(temp_dir, exist_ok=True)
    for idx, chunk in enumerate((data_id1, data_id2, data_id3, data_id4), start=1):
        joblib.dump(chunk, os.path.join(temp_dir, f'cleanspeeches_indexed{idx}_n.pkl'))

    print(f"✅ Saved clean speeches chunks in '{data_temp}'")

###################################
#   Functions                   ###
###################################

def pro1(lista):
    """Strip punctuation from the text of every row.

    Parameters
    ----------
    lista : list
        Rows whose first item is an identifier and whose second item is
        a string.

    Returns
    -------
    list
        New ``[identifier, text]`` rows in which every character covered
        by the module-level ``translator`` table has been deleted.
    """
    return [[entry[0], entry[1].translate(translator)] for entry in lista]

# Tokenize etc
def pro2(lista):
    """Tokenise the text of every row with gensim's ``simple_preprocess``.

    Parameters
    ----------
    lista : list
        Rows of ``[identifier, text]``.

    Returns
    -------
    list
        New ``[identifier, tokens]`` rows, where ``tokens`` is whatever
        ``gensim.utils.simple_preprocess`` returns for the text.
    """
    return [
        [entry[0], gensim.utils.simple_preprocess(entry[1])]
        for entry in lista
    ]

# Eliminate digits
def pro3(lista):
    """Remove purely numeric tokens from every row.

    Parameters
    ----------
    lista : list
        Rows of ``[identifier, tokens]``; each token is a string.

    Returns
    -------
    list
        New rows that keep only the tokens for which ``str.isdigit()``
        is false.
    """
    return [
        [entry[0], [token for token in entry[1] if not token.isdigit()]]
        for entry in lista
    ]

# Drop words that are too short
def pro4(lista):
    """Remove tokens of one or two characters from every row.

    Parameters
    ----------
    lista : list
        Rows of ``[identifier, tokens]``.

    Returns
    -------
    list
        New rows that keep only the tokens longer than two characters.
    """
    return [
        [entry[0], [token for token in entry[1] if len(token) > 2]]
        for entry in lista
    ]

# Tag parts of speech and keep only some
def tags(lista):
    """Part-of-speech tag every row and keep only selected word classes.

    Each token list is tagged with the module-level ``tagger``; a token
    is kept when its tag starts with 'N', 'V' or 'J' (noun, verb and
    adjective tags in the Penn Treebank tagset — assuming the tagger's
    default English model).

    Parameters
    ----------
    lista : list
        Rows of ``[identifier, tokens]``.

    Returns
    -------
    list
        New ``[identifier, tokens]`` rows containing only the kept
        tokens, in their original order.
    """
    kept_prefixes = ('N', 'V', 'J')
    return [
        [
            entry[0],
            [word for word, pos in tagger.tag(entry[1]) if pos.startswith(kept_prefixes)],
        ]
        for entry in lista
    ]

# Stem
def pro5(lista):
    """Stem every token of every row with the module-level ``stemmer``.

    Parameters
    ----------
    lista : list
        Rows of ``[identifier, tokens]``.

    Returns
    -------
    list
        New ``[identifier, stems]`` rows with one stem per input token.
    """
    return [[entry[0], list(map(stemmer.stem, entry[1]))] for entry in lista]

# Eliminate Stopwords
# Load the general and the procedural stop-word lists from the data folder
# and merge them into one set for fast membership tests.
# The folder is resolved against the project root `wd`: `data_c` is a relative
# path, so a bare chdir would depend on whatever the working directory happens
# to be at this point (for instance the temp folder).
os.chdir(os.path.normpath(os.path.join(wd, data_c)))
stopwords = joblib.load('stopwords.pkl')
proc = joblib.load('procedural_words.pkl')
stopwords = set(stopwords).union(proc)
del proc
def pro6(lista):
    """Remove stop words from the token list of every row.

    Like the other ``pro*`` steps this builds and returns a new list; the
    list passed in is left untouched.

    Parameters
    ----------
    lista : list
        Rows of ``[identifier, tokens]``.

    Returns
    -------
    list
        New ``[identifier, tokens]`` rows without any token contained in
        the module-level ``stopwords`` set.
    """
    return [
        [row[0], [w for w in row[1] if w not in stopwords]]
        for row in lista
    ]

# Drop empty speeches
def dropnull(lista):
    """Drop rows whose tokens join to an empty string.

    A row is kept when ``' '.join(tokens)`` is non-empty, i.e. when it
    has at least one non-empty token or at least two tokens.

    Parameters
    ----------
    lista : list
        Rows of ``[identifier, tokens]``.

    Returns
    -------
    list
        A new list containing the kept rows (the same row objects).
    """
    return [entry for entry in lista if ' '.join(entry[1])]


###################################
#   Main                       ###
###################################


def preprocessing(data_name):
    """Run the whole preprocessing pipeline on one pickled chunk.

    The rows stored in ``data_name`` are loaded with joblib and passed
    through the steps in this order: punctuation removal (``pro1``),
    tokenisation (``pro2``), digit removal (``pro3``), short-word removal
    (``pro4``), part-of-speech filtering (``tags``), stemming (``pro5``),
    stop-word removal (``pro6``) and dropping of empty speeches
    (``dropnull``). The result is dumped next to the input under the same
    name with ``_temp.pkl`` in place of the ``.pkl`` extension.

    Parameters
    ----------
    data_name : str
        Path of a joblib pickle holding ``[identifier, text]`` rows.

    Returns
    -------
    None
    """
    data = joblib.load(data_name)

    # NOTE(review): stop words are removed *after* stemming, so the stop-word
    # lists presumably contain stemmed forms — confirm against stopwords.pkl.
    for step in (pro1, pro2, pro3, pro4, tags, pro5, pro6, dropnull):
        data = step(data)

    # Strip only a trailing ".pkl"; str.replace would also delete any ".pkl"
    # that occurs elsewhere in the path (e.g. in a folder name).
    suffix = '.pkl'
    base = data_name[:-len(suffix)] if data_name.endswith(suffix) else data_name
    lab = base + '_temp.pkl'
    joblib.dump(data, lab)


###################################
#      Multiprocessing          ###
###################################


# Argument lists for pool.starmap: one single-element list per chunk file.
# The names match the four chunk files written to the temp folder above; the
# paths are anchored on the project root `wd` so they do not depend on the
# current working directory.
data_files = [
    [os.path.normpath(os.path.join(wd, data_temp, f'cleanspeeches_indexed{i}_n.pkl'))]
    for i in range(1, 5)
]


def main():
    """Run ``preprocessing`` over all entries of ``data_files`` in parallel.

    A pool of four worker processes is used; the call blocks until every
    entry has been processed.
    """
    workers = Pool(4)
    with workers:
        # starmap unpacks each entry of data_files into the positional
        # arguments of preprocessing
        workers.starmap(preprocessing, data_files)

# Script entry point: start the worker pool only when this file is executed
# directly, not when it is imported (e.g. by spawned worker processes).
if __name__ == "__main__":
    # Needed for multiprocessing inside a frozen Windows executable;
    # has no effect when run by a regular Python interpreter.
    freeze_support()
    main()