In [1]:
import os
import gensim
from pathlib import Path
from gensim.models import Word2Vec
import numpy as np
# from gensim.summarization.textcleaner import get_sentences

from nltk.tokenize import sent_tokenize
from random import shuffle
import nltk
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize
import joblib
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from scipy.spatial.distance import cosine
import glob
import spacy
from multiprocessing import Pool, freeze_support
import pandas as pd


# === Initialize NLP Tools ===

translator = str.maketrans('', '', punctuation)  # translation table that deletes punctuation characters
tagger = nltk.perceptron.PerceptronTagger()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stemmer = SnowballStemmer("english")


# === Set Working Directory ===

# Project root. Set the MASTERARBEIT_WD environment variable to run the
# notebook on another machine; without it the original location is used.
wd = Path(os.environ.get("MASTERARBEIT_WD", r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"))
wd_models = wd / "models"
wd_results = wd / "results"


# === Define Folder Paths ===

# Make sure that you have these folders in your working directory
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_sent = data_c / "sentences"
data_preprocessed = data_c / "preprocessed"
fig_dir = wd / "fig"

# Load resources (pickles produced by earlier steps of the project)
stopwords = joblib.load(data_c / "stopwords.pkl")
word_counts = joblib.load(data_freq / "word_counts.pkl")
word_counts_weighted = joblib.load(data_freq / "word_counts_weighted.pkl")
affect_dic = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition_dic = joblib.load(data_dict / 'dictionary_cognition.pkl')

In [2]:
# Switch into the temp folder and list the four cleaned-speech pickles
# by absolute path (as plain strings).
os.chdir(data_temp)
cleaned_files = [
    str(data_temp / f'clean_speeches_indexed{part}.pkl')
    for part in (1, 2, 3, 4)
]

In [3]:
# NOTE: the working directory is switched to data_sent here, but
# extract_sentences() writes each output next to its input file.
os.chdir(data_sent)


def extract_sentences(dataname, seed=None):
    """Convert one pickle of cleaned speeches into a pickle of token lists.

    Steps applied to every sentence, in order: NLTK sentence splitting,
    gensim ``simple_preprocess``, removal of digit-only tokens and tokens of
    at most two characters, a part-of-speech filter keeping tags that start
    with N, V or J, stemming with the module-level ``stemmer``, removal of
    stopwords, and removal of stems counted fewer than 10 times in
    ``word_counts``. Sentences left with fewer than two tokens are dropped
    and the remaining sentences are shuffled.

    The result is dumped beside the input file; the output file name is the
    input file name with 'clean_speeches_' replaced by 'sentences_'.

    Args:
        dataname: path (str or Path) of a pickle holding records whose item 1
            is the speech text.
        seed: optional seed for a reproducible shuffle. ``None`` (default)
            keeps the original behaviour of shuffling with the global RNG.

    Returns:
        None. Progress and the number of unique tokens are printed.
    """
    source = Path(dataname)
    records = joblib.load(dataname)
    texts = [record[1] for record in records]  # keep only text, no id

    raw_sentences = []
    for doc in texts:
        raw_sentences.extend(sent_tokenize(doc))  # use nltk's sent_tokenize here

    # (A spaCy-based POS filter via `nlp.pipe` was tried here and left disabled.)
    sentences = []
    for raw in raw_sentences:
        if len(raw.split()) <= 1:
            continue  # single-word "sentences" carry no context

        tokens = [
            tok for tok in gensim.utils.simple_preprocess(raw)
            if not tok.isdigit() and len(tok) > 2
        ]
        content_words = [
            word for word, tag in tagger.tag(tokens)
            if tag.startswith(('N', 'V', 'J'))
        ]
        stems = [stemmer.stem(word) for word in content_words]
        # .get() so a stem missing from a plain-dict word_counts is treated
        # as count 0 instead of raising KeyError.
        kept = [
            stem for stem in stems
            if stem not in stopwords and word_counts.get(stem, 0) >= 10
        ]

        if len(kept) > 1:  # drop sentences with fewer than two remaining tokens
            sentences.append(kept)

    if seed is None:
        shuffle(sentences)
    else:
        from random import Random
        Random(seed).shuffle(sentences)

    # Rewrite only the file name so a matching folder name is never touched.
    out_name = source.name.replace('clean_speeches_', 'sentences_').replace('_.pkl', '.pkl')
    lab = str(source.with_name(out_name))
    print(f'{dataname} processed')
    joblib.dump(sentences, lab)

    unique_tokens = set(token for s in sentences for token in s)
    print(f"Unique tokens: {len(unique_tokens)}")

    print(f'{lab} saved')

# Run for all your files
for fname in cleaned_files:
    extract_sentences(fname)

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl processed
Unique tokens: 8400
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed1.pkl saved
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl processed
Unique tokens: 8405
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed2.pkl saved
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed3.pkl processed
Unique tokens: 8401
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed3.pkl saved
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed4.pkl processed
Unique tokens: 8391
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed4.pkl saved


In [23]:
# Absolute paths of the four tokenized-sentence pickles in the temp folder.
sentences_files = [
    os.path.join(data_temp, f'sentences_indexed{part}.pkl')
    for part in range(1, 5)
]


In [29]:
# == Count the distinct tokens over all sentence files ==
all_unique_tokens = set()

for dataname in sentences_files:
    data = joblib.load(dataname)  # list of tokenized sentences
    # feed every token of every sentence into the running set
    all_unique_tokens.update(token for sentence in data for token in sentence)

print(f"Total unique tokens across all files: {len(all_unique_tokens)}")


Total unique tokens across all files: 8508


In [4]:
# The sentence pickles are written into data_temp (see the "saved" lines
# logged by the extraction step), so reference them by absolute path instead
# of bare file names that only resolve from one particular working directory.
sentences_files = [
    os.path.join(data_temp, f'sentences_indexed{part}.pkl')
    for part in range(1, 5)
]

In [31]:
# Pool the tokenized sentences of every file into one flat training corpus.
dataset = [
    sentence
    for dataname in sentences_files  # <-- your list of sentence files
    for sentence in joblib.load(dataname)
]

# === Model training ===
w2v_params = dict(
    vector_size=300,  # word vector dimensionality
    window=8,         # context window size
    min_count=10,     # minimum word count
    workers=8,        # number of threads
    sample=1e-3,      # downsample setting for frequent words
    epochs=10,        # number of iterations over the corpus
)
w2v = Word2Vec(sentences=dataset, **w2v_params)

# Make sure the per-vector norms are computed.
# NOTE(review): `fill_norms` looks like the gensim 4 API (older releases used
# `init_sims`) — confirm against the installed gensim version.
w2v.wv.fill_norms()

# Save model (the folder is created if it doesn't exist)
wd_models.mkdir(parents=True, exist_ok=True)
model_file = wd_models / 'w2v-vectors_8_300.pkl'
w2v.save(str(model_file))

In [32]:
# Reload the trained model from disk and keep a handle on its word vectors.
model_path = wd_models / "w2v-vectors_8_300.pkl"
w2v = Word2Vec.load(str(model_path))
word_vectors = w2v.wv  # embeddings

In [33]:
# === Calculate centroids ===

def findcentroid(text, model, weights=None):
    """Return the mean of the weighted embeddings of the words in ``text``.

    Each in-vocabulary word contributes ``vector * weight``; words that are
    not in the vocabulary are skipped.

    Args:
        text: iterable of words (for example a dictionary word list).
        model: object exposing the embeddings as ``model.wv``; a bare
            word -> vector mapping is accepted as well.
        weights: mapping word -> scalar weight. Defaults to the notebook-level
            ``word_counts_weighted``.

    Returns:
        numpy array: element-wise mean of the weighted vectors.

    Raises:
        ValueError: if none of the words in ``text`` is in the vocabulary
            (previously this silently produced a NaN scalar).
    """
    vectors = getattr(model, "wv", model)
    if weights is None:
        weights = word_counts_weighted

    vecs = [vectors[w] * weights[w] for w in text if w in vectors]
    if not vecs:
        raise ValueError("findcentroid: no word of `text` is in the model vocabulary")

    return np.mean(vecs, axis=0)


# One centroid per dictionary, computed from the trained embeddings.
affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)

os.chdir(data_c)
# Create the output folder on first run and write through absolute paths so
# the dumps do not depend on the current working directory.
centroid_dir = data_c / 'centroids'
centroid_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(affect_centroid, str(centroid_dir / 'affect_centroid.pkl'))
joblib.dump(cog_centroid, str(centroid_dir / 'cog_centroid.pkl'))


['centroids/cog_centroid.pkl']

In [34]:
# Show both centroid vectors
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shapes
affect_shape = affect_centroid.shape
cog_shape = cog_centroid.shape
print("\nShape of affect centroid:", affect_shape)
print("Shape of cognition centroid:", cog_shape)

# Min / max / mean of each centroid
affect_stats = (np.min(affect_centroid), np.max(affect_centroid), np.mean(affect_centroid))
cog_stats = (np.min(cog_centroid), np.max(cog_centroid), np.mean(cog_centroid))
print("\nAffect centroid summary: min, max, mean:", *affect_stats)
print("Cognition centroid summary: min, max, mean:", *cog_stats)

Affect centroid vector:
 [-0.00102843 -0.04505273  0.2686238   0.04995833  0.12448414 -0.22024421
  0.02002718  0.13428923  0.05709088 -0.201109   -0.09748005  0.0465512
 -0.0594778  -0.09207746 -0.13474967 -0.13982649  0.01046207  0.31272015
  0.19611344 -0.04461817  0.07295036  0.0667109   0.02109417 -0.05929884
  0.25985494 -0.07108637 -0.4274462   0.04899051  0.14073169 -0.22614689
  0.07533123 -0.0112998  -0.04141686  0.04214694 -0.01623218 -0.27087027
  0.08074587 -0.05814374  0.18459216 -0.13962166 -0.02706708  0.10797511
 -0.00773254 -0.23971277  0.07589351  0.28366277  0.06687983  0.01656772
  0.16070996  0.19863772  0.08499901 -0.24409768  0.24698676 -0.03492389
 -0.12952498  0.12853302  0.19065093 -0.17242715 -0.07315606  0.08943436
 -0.0538155  -0.06167968  0.13589464  0.10902671 -0.13337298  0.14933337
  0.01390664 -0.08452251 -0.08017185  0.11252235 -0.13780881  0.10866585
  0.17758091 -0.12374552 -0.06560096  0.01818446 -0.25835025  0.07482362
 -0.18389069  0.20401467  0

## Emotionality Score

In [36]:
# TODO (open question): why doesn't this work with the sentences?!

# Set wd to data_preprocessed
os.chdir(data_preprocessed)

# === Load preprocessed speech data ===

# NOTE: despite the name, this list holds the loaded objects, not file paths.
preprocessed_final_files = [
    joblib.load(os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{part}_final.pkl'))
    for part in range(1, 5)
]


In [37]:
###################################
# Define Functions              ###
###################################

# NOTE: the intermediate temp_distances_main_*.pkl files are not deleted after they have been recombined.

def documentvecweight(lista, model=None, weights=None, affect=None, cognition=None):
    """Score documents by their cosine distance to the affect and cognition centroids.

    For every record the weighted embeddings (``vector * weight``) of its
    in-vocabulary tokens are averaged, and the cosine distances ``a`` and ``c``
    of that mean to the affect and cognition centroids are computed. The
    score is ``(2 - a) / (2 - c)``.

    Args:
        lista: sequence of records whose item 0 is the document id and item 1
            the list of tokens. Records with an empty token list are skipped.
        model: object exposing embeddings as ``.wv`` (or a bare word -> vector
            mapping). Defaults to the notebook-level ``w2v``.
        weights: mapping word -> scalar weight. Defaults to the notebook-level
            ``word_counts_weighted``.
        affect: affect centroid vector. Defaults to ``affect_centroid``.
        cognition: cognition centroid vector. Defaults to ``cog_centroid``.

    Returns:
        list of ``[id, affect_distance, cognition_distance, score]``. All
        three numbers are NaN when none of the tokens is in the vocabulary;
        the score alone is NaN when its denominator is zero.
    """
    source = w2v if model is None else model
    vectors = getattr(source, "wv", source)
    if weights is None:
        weights = word_counts_weighted
    if affect is None:
        affect = affect_centroid
    if cognition is None:
        cognition = cog_centroid

    out = []
    for s in lista:
        if len(s[1]) == 0:
            continue  # documents without tokens are left out of the result

        vecs = [vectors[w] * weights[w] for w in s[1] if w in vectors]
        if len(vecs) == 0:
            a = c = score = np.nan
        else:
            v = np.mean(vecs, axis=0)
            a = cosine(v, affect)
            c = cosine(v, cognition)
            denominator = 1 + 1 - c
            # A cognition distance of exactly 2 would divide by zero.
            score = (1 + 1 - a) / denominator if denominator != 0 else np.nan
        out.append([s[0], a, c, score])
    return out


def main_function(file_path, idx):
    """Score one preprocessed-speech pickle and store the rows as a temp pickle.

    Loads ``file_path`` with joblib, passes the content to
    ``documentvecweight`` and dumps the returned rows to
    ``temp_distances_main_{idx}.pkl`` inside ``data_c``.
    """
    scored_rows = documentvecweight(joblib.load(file_path))
    target = os.path.join(data_c, f'temp_distances_main_{idx}.pkl')
    joblib.dump(scored_rows, target)


###################################
#      Run main directly        ###
###################################

def main():
    """Run ``main_function`` on the four preprocessed speech files, numbered 1 to 4."""
    for part in range(1, 5):
        source = os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{part}_final.pkl')
        main_function(source, part)

if __name__ == "__main__":
    main()


###################################
#      Recompose everything     ###
###################################

DATA_temp = [os.path.join(data_c, f'temp_distances_main_{part}.pkl') for part in range(1, 5)]

# Concatenate the per-file score rows in file order.
tot = []
for dataname in DATA_temp:
    tot.extend(joblib.load(dataname))

# One row per document: id, both distances and the score.
score_columns = ['filename', 'affect_d', 'cognition_d', 'score']
tot_df = pd.DataFrame(tot, columns=score_columns)
joblib.dump(tot_df, os.path.join(data_c, 'distances_10epochs.pkl'))


['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\distances_10epochs.pkl']

In [38]:
# First few rows of the score table
preview = tot_df.head()
print(preview)

# Number of rows (processed documents) and columns
print("Shape:", tot_df.shape)

# Summary statistics of the numeric columns
summary = tot_df.describe()
print(summary)

# The table itself
print(tot_df)

          filename  affect_d  cognition_d     score
0  URY_10_1955.txt  1.403596     0.978834  0.584043
1  NOR_71_2016.txt  1.383179     1.168100  0.741460
2  BHS_57_2002.txt  1.461827     1.302409  0.771474
3  CHN_54_1999.txt  1.375474     0.966137  0.604070
4  GMB_41_1986.txt  1.427820     1.020183  0.583966
Shape: (4999, 4)
          affect_d  cognition_d        score
count  4999.000000  4999.000000  4999.000000
mean      1.340100     1.065684     0.702335
std       0.248605     0.191818     0.200435
min       0.344712     0.364131     0.302257
25%       1.202359     0.939051     0.549704
50%       1.391068     1.093530     0.674553
75%       1.525907     1.213055     0.831995
max       1.768527     1.517948     1.543633
             filename  affect_d  cognition_d     score
0     URY_10_1955.txt  1.403596     0.978834  0.584043
1     NOR_71_2016.txt  1.383179     1.168100  0.741460
2     BHS_57_2002.txt  1.461827     1.302409  0.771474
3     CHN_54_1999.txt  1.375474     0.966137  

In [52]:
# Read the main corpus (semicolon-separated, UTF-8)
corpus_csv = os.path.join(data_c, "un_corpus_merged.csv")
un_corpus_merged = pd.read_csv(corpus_csv, sep=';', encoding='utf-8')

# Left-join the scores onto the corpus by filename
un_corpus_scored = pd.merge(un_corpus_merged, tot_df, on="filename", how="left")

# Store the scored corpus as a pickle ...
joblib.dump(un_corpus_scored, os.path.join(data_c, "un_corpus_scored.pkl"))

# ... and as a CSV in the same format as the input
csv_options = dict(sep=';', index=False, encoding='utf-8')
un_corpus_scored.to_csv(os.path.join(data_c, "un_corpus_scored.csv"), **csv_options)

In [48]:
# Print the merged corpus, which now carries the score columns from tot_df.
print(un_corpus_scored)

             filename                                             speech  \
0     URY_10_1955.txt  97.\tFrom this august rostrum, I wish to reaff...   
1     NOR_71_2016.txt  This year’s session of the General Assembly co...   
2     BHS_57_2002.txt  ﻿On behalf of Prime\nMinister Perry G. Christi...   
3     CHN_54_1999.txt  Please allow me to warmly congratulate you, Si...   
4     GMB_41_1986.txt  It is gratifying to see at the helm of this im...   
...               ...                                                ...   
4994  GAB_24_1969.txt  77. Madam President, I should like, in my turn...   
4995  IDN_78_2023.txt  Today I am wearing a traditional fabric from E...   
4996  SYC_71_2016.txt  Allow me to add Seychelles’ voice to those who...   
4997  TUR_58_2003.txt  ﻿I should like to start by\nextending our warm...   
4998  JAM_20_1965.txt  85. Mr. President, on behalf of the Jamaican d...   

     country_code  year country_name  speech_length_words  \
0             URY  1955   

In [44]:
# Rows of the scored corpus without / with an affect distance
affect_missing = un_corpus_scored['affect_d'].isna()

nan_count = affect_missing.sum()
not_nan_count = (~affect_missing).sum()

print("Count where affect_d is NaN:", nan_count)
print("Count where affect_d is not NaN:", not_nan_count)

Count where affect_d is NaN: 0
Count where affect_d is not NaN: 4999
