In [23]:
import glob
import os
from multiprocessing import Pool, freeze_support
from pathlib import Path
from random import shuffle
from string import punctuation

import gensim
import joblib
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
# from gensim.summarization.textcleaner import get_sentences
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import sent_tokenize
from scipy.spatial.distance import cosine

tagger = nltk.perceptron.PerceptronTagger()


# === Initialize NLP Tools ===

# Translation table that deletes every character of string.punctuation when passed to str.translate().
translator = str.maketrans('', '', punctuation) 
# NOTE(review): a PerceptronTagger is also constructed among the imports above; this rebinds the same name.
tagger = nltk.perceptron.PerceptronTagger()
# English Snowball stemmer (used when stemming the tagged tokens).
stemmer = SnowballStemmer("english")


# === Set Working Directory ===

# Set your working directory (adjust this as needed)
wd = Path(r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit")
wd_models = wd / "models"
wd_results = wd / "results"


# === Define Folder Paths ===

# Make sure that you have these folders in your working directory
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_preprocessed = data_c / "preprocessed"
fig = wd / "Code" / "0_data_preparation_descriptives" / "fig"

# The scoring, recomposition and merge cells read and write through `wd_data`,
# which was never defined and therefore raised NameError on a fresh kernel.
# NOTE(review): pointing it at the main data folder is an assumption — confirm
# that un_corpus_merged.csv and the temp_distances_main_*.pkl files belong there.
wd_data = data_c

# Load resources: stopword list, word-frequency tables and the two seed dictionaries
stopwords = joblib.load(data_c / "stopwords.pkl")
word_counts_stemmed, word_counts_weighted = (
    joblib.load(data_freq / freq_file)
    for freq_file in ("word_counts_stemmed.pkl", "word_counts_weighted.pkl")
)
affect_dic, cognition_dic = (
    joblib.load(data_dict / dict_file)
    for dict_file in ('dictionary_affect.pkl', 'dictionary_cognition.pkl')
)

In [2]:
os.chdir(data_temp)

# Absolute paths (as strings) of the cleaned speech chunks
cleaned_files = [
    str(data_temp / chunk_name)
    for chunk_name in (
        'clean_speeches_indexed1.pkl',
        'clean_speeches_indexed2.pkl',
        'clean_speeches_indexed3.pkl',
        'clean_speeches_indexed4.pkl',
    )
]

In [4]:
def extract_sentences(dataname):
    """Turn one pickled chunk of cleaned speeches into training sentences.

    The chunk is loaded with joblib and is expected to be a sequence of
    records whose second element (index 1) is the speech text. Each text is
    split into sentences, tokenized with gensim's ``simple_preprocess``,
    stripped of digits-only and very short tokens, reduced to tokens whose
    POS tag starts with N, V or J, stemmed, and filtered against the stopword
    list and the stem-frequency table. Sentences left with fewer than two
    tokens are dropped and the rest are shuffled in place.

    The result is written next to the input file, with ``clean_speeches_``
    replaced by ``sentences_`` in the file name.

    Parameters
    ----------
    dataname : str or path-like
        Path of the pickled chunk to process.
    """
    data = joblib.load(dataname)
    data = [a[1] for a in data]  # keep only text, no id

    sentences = []
    for doc in data:
        sentences += sent_tokenize(doc)  # use nltk's sent_tokenize here

    # Keep sentences with more than one whitespace-separated word, then tokenize
    sentences = [item for item in sentences if len(item.split()) > 1]
    sentences = [gensim.utils.simple_preprocess(item) for item in sentences]

    sentences = [[a for a in s if not a.isdigit()] for s in sentences]
    sentences = [[a for a in s if len(a) > 2] for s in sentences]

    # POS-tag and keep only tags starting with N, V or J
    sentences = [tagger.tag(s) for s in sentences]
    sentences = [[i[0] for i in s if i[1].startswith(('N', 'V', 'J'))] for s in sentences]

    sentences = [[stemmer.stem(i) for i in s] for s in sentences]
    sentences = [[a for a in s if a not in stopwords] for s in sentences]
    # .get() treats stems missing from the frequency table as count 0 instead of
    # raising KeyError (plain dict) or silently inserting keys (defaultdict).
    sentences = [[a for a in s if word_counts_stemmed.get(a, 0) >= 10] for s in sentences]

    sentences = [s for s in sentences if len(s) > 1]  # drop empty and one-token sentences
    shuffle(sentences)

    # Rename only the file name, never a parent folder. Going through Path also
    # protects against a Path argument, where `.replace` would be Path.replace
    # (a file rename) rather than string substitution.
    src = Path(dataname)
    out_name = src.name.replace('clean_speeches_', 'sentences_').replace('_.pkl', '.pkl')
    lab = str(src.with_name(out_name))
    print(f'{dataname} processed')
    joblib.dump(sentences, lab)
    print(f'{lab} saved')

# Run for all your files: each call writes one sentences_*.pkl next to its input chunk
for fname in cleaned_files:
    extract_sentences(fname)

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed1.pkl processed
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed1.pkl saved
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed2.pkl processed
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed2.pkl saved
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed3.pkl processed
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed3.pkl saved
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\clean_speeches_indexed4.pkl processed
C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\sentences_indexed4.pkl saved


In [6]:
# Absolute paths: the sentence files are written into data_temp, and other cells
# change the working directory (data_c, data_preprocessed), so bare file names
# would only load when the cells happen to run in one particular order.
sentences_files = [
    str(data_temp / sentence_file)
    for sentence_file in (
        'sentences_indexed1.pkl',
        'sentences_indexed2.pkl',
        'sentences_indexed3.pkl',
        'sentences_indexed4.pkl',
    )
]

In [17]:
# Collect the tokenized sentences of every sentence file into one flat list
dataset = []
for dataname in sentences_files:
    data = joblib.load(dataname)
    dataset += data  # flat list of sentences, not a list of per-file lists

# === Model training ===
w2v_params = dict(
    vector_size=300,  # dimensionality of the word vectors
    window=8,         # context window size
    min_count=10,     # ignore words seen fewer than 10 times
    workers=8,        # training threads
    sample=1e-3,      # down-sampling threshold for frequent words
    epochs=10,        # passes over the corpus
)
w2v = Word2Vec(sentences=dataset, **w2v_params)

# Precompute the per-vector norms (gensim 4 API; older releases used init_sims())
w2v.wv.fill_norms()

# Save model, creating the models folder first if it does not exist yet
model_file = wd_models / 'w2v-vectors_8_300.pkl'
wd_models.mkdir(parents=True, exist_ok=True)
w2v.save(str(model_file))

In [19]:
# Reload the trained model from disk and keep a handle on its embeddings
model_path = wd_models / "w2v-vectors_8_300.pkl"
w2v = Word2Vec.load(str(model_path))
word_vectors = w2v.wv

In [25]:
##################################
# Find the centroid             ###
###################################

def findcentroid(text, model):
    """Return the weighted centroid of the words in ``text`` as a (1, dim) array.

    Every word found in ``model.wv`` contributes its vector multiplied by its
    entry in ``word_counts_weighted``; words missing from the model are
    skipped. The centroid is the element-wise mean of those weighted vectors.

    Parameters
    ----------
    text : iterable of str
        Words (e.g. a seed dictionary) to average.
    model : object exposing ``.wv``
        Trained embedding model; ``model.wv`` must support ``in`` and indexing.

    Returns
    -------
    numpy.ndarray
        Row vector of shape (1, dim).

    Raises
    ------
    ValueError
        If none of the words are in the model vocabulary. Previously this
        produced a NaN centroid that silently propagated into every score.
    """
    vecs = [model.wv[w] * word_counts_weighted[w] for w in text if w in model.wv]
    if not vecs:
        raise ValueError("findcentroid: none of the given words are in the model vocabulary")
    return np.mean(vecs, axis=0).reshape(1, -1)


# Weighted centroids of the affect and cognition seed dictionaries
affect_centroid = findcentroid(affect_dic, w2v)
cog_centroid = findcentroid(cognition_dic, w2v)


###################################
# Save                          ###
###################################

os.chdir(data_c)
# joblib.dump does not create missing folders, so make sure centroids/ exists
os.makedirs('centroids', exist_ok=True)
joblib.dump(affect_centroid, 'centroids/affect_centroid.pkl')
joblib.dump(cog_centroid, 'centroids/cog_centroid.pkl')


['centroids/cog_centroid.pkl']

In [27]:
# Show both centroid vectors in full
print("Affect centroid vector:\n", affect_centroid)
print("\nCognition centroid vector:\n", cog_centroid)

# Shapes of the two centroids
print("\nShape of affect centroid:", affect_centroid.shape)
print("Shape of cognition centroid:", cog_centroid.shape)

# Min / max / mean over all components of each centroid
print(
    "\nAffect centroid summary: min, max, mean:",
    affect_centroid.min(), affect_centroid.max(), affect_centroid.mean(),
)
print(
    "Cognition centroid summary: min, max, mean:",
    cog_centroid.min(), cog_centroid.max(), cog_centroid.mean(),
)

Affect centroid vector:
 [[ 5.24660684e-02  1.33741319e-01 -8.37163553e-02  1.05239064e-01
   5.16792433e-03 -2.74070710e-01  3.96473110e-02  2.18954951e-01
  -1.44893322e-02  4.20026295e-02 -2.16143996e-01 -4.11862917e-02
   7.74786696e-02  2.32501794e-02 -2.76877433e-01 -7.23662078e-02
   8.59653279e-02  4.53436635e-02  4.13618274e-02 -9.77352187e-02
  -1.24463655e-01 -1.70920249e-02 -1.19915092e-02 -1.06752619e-01
   2.31602266e-01 -5.45996949e-02 -1.07075058e-01  9.65755805e-03
   2.44047716e-02 -1.78962246e-01  1.51880831e-01 -4.10291441e-02
  -1.21981084e-01  1.12318166e-01 -3.43517736e-02 -1.33787235e-02
  -9.91963074e-02 -2.22331107e-01  1.52626354e-02 -8.01816881e-02
   8.87374133e-02  1.89637672e-02  7.91744441e-02 -2.94685904e-02
   1.27999365e-01  1.49500817e-01 -1.02076359e-01  5.58054410e-02
  -5.84671162e-02  5.05480124e-03  6.21940494e-02  1.62476990e-02
   5.66442125e-02  3.07438914e-02 -1.92840993e-02  1.90077394e-01
  -4.98453565e-02 -6.03052368e-03  1.99061669e-02 -

## Emotionality Score

In [30]:
# Switch the working directory to the preprocessed-data folder
os.chdir(data_preprocessed)

# === Load preprocessed speech data ===

preprocessed_names = (
    'preprocessed_speeches_indexed1.pkl',
    'preprocessed_speeches_indexed2.pkl',
    'preprocessed_speeches_indexed3.pkl',
    'preprocessed_speeches_indexed4.pkl',
)
# One loaded chunk per file, in the order listed above
preprocessed_files = [
    joblib.load(os.path.join(data_preprocessed, chunk_name))
    for chunk_name in preprocessed_names
]


In [None]:
###################################
# Define Functions              ###
###################################

def documentvecweight(lista):
    """Score each document by its distance to the affect and cognition centroids.

    Each item of ``lista`` is indexed as ``item[0]`` (an identifier, passed
    through unchanged) and ``item[1]`` (a sequence of tokens). Items with no
    tokens are skipped. For the rest, the document vector is the mean of the
    in-vocabulary word vectors, each multiplied by its ``word_counts_weighted``
    entry.

    Parameters
    ----------
    lista : iterable
        Documents as described above.

    Returns
    -------
    list of [id, a, c, score]
        ``a`` and ``c`` are the cosine distances to the affect and cognition
        centroids, and ``score`` is ``(2 - a) / (2 - c)``. All three are NaN
        when none of the document's tokens are in the model vocabulary.
    """
    out = []
    for s in lista:
        if len(s[1]) == 0:
            continue
        vecs = [w2v.wv[w] * word_counts_weighted[w] for w in s[1] if w in w2v.wv]
        if len(vecs) == 0:
            a = np.nan
            c = np.nan
            score = np.nan
        else:
            # scipy's cosine() is documented for 1-D vectors; the centroids are
            # stored as (1, dim) rows, which stricter SciPy releases reject with
            # "Input vector should be 1-D". Flatten both sides before comparing.
            v = np.mean(vecs, axis=0).reshape(-1)
            a = cosine(v, affect_centroid.reshape(-1))
            c = cosine(v, cog_centroid.reshape(-1))
            score = (1 + 1 - a) / (1 + 1 - c)
        out.append([s[0], a, c, score])
    return out


def main_function(file_path, idx):
    """Score one preprocessed chunk and store the result as a temp file.

    Loads the chunk at ``file_path``, runs ``documentvecweight`` on it and
    dumps the resulting list to ``temp_distances_main_{idx}.pkl`` inside
    ``wd_data``.
    """
    distances = documentvecweight(joblib.load(file_path))
    out_path = os.path.join(wd_data, f'temp_distances_main_{idx}.pkl')
    joblib.dump(distances, out_path)

###################################
#      Multiprocessing          ###
###################################

def main():
    """Score all four preprocessed chunks concurrently.

    Each chunk ``preprocessed_speeches_indexed{k}.pkl`` (k = 1..4) is handed
    to ``main_function`` together with its 1-based index.

    Threads are used instead of ``multiprocessing.Pool``. Pool workers must
    import ``main_function`` from ``__main__``, which does not work for
    functions defined in an interactive session such as a notebook. Spawned
    workers (the Windows default) would also start without the notebook's
    globals (``w2v``, the centroids, ``word_counts_weighted``). Threads share
    the kernel's namespace, so neither problem arises.
    """
    # Build list of file paths to your preprocessed chunks
    files = [
        os.path.join(data_preprocessed, f'preprocessed_speeches_indexed{i+1}.pkl')
        for i in range(4)
    ]

    # Each worker receives (file_path, index)
    joblib.Parallel(n_jobs=len(files), prefer="threads")(
        joblib.delayed(main_function)(f, i + 1) for i, f in enumerate(files)
    )

if __name__ == "__main__":
    freeze_support()  # harmless no-op outside frozen Windows executables
    main()

###################################
#      Recompose everything     ###
###################################

# Paths of the per-chunk distance files, one per loaded preprocessed chunk
DATA_temp = [
    os.path.join(wd_data, f'temp_distances_main_{i+1}.pkl')
    for i in range(len(preprocessed_files))
]

# Concatenate the per-chunk result lists in chunk order
tot = []
for dataname in DATA_temp:
    tot.extend(joblib.load(dataname))

# One row per document: identifier, both distances and the score
score_columns = ['filename', 'affect_d', 'cognition_d', 'score']
tot_df = pd.DataFrame(tot, columns=score_columns)
joblib.dump(tot_df, os.path.join(wd_data, 'distances_10epochs.pkl'))

In [None]:
# Input corpus and the two output locations
corpus_csv = os.path.join(wd_data, "un_corpus_merged.csv")
scores_csv = os.path.join(wd_data, "un_corpus_merged_with_scores.csv")
scores_pkl = os.path.join(wd_data, "un_corpus_merged_with_scores.pkl")

# Load the main corpus and attach the scores by filename (left join keeps every corpus row)
un_corpus_merged = pd.read_csv(corpus_csv).merge(tot_df, on="filename", how="left")

# Save the merged frame as CSV ...
un_corpus_merged.to_csv(scores_csv, index=False)

# ... and as a pickle for faster loading later
joblib.dump(un_corpus_merged, scores_pkl)