In [27]:
import os
from pathlib import Path
from random import shuffle
from string import punctuation

import gensim
import joblib
import nltk
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
# from gensim.summarization.textcleaner import get_sentences

tagger = nltk.perceptron.PerceptronTagger()


# === Initialize NLP Tools ===

# Translation table that deletes every ASCII punctuation character.
# `punctuation` is the constant from the standard-library `string` module.
translator = str.maketrans('', '', punctuation)
# `tagger` (NLTK perceptron POS tagger) is already instantiated next to the
# imports at the top of this cell, so it is not loaded a second time here.
stemmer = SnowballStemmer("english")


# === Set Working Directory ===

# Set your working directory (adjust this as needed)
wd = Path(r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit")
# Models live in a sub-folder of the working directory; deriving the path from
# `wd` means only one line has to change when the project is moved.
wd_model = wd / "models"

# === Define Folder Paths ===

# Make sure that you have these folders in your working directory
data_c = wd / "data"
data_temp = data_c / "temp"
data_freq = data_c / "freq"
data_dict = data_c / "dictionaries"
data_preprocessed = data_c / "preprocessed"
fig = wd / "Code" / "0_data_preparation_descriptives" / "fig"

# === Load resources ===

# NOTE(review): these are pickles; only load files you produced yourself,
# because unpickling untrusted data can execute arbitrary code.
stopwords = joblib.load(data_c / "stopwords.pkl")
# Presumably per-stem corpus frequencies -- TODO confirm the exact schema.
word_count = joblib.load(data_freq / "word_counts_stemmed.pkl")
# Per-word weights; `findcentroid` multiplies each word vector by its entry.
word_counts_weighted = joblib.load(data_freq / "word_counts_weighted.pkl")

In [43]:
# Later cells open the sentence files by bare file name (see `sentences_files`),
# so the kernel's current directory has to be the temp data folder.
os.chdir(data_temp)

In [45]:
# The four cleaned speech chunks (numbered 1-4) inside the temp data folder.
cleaned_files = [
    data_temp / f'clean_speeches_indexed{chunk}.pkl' for chunk in range(1, 5)
]

In [47]:
def extract_sentences(dataname):
    """Convert one pickled file of cleaned speeches into training sentences.

    The input pickle is expected to hold a sequence of records whose second
    element (index 1) is the speech text. Each text is split into sentences,
    tokenised, POS-filtered, stemmed and frequency-filtered; the resulting
    list of token lists is shuffled and pickled next to the input file with
    ``clean_speeches_`` replaced by ``sentences_`` in the file name.

    Parameters
    ----------
    dataname : str or pathlib.Path
        Path of the ``clean_speeches_*.pkl`` file to process.

    Returns
    -------
    None
        The result is written to disk; progress is printed.
    """
    # Accept both plain strings and Path objects. Calling `.replace(a, b)`
    # directly on a Path would hit Path.replace (a file *rename* that takes a
    # single argument) and raise TypeError, so the renaming is done on the
    # file-name string only.
    source_path = Path(dataname)

    data = joblib.load(source_path)
    data = [a[1] for a in data]  # keep only text, no id

    sentences = []
    for doc in data:
        sentences += sent_tokenize(doc)  # use nltk's sent_tokenize here

    # Drop one-word "sentences", then lowercase/tokenise with gensim.
    sentences = [item for item in sentences if len(item.split()) > 1]
    sentences = [gensim.utils.simple_preprocess(item) for item in sentences]

    # Remove pure-digit tokens and tokens shorter than three characters.
    sentences = [[a for a in s if not a.isdigit()] for s in sentences]
    sentences = [[a for a in s if len(a) > 2] for s in sentences]

    # Keep only tokens whose POS tag starts with N, V or J
    # (noun / verb / adjective tags in the Penn Treebank tag set).
    sentences = [tagger.tag(s) for s in sentences]
    sentences = [[i[0] for i in s if i[1].startswith(('N', 'V', 'J'))] for s in sentences]

    sentences = [[stemmer.stem(i) for i in s] for s in sentences]
    sentences = [[a for a in s if a not in stopwords] for s in sentences]
    # Frequency filter against the stemmed word counts loaded at the top of the
    # notebook (`word_count`); `.get(..., 0)` treats unseen stems as count 0 so
    # a plain dict does not raise KeyError.
    sentences = [[a for a in s if word_count.get(a, 0) >= 10] for s in sentences]

    # Drop sentences left with fewer than two tokens (this also removes empties).
    sentences = [s for s in sentences if len(s) > 1]
    shuffle(sentences)

    out_name = source_path.name.replace('clean_speeches_', 'sentences_').replace('_.pkl', '.pkl')
    lab = source_path.with_name(out_name)
    print(f'{dataname} processed')
    joblib.dump(sentences, lab)
    print(f'{lab} saved')

# Build the sentence file for each cleaned speech chunk, one after another
for speech_file in cleaned_files:
    extract_sentences(speech_file)

clean_speeches_indexed1.pkl processed
sentences_indexed1.pkl saved
clean_speeches_indexed2.pkl processed
sentences_indexed2.pkl saved
clean_speeches_indexed3.pkl processed
sentences_indexed3.pkl saved
clean_speeches_indexed4.pkl processed
sentences_indexed4.pkl saved


In [55]:
# File names of the four sentence pickles (chunks 1-4), without a directory.
sentences_files = [f'sentences_indexed{chunk}.pkl' for chunk in range(1, 5)]

In [71]:
dataset = []

for dataname in sentences_files:  # <-- your list of sentence files
    # Resolve against data_temp explicitly so this cell does not depend on
    # whichever os.chdir() call happened to run last in the kernel.
    data = joblib.load(data_temp / dataname)
    dataset.extend(data)  # extend (not append) so all sentences end up in one flat list

# === Model training ===
# NOTE: with workers > 1 the training is not bit-for-bit reproducible between runs.
w2v = Word2Vec(
    sentences=dataset,    # list of tokenized sentences (each a list of str)
    vector_size=300,      # Word vector dimensionality
    window=8,             # Context window size
    min_count=10,         # Minimum word count
    workers=8,            # Number of threads
    sample=1e-3,          # Downsample setting for frequent words
    epochs=10             # Number of iterations over the corpus
)

# Make sure per-vector L2 norms are available (gensim >= 4 API, matching the
# `vector_size` / `epochs` keywords used above). This does not reduce memory use.
w2v.wv.fill_norms()

# Save model
wd_model.mkdir(parents=True, exist_ok=True)  # create folder if it doesn't exist
w2v.save(str(wd_model / 'w2v-vectors_8_300.pkl'))

In [29]:
# Load the dictionaries themselves. Building only the path string (as before)
# made `findcentroid` iterate over the *characters of the file path* instead
# of over dictionary words.
# Assumes each pickle holds an iterable of word strings in the same (stemmed)
# form as the model vocabulary -- TODO confirm.
affect = joblib.load(data_dict / 'dictionary_affect.pkl')
cognition = joblib.load(data_dict / 'dictionary_cognition.pkl')
w2v = Word2Vec.load(str(wd_model / "w2v-vectors_8_300.pkl"))
word_vectors = w2v.wv

In [33]:
##################################
# Find the centroid             ###
###################################

def findcentroid(text, model, weights=None):
    """Return the weighted centroid of the word vectors of ``text``.

    Every word of ``text`` that is present in ``model.wv`` contributes its
    vector multiplied by its weight; the centroid is the plain arithmetic mean
    of those scaled vectors (it is *not* divided by the sum of the weights).

    Parameters
    ----------
    text : iterable of str
        Words (e.g. a dictionary word list) to average over.
    model : object with a ``wv`` attribute
        ``model.wv`` must support ``word in model.wv`` and ``model.wv[word]``
        (e.g. a trained gensim ``Word2Vec`` model).
    weights : mapping of str to float, optional
        Per-word weights. Defaults to the notebook-level
        ``word_counts_weighted`` table.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, vector_dim)``.

    Raises
    ------
    ValueError
        If no word of ``text`` is in the model vocabulary. Previously this
        case silently produced a NaN centroid.
    KeyError
        If a word is in the vocabulary but has no entry in ``weights``.
    """
    if weights is None:
        weights = word_counts_weighted

    vecs = [model.wv[w] * weights[w] for w in text if w in model.wv]
    if not vecs:
        raise ValueError("findcentroid: none of the given words is in the model vocabulary")

    centroid = np.mean(vecs, axis=0)
    # Row-vector shape so the result can be passed straight to pairwise
    # similarity functions that expect 2-D input.
    return centroid.reshape(1, -1)


c_affect = findcentroid(affect, w2v)
c_cognition = findcentroid(cognition, w2v)


###################################
# Save                          ###
###################################

os.chdir(data_c)
# Create the output folder if it is missing so the dumps below cannot fail on it.
Path('centroids').mkdir(parents=True, exist_ok=True)
# Persist the centroid arrays themselves. The previous version dumped `data_c`
# (the data-folder Path) into both files, so no centroid was ever saved.
joblib.dump(c_affect, 'centroids/affect_centroid.pkl')
joblib.dump(c_cognition, 'centroids/cog_centroid.pkl')


['centroids/cog_centroid.pkl']