In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import os

# NLTK setup: fetch the 'punkt' resource (presumably the tokenizer data that
# word_tokenize relies on below).
nltk.download('punkt')

# Load the pre-cleaned train/test splits from the working directory.
train_df = pd.read_csv(r"./cleaned_train.csv")
test_df = pd.read_csv(r"./cleaned_test.csv")

# Step 1: Train Word2Vec on the training text only.
# Rows whose 'text_clean' is missing are skipped; every remaining document
# becomes one token list ("sentence") fed to the model.
tokenized_train = [
    word_tokenize(document) for document in train_df['text_clean'].dropna()
]

# Hyperparameters are gathered in one place so they are easy to audit.
# NOTE(review): gensim documents that a fixed seed alone does not give fully
# reproducible vectors when workers > 1 -- confirm whether that matters here.
w2v_params = {
    "vector_size": 100,  # dimensionality of each word vector
    "window": 5,
    "min_count": 2,
    "workers": 4,
    "sg": 1,  # skip-gram (per gensim docs), as opposed to CBOW
    "seed": 42,
}
w2v_model = Word2Vec(sentences=tokenized_train, **w2v_params)

# Step 2: Train TF-IDF
# Missing texts are replaced by empty strings so both splits keep every row.
train_corpus = train_df['text_clean'].fillna("")
test_corpus = test_df['text_clean'].fillna("")

# The vectorizer is fitted on the training corpus only; the test corpus is
# merely transformed with the vocabulary and IDF values learned from train.
# NOTE(review): TfidfVectorizer() uses its own default analyzer, whereas the
# embedding step tokenizes with word_tokenize; tokens absent from this
# vocabulary receive no IDF weight there -- confirm this is intended.
tfidf = TfidfVectorizer()
tfidf_train_matrix = tfidf.fit_transform(train_corpus)
tfidf_test_matrix = tfidf.transform(test_corpus)

# Save TF-IDF matrices (train first, then test).
for npz_path, matrix in (
    (r"./tfidf_train_matrix.npz", tfidf_train_matrix),
    (r"./tfidf_test_matrix.npz", tfidf_test_matrix),
):
    sparse.save_npz(npz_path, matrix)

print("TF-IDF matrices saved.")

# Step 3: Compute Weighted Word2Vec embeddings
# Lookup table from each vocabulary term to its fitted IDF value.
idf_weights = {
    term: weight
    for term, weight in zip(tfidf.get_feature_names_out(), tfidf.idf_)
}

def get_weighted_w2v(text, model, idf_dict, tokenizer=None):
    """Return the IDF-weighted mean of the word vectors of *text*.

    Every token found in both ``model.wv`` and ``idf_dict`` contributes its
    vector scaled by its IDF weight; the accumulated sum is divided by the
    total weight that was used. Tokens missing from either lookup are
    ignored.

    Args:
        text: Document to embed. Anything that is not a ``str`` (for example
            ``None`` or a NaN left over from pandas) is treated as an empty
            document instead of crashing the tokenizer.
        model: Object exposing ``wv`` (supporting ``in`` and ``[]`` by token)
            and ``vector_size``, such as a trained gensim ``Word2Vec``.
        idf_dict: Mapping from token to its IDF weight.
        tokenizer: Optional callable turning ``text`` into an iterable of
            tokens. Defaults to ``word_tokenize``; pass a different callable
            to match the tokenization that produced ``idf_dict``.

    Returns:
        A 1-D float64 array of length ``model.vector_size``. It is all zeros
        when no token contributes or when the contributing weights sum to
        zero (which would otherwise produce NaN/inf).
    """
    if not isinstance(text, str):
        # Missing values carry no tokens; mirror the "no known words" case.
        return np.zeros(model.vector_size)

    tokenize = word_tokenize if tokenizer is None else tokenizer

    # Accumulate in place rather than collecting one array per token; the
    # float64 accumulator also keeps the output dtype independent of the
    # dtype of the stored word vectors.
    weighted_sum = np.zeros(model.vector_size)
    weight_sum = 0.0
    for word in tokenize(text):
        if word in model.wv and word in idf_dict:
            weight = idf_dict[word]
            weighted_sum += model.wv[word] * weight
            weight_sum += weight

    if weight_sum == 0:
        # Either nothing matched or every matching weight was zero.
        return np.zeros(model.vector_size)
    return weighted_sum / weight_sum

# Compute features: one IDF-weighted Word2Vec embedding per document.
def _embed_corpus(corpus):
    """Embed every document of *corpus* and stack the results in an array."""
    rows = []
    for document in corpus:
        rows.append(get_weighted_w2v(document, w2v_model, idf_weights))
    return np.array(rows)


train_features_w2v = _embed_corpus(train_corpus)
test_features_w2v = _embed_corpus(test_corpus)

# Save Word2Vec features (train first, then test).
for npy_path, features in (
    (r"./w2v_tfidf_train_features.npy", train_features_w2v),
    (r"./w2v_tfidf_test_features.npy", test_features_w2v),
):
    np.save(npy_path, features)

# Save labels
# Train: only the 'target' column is written.
train_labels = train_df[['target']]
train_labels.to_csv(r"./w2v_tfidf_train_labels.csv", index=False)
# Test: the entire frame is written, not just a label column.
# NOTE(review): this differs from the train export above -- presumably the
# test split has no 'target' column; confirm that writing all columns to a
# "labels" file is intended.
test_df.to_csv(r"./w2v_tfidf_test_labels.csv", index=False)

print("Word2Vec weighted embeddings, TF-IDF matrices, and labels saved!")

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TF-IDF matrices saved.
Word2Vec weighted embeddings, TF-IDF matrices, and labels saved!
