In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import os

# Read the train and test CSV files (paths are relative to the working directory)
train_df = pd.read_csv(r"./cleaned_train.csv")
test_df = pd.read_csv(r"./cleaned_test.csv")

# Tokenize each non-missing 'text_clean' entry of the training set with NLTK;
# rows whose text is NaN are dropped, so this list can be shorter than train_df
tokenized_train = [word_tokenize(doc) for doc in train_df['text_clean'].dropna()]

# Train a skip-gram Word2Vec model on the tokenized training texts only.
# NOTE: per the gensim documentation, a fixed `seed` only gives a
# deterministic run when training uses a single worker thread (multi-threaded
# training adds ordering jitter from OS thread scheduling), so `workers` is 1.
# Reproducibility across interpreter launches additionally requires setting
# the PYTHONHASHSEED environment variable before Python starts.
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=2,      # ignore tokens that occur fewer than 2 times
    workers=1,        # single thread so that seed=42 is actually effective
    sg=1,             # 1 selects skip-gram (0 would be CBOW)
    seed=42,
)

# Sanity check: number of vectors in the model and the first ten vocabulary keys
print(f"Vocabulary size: {len(w2v_model.wv)}")
sample_vocab = w2v_model.wv.index_to_key[:10]
print("Sample vocab:", sample_vocab)

# Persist the trained model (relative path) and report where it ended up
model_path = "word2vec_model.model"
w2v_model.save(model_path)
print(f"Model saved to: {os.path.abspath(model_path)}")

# Texts to embed for both splits: the 'text_clean' column with missing
# values replaced by empty strings
train_corpus, test_corpus = (
    frame['text_clean'].fillna("") for frame in (train_df, test_df)
)

# Define embedding function
def get_average_w2v(text, model):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of *text*.

    Args:
        text: String to embed; it is split with NLTK's ``word_tokenize``.
        model: Trained model exposing ``wv`` (token-to-vector lookup that
            supports ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of every token found in
        ``model.wv`` (a token occurring several times counts each time),
        or a zero vector of length ``model.vector_size`` when none of the
        tokens is in the vocabulary.
    """
    vocab_vectors = model.wv
    # Out-of-vocabulary tokens are skipped rather than mapped to a placeholder.
    hits = [
        vocab_vectors[token]
        for token in word_tokenize(text)
        if token in vocab_vectors
    ]
    if not hits:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(model.vector_size)
    return np.mean(hits, axis=0)

# Embed every text of each split; each array holds one averaged vector per text
train_features, test_features = (
    np.array([get_average_w2v(doc, w2v_model) for doc in corpus])
    for corpus in (train_corpus, test_corpus)
)

# Write the feature arrays as .npy files
np.save("w2v_train_features.npy", train_features)
np.save("w2v_test_features.npy", test_features)

# Write the training labels (only the 'target' column)
train_df.loc[:, ['target']].to_csv("w2v_train_labels.csv", index=False)
# The test set has no target, so the entire test dataframe is saved instead
test_df.to_csv("w2v_test_labels.csv", index=False)


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary size: 6524
Sample vocab: ['the', 'a', 'to', 'in', 'of', 'i', 'and', 'is', 'you', 'for']
Model saved to: /home/jovyan/Documents/cs549/tweet-classifier/datasets/word2vec_model.model
