In [16]:
import pickle

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from scipy import sparse
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the preprocessed tasks; the 'lemmas' column is the text input for both
# feature extractors below.
df = pd.read_csv("tasks_preprocessed.csv")
# Replace missing lemma strings with "" -- a NaN would break the `.split()`
# calls used for tokenisation further down.
texts = df['lemmas'].fillna("")

# TF-IDF over unigrams and bigrams, vocabulary capped at 3000 terms.
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(texts)

# Word2Vec (documents are later embedded as the simple average of word vectors).
# Tokens come from whitespace-splitting each document.
sentences = [s.split() for s in texts]
# Reproducibility: gensim only guarantees deterministic training with a fixed
# seed AND a single worker thread (multi-threaded runs suffer ordering jitter).
# Across interpreter launches PYTHONHASHSEED must additionally be fixed.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, including those seen only once
    workers=1,
    seed=42,
    epochs=20,
)

def sentence_vector(sent, model=None):
    """Embed a whitespace-tokenised document as the mean of its word vectors.

    Parameters
    ----------
    sent : str
        Document text; tokens are obtained with ``str.split()``.
    model : optional
        Object exposing ``.wv`` (supports ``token in wv`` and ``wv[token]``)
        and ``.vector_size``. Defaults to the notebook-level ``w2v_model`` so
        existing single-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. An all-zero vector is
        returned when none of the tokens are in the model vocabulary
        (including the empty-string case).
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook.
        model = w2v_model
    keyed = model.wv
    vecs = [keyed[w] for w in sent.split() if w in keyed]
    if not vecs:
        # float32 matches the dtype gensim uses for its embeddings, so the
        # fallback row does not differ in dtype from the averaged rows.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vecs, axis=0)

# Embed every document as the mean of its word vectors: one row per document.
X_w2v = np.vstack([sentence_vector(s) for s in texts])

# Combine TF-IDF and w2v features column-wise (optional).
# The dense w2v matrix is wrapped as sparse so it can be stacked with the
# sparse TF-IDF matrix; CSR is requested explicitly because scipy's hstack
# otherwise returns COO, which does not support row indexing.
X_w2v_sparse = sparse.csr_matrix(X_w2v)
X_combined = hstack([X_tfidf, X_w2v_sparse], format="csr")

# Labels
y = df['priority']  # 0,1,2

# NOTE(review): `tfidf` and `w2v_model` were fitted on the full corpus before
# this split, so test documents influenced the features (data leakage).
# Consider splitting first and fitting the vectorizers on the training part only.

# Train-test split (stratified so both parts keep the label proportions).
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)

print("Shapes:", X_train.shape, X_test.shape)
# Persist the four split objects as one tuple for later cells/notebooks.
with open("train_test_data.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

print("✅ train_test_data.pkl created successfully!")


Shapes: (800, 358) (200, 358)
✅ train_test_data.pkl created successfully!
