In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("tasks_preprocessed.csv")
# We'll use lemmas column for TF-IDF
texts = df['lemmas'].fillna("")

# TF-IDF
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(texts)

# Word2Vec (simple average of word vectors)
sentences = [s.split() for s in texts]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=2, epochs=20)

def sentence_vector(sent):
    vecs = []
    for w in sent.split():
        if w in w2v_model.wv:
            vecs.append(w2v_model.wv[w])
    if len(vecs)==0:
        return np.zeros(w2v_model.vector_size)
    return np.mean(vecs, axis=0)

X_w2v = np.vstack([sentence_vector(s) for s in texts])

# Combine TF-IDF and w2v (optional)
from scipy.sparse import hstack
# Convert w2v to sparse to hstack (or use numpy concatenate if not sparse)
from scipy import sparse
X_w2v_sparse = sparse.csr_matrix(X_w2v)
X_combined = hstack([X_tfidf, X_w2v_sparse])

# Labels
y = df['priority']  # 0,1,2

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

print("Shapes:", X_train.shape, X_test.shape)
with open("train_test_data.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

print("✅ train_test_data.pkl created successfully!")


Shapes: (800, 358) (200, 358)
✅ train_test_data.pkl created successfully!


In [11]:
# file: train_classifiers.py
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from joblib import dump, load
from sklearn.model_selection import GridSearchCV

# Load prepared X_train, X_test, y_train, y_test from previous step
import pickle
with open("train_test_data.pkl","rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# Naive Bayes (note: MultinomialNB expects non-negative inputs - works with TF-IDF part)
# If using combined features with negative values, better to use other models. Here we make a TF-IDF-only example.
from sklearn.feature_extraction.text import TfidfVectorizer
df = pd.read_csv("tasks_preprocessed.csv")
texts = df['lemmas'].fillna("")
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))
X_tfidf_full = tfidf.fit_transform(texts)
y_full = df['priority']
Xtr, Xte, ytr, yte = X_tfidf_full[:400], X_tfidf_full[400:], y_full[:400], y_full[400:]

mnb = MultinomialNB()
mnb.fit(Xtr, ytr)
pred_nb = mnb.predict(Xte)
print("Naive Bayes accuracy:", accuracy_score(yte, pred_nb))
print(classification_report(yte, pred_nb))

# SVM (Linear)
svm = LinearSVC(random_state=42, max_iter=10000)
svm.fit(Xtr, ytr)
pred_svm = svm.predict(Xte)
print("SVM accuracy:", accuracy_score(yte, pred_svm))
print(classification_report(yte, pred_svm))

# Save models
dump(mnb, "model_mnb.joblib")
dump(svm, "model_svm.joblib")
dump(tfidf, "tfidf_vectorizer.joblib")
print("Models and vectorizer saved.")


Naive Bayes accuracy: 0.7416666666666667
              precision    recall  f1-score   support

           0       0.68      1.00      0.81       336
           1       0.00      0.00      0.00        82
           2       1.00      0.60      0.75       182

    accuracy                           0.74       600
   macro avg       0.56      0.53      0.52       600
weighted avg       0.69      0.74      0.68       600

SVM accuracy: 0.725
              precision    recall  f1-score   support

           0       0.69      0.96      0.80       336
           1       0.07      0.01      0.02        82
           2       0.97      0.62      0.75       182

    accuracy                           0.73       600
   macro avg       0.57      0.53      0.52       600
weighted avg       0.69      0.72      0.68       600

Models and vectorizer saved.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
