In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from joblib import dump

# Train and persist the task-priority classifier.
#
# Pipeline: load the preprocessed tasks -> split rows into train/test ->
# fit TF-IDF on the training texts only -> append numeric features ->
# train XGBoost -> evaluate on the held-out rows -> save model + vectorizer.
#
# The split happens BEFORE the vectorizer is fitted so that the vocabulary and
# IDF weights are learned from training rows only; fitting on the full corpus
# would leak information from the test rows into the features.

# Load data
df = pd.read_csv("tasks_preprocessed.csv")

# Text input: the "lemmas" column, with missing values replaced by "".
texts = df["lemmas"].fillna("")

# Numeric features (change according to your dataset); missing values become 0.
numeric_cols = ["workload_hours", "assignee_completed_tasks", "assignee_avg_completion_days", "days_to_due"]
X_num = df[numeric_cols].fillna(0).values

# Target variable
y = df["priority"]

# Split rows first. Same test_size / random_state as before, so the same rows
# end up in the train and test partitions; all three arrays are split together
# to keep them aligned.
(
    texts_train, texts_test,
    X_num_train, X_num_test,
    y_train, y_test,
) = train_test_split(texts, X_num, y, test_size=0.2, random_state=42)

# TF-IDF for the XGB model: fit on training texts, only transform test texts.
tfidf_xgb = TfidfVectorizer(max_features=80, ngram_range=(1, 1))
X_text_train = tfidf_xgb.fit_transform(texts_train)
X_text_test = tfidf_xgb.transform(texts_test)

# Combine text + numeric columns (text columns first, then numeric_cols order).
# At inference time the features must be stacked in this same order.
X_train = hstack([X_text_train, X_num_train]).tocsr()
X_test = hstack([X_text_test, X_num_test]).tocsr()

# Full-dataset matrices, built with the already-fitted vectorizer. They are not
# used for training/evaluation here; kept in case later cells refer to them.
X_text = tfidf_xgb.transform(texts)
X = hstack([X_text, X_num])

# Train XGBoost (dense input, as in the original cell).
xgb = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
xgb.fit(X_train.toarray(), y_train)

# Evaluate on the held-out rows.
# NOTE(review): the recorded run reports 1.00 precision/recall for every class.
# A perfect score often means the target can be derived directly from the
# features -- confirm that "priority" is not computed from the numeric columns.
preds = xgb.predict(X_test.toarray())
print("Priority Model Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# Save model + vectorizer
dump(xgb, "xgb_priority.joblib")
dump(tfidf_xgb, "tfidf_vectorizer_xgb.joblib")
print("✅ Priority model & vectorizer saved successfully.")


Priority Model Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       116
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        60

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

✅ Priority model & vectorizer saved successfully.
