In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack

# --- Load data and build leakage-free features -----------------------------
# The TF-IDF vocabulary and IDF weights must be learned from training rows
# only; fitting on the full corpus lets test-set text influence the features
# and inflates the held-out score.
df = pd.read_csv("tasks_preprocessed.csv")
y = df['priority']
lemmas = df['lemmas'].fillna("")

# Split row *positions* first. The stratified split depends only on the number
# of samples, `y` and `random_state`, so this yields the same partition as
# splitting the feature matrix directly.
train_pos, test_pos = train_test_split(
    list(range(len(df))), test_size=0.2, random_state=42, stratify=y
)

# Fit the vectoriser on training text only, then project every row with it.
tfidf = TfidfVectorizer(max_features=1500)
tfidf.fit(lemmas.iloc[train_pos])
X_text = tfidf.transform(lemmas)

# Numeric features: missing values are replaced by the constant 0, which uses
# no statistics from the data and therefore cannot leak.
num_feat = df[['workload_hours','assignee_completed_tasks','assignee_avg_completion_days','days_to_due']].fillna(0).values

# hstack returns COO, which is not row-indexable; convert to CSR explicitly.
X = hstack([X_text, num_feat]).tocsr()

X_train, X_test = X[train_pos], X[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# --- Hyper-parameter search -------------------------------------------------
# 2 x 3 x 2 = 12 candidate settings, each scored with 3-fold cross-validation
# on the training split.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# The forest itself uses all cores (n_jobs=-1); the search is left at its
# default n_jobs.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
)
grid.fit(X_train, y_train)

# --- Held-out evaluation ----------------------------------------------------
# Report the winning configuration, then score the refitted best estimator on
# the test split, which the search never saw.
print("Best params:", grid.best_params_)

best = grid.best_estimator_
pred = best.predict(X_test)

test_accuracy = accuracy_score(y_test, pred)
print("Grid RF acc:", test_accuracy)

# Per-class precision / recall / F1 for the same predictions.
report = classification_report(y_test, pred)
print(report)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=5, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END max_depth=None, m