In [1]:
import pandas as pd
df = pd.read_csv('C:/Users/rohit/depression-risk-predictor/data/processed/sentiment_added_data.csv')


In [2]:
df_encoded = pd.get_dummies(df, columns=['gender', 'occupation'], drop_first=True)
X = df_encoded.drop(['risk', 'post_text'], axis=1)
y = df_encoded['risk']


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1, verbose=1)
grid_rf.fit(X_train_scaled, y_train)

best_rf = grid_rf.best_estimator_


Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [5]:
print("Best Parameters:", grid_rf.best_params_)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred_rf = best_rf.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Accuracy: 0.8095238095238095
[[ 4  1]
 [ 3 13]]
              precision    recall  f1-score   support

           0       0.57      0.80      0.67         5
           1       0.93      0.81      0.87        16

    accuracy                           0.81        21
   macro avg       0.75      0.81      0.77        21
weighted avg       0.84      0.81      0.82        21



In [8]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.2


In [9]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [10]:








y_pred_xgb = xgb_model.predict(X_test_scaled)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.8095238095238095
[[ 5  0]
 [ 4 12]]
              precision    recall  f1-score   support

           0       0.56      1.00      0.71         5
           1       1.00      0.75      0.86        16

    accuracy                           0.81        21
   macro avg       0.78      0.88      0.79        21
weighted avg       0.89      0.81      0.82        21



In [11]:
import joblib
import os

os.makedirs('C:/Users/rohit/depression-risk-predictor/models', exist_ok=True)
joblib.dump(best_rf, 'C:/Users/rohit/depression-risk-predictor/models/best_random_forest.pkl')
joblib.dump(xgb_model, 'C:/Users/rohit/depression-risk-predictor/models/xgboost_model.pkl')
joblib.dump(scaler, 'C:/Users/rohit/depression-risk-predictor/models/scaler_with_sentiment.pkl')


['C:/Users/rohit/depression-risk-predictor/models/scaler_with_sentiment.pkl']