In [109]:
import pickle
from pathlib import Path

import joblib
import matplotlib.gridspec as gs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [110]:
# Read the comma-separated input file into the working frame `df`.
df = pd.read_csv("CleanedColumn.csv", sep=",")
# Print the column list with dtypes and non-null counts.
df.info()

In [111]:
# Remove columns that are not used further in this notebook (mutates `df` in place).
df.drop(
    ['idart', 'idrt', 'province', 'work_status', 'education', 'gender', 'age', 'place_of_injury'],
    axis=1,
    inplace=True,
)
# Show which columns remain.
print(list(df.columns))

In [112]:
# Render weight_final as text, strip every comma, then parse the result as float.
df["weight_final"] = (
    df["weight_final"]
    .astype(str)
    .str.replace(",", "")
    .astype(float)
)

In [113]:
# Render weight_normal as text, strip every comma, then parse the result as float.
df["weight_normal"] = (
    df["weight_normal"]
    .astype(str)
    .str.replace(",", "")
    .astype(float)
)

In [114]:
# Drop the two weight columns together with PSU and STRATA (mutates `df` in place).
df.drop(['weight_normal', 'weight_final', 'PSU', 'STRATA'], axis="columns", inplace=True)

In [115]:
# Number of rows per target value; displayed as the cell output.
df.loc[:, 'emotional_mental_health_disorder'].value_counts()

In [116]:
# Split the frame by target value so class 0 can be undersampled.
df_0 = df[df['emotional_mental_health_disorder'] == 0]
df_1 = df[df['emotional_mental_health_disorder'] == 1]

# Draw 20628 rows from class 0.
# NOTE(review): the count is hard-coded -- presumably it equals len(df_1); confirm
# against the value_counts output above before changing the input data.
# A fixed random_state makes the undersample (and every model trained on it)
# reproducible across kernel restarts; without it each run trains on different rows.
df_sample = df_0.sample(n=20628, random_state=42)

print(df_sample)

In [117]:
# Stack the undersampled class-0 rows on top of all class-1 rows.
df_mental = pd.concat([df_sample, df_1])

# DataFrame.sample returns a new frame; the result has to be assigned back,
# otherwise the shuffle is discarded and df_mental stays ordered by class.
df_mental = df_mental.sample(frac=1, random_state=1).reset_index(drop=True)

# Keep the shuffled frame as the cell's displayed output.
df_mental

In [118]:
# List the distinct target values present in `df`.
print(pd.unique(df['emotional_mental_health_disorder']))

In [119]:
# Feature matrix = every column except the target; y = the target column.
X = df_mental.drop(columns=['emotional_mental_health_disorder']).to_numpy()
y = df_mental['emotional_mental_health_disorder'].to_numpy()

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise using statistics learned from the training rows only,
# then apply the same transform to the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep as many principal components as needed to explain 95% of the variance;
# the projection is fitted on the training rows and reused for the test rows.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [120]:
# Hyper-parameter candidates for the exhaustive XGBoost search.
xgb_param_grid = dict(
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200],
    subsample=[0.7, 0.9],
    colsample_bytree=[0.7, 0.9, 1.0],
    gamma=[0, 0.1, 0.3],
    min_child_weight=[1, 3, 5],
    reg_alpha=[0.1, 1.0],
    reg_lambda=[0.1, 1.0],
)

# 5-fold grid search on the PCA-projected training data, scored by accuracy, using all cores.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_gscv = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
xgb_gscv.fit(X_train_pca, y_train)

# Evaluate the refitted best estimator on the held-out split; accuracy is stored as a percentage.
xgb_best = xgb_gscv.best_estimator_
xgb_pred = xgb_best.predict(X_test_pca)
xgb_acc = 100 * accuracy_score(y_test, xgb_pred)

print("XGBoost Best Params:", xgb_gscv.best_params_)
print(f"XGBoost Accuracy: {xgb_acc:.4f}")
print(classification_report(y_test, xgb_pred))

In [121]:
# Hyper-parameter candidates for the exhaustive Random Forest search.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', 0.5],
)

# 5-fold grid search on the PCA-projected training data (the estimator's default score is used).
rf_model = RandomForestClassifier(random_state=42)
rf_gscv = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    cv=5,
    verbose=1,
)
rf_gscv.fit(X_train_pca, y_train)

# Evaluate the refitted best estimator on the held-out split; accuracy is stored as a percentage.
rf_best = rf_gscv.best_estimator_
rf_pred = rf_best.predict(X_test_pca)
rf_acc = 100 * accuracy_score(y_test, rf_pred)

print("RF Best Params:", rf_gscv.best_params_)
print(f"RF Accuracy: {rf_acc:.4f}")
print(classification_report(y_test, rf_pred))

In [122]:
# Hyper-parameter candidates for Logistic Regression, split into sub-grids so that
# each penalty is only combined with the parameters it actually uses:
#   * l1_ratio is only meaningful for penalty='elasticnet';
#   * C is ignored when penalty=None.
# A single flat grid would refit identical models many times (120 candidates
# instead of 52) and emit a warning for every ignored parameter.
logreg_param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],
        'solver': ['saga'],
        'class_weight': ['balanced', None],
    },
    {
        'penalty': ['elasticnet'],
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],
        'solver': ['saga'],
        'l1_ratio': [0.3, 0.5, 0.7],
        'class_weight': ['balanced', None],
    },
    {
        'penalty': [None],
        'solver': ['saga'],
        'class_weight': ['balanced', None],
    },
]

# saga is a stochastic solver; seed it so the search is reproducible
# (same seed as the Random Forest cell).
lr_model = LogisticRegression(max_iter=5000, random_state=42)
lr_gscv = GridSearchCV(lr_model, logreg_param_grid, cv=5, scoring='accuracy', verbose=1)
lr_gscv.fit(X_train_pca, y_train)

# Evaluate the refitted best estimator on the held-out split; accuracy is stored as a percentage.
lr_best = lr_gscv.best_estimator_
lr_pred = lr_best.predict(X_test_pca)
lr_acc = accuracy_score(y_test, lr_pred) * 100

print("Logistic Regression Best Params:", lr_gscv.best_params_)
print(f"Logistic Regression Accuracy: {lr_acc:.4f}")
print(classification_report(y_test, lr_pred))

In [None]:
# Hyper-parameter candidates for the exhaustive MLP search.
# NOTE(review): validation_fraction presumably has no effect unless early_stopping
# is enabled, which this cell does not do -- confirm against the scikit-learn docs.
mlp_param_grid = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50), (100, 50, 25)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001, 0.01],
    learning_rate_init=[0.001, 0.01, 0.1],
    batch_size=[32, 64, 128],
    solver=['adam', 'sgd'],
    validation_fraction=[0.1],
)

# 5-fold grid search on the PCA-projected training data, scored by accuracy.
mlp_model = MLPClassifier(max_iter=1000)
mlp_gscv = GridSearchCV(
    estimator=mlp_model,
    param_grid=mlp_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
mlp_gscv.fit(X_train_pca, y_train)

# Evaluate the refitted best estimator on the held-out split; accuracy is stored as a percentage.
mlp_best = mlp_gscv.best_estimator_
mlp_pred = mlp_best.predict(X_test_pca)
mlp_acc = 100 * accuracy_score(y_test, mlp_pred)

print("NN Best Params:", mlp_gscv.best_params_)
print(f"NN Accuracy: {mlp_acc:.4f}")
print(classification_report(y_test, mlp_pred))

In [None]:
# Hyper-parameter candidates for the SVM search.
# SVC comes from sklearn.svm (imported in the notebook's import cell); without
# that import this cell fails with NameError on a fresh kernel.
# gamma is a kernel coefficient that the linear kernel does not use, so it is
# only searched for the rbf and poly kernels -- this avoids fitting every
# linear candidate twice.
svm_param_grid = [
    {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear'],
        'class_weight': ['balanced', None],
    },
    {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'poly'],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced', None],
    },
]

# 5-fold grid search on the PCA-projected training data, scored by accuracy.
svm_model = SVC(probability=True)
svm_gscv = GridSearchCV(svm_model, svm_param_grid, cv=5, scoring='accuracy', verbose=1)
svm_gscv.fit(X_train_pca, y_train)

# Evaluate the refitted best estimator on the held-out split; accuracy is stored as a percentage.
svm_best = svm_gscv.best_estimator_
svm_pred = svm_best.predict(X_test_pca)
svm_acc = accuracy_score(y_test, svm_pred) * 100

print("SVM Best Params:", svm_gscv.best_params_)
print(f"SVM Accuracy: {svm_acc:.4f}")
print(classification_report(y_test, svm_pred))

In [None]:
# Hyper-parameter candidates for the k-NN search.
# KNeighborsClassifier comes from sklearn.neighbors (imported in the notebook's
# import cell); without that import this cell fails with NameError on a fresh kernel.
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],  # 1 for Manhattan distance, 2 for Euclidean distance
}

# 5-fold grid search on the PCA-projected training data, scored by accuracy.
knn_model = KNeighborsClassifier()
knn_gscv = GridSearchCV(knn_model, knn_param_grid, cv=5, scoring='accuracy', verbose=1)
knn_gscv.fit(X_train_pca, y_train)

# Evaluate the refitted best estimator on the held-out split; accuracy is stored as a percentage.
knn_best = knn_gscv.best_estimator_
knn_pred = knn_best.predict(X_test_pca)
knn_acc = accuracy_score(y_test, knn_pred) * 100

print("k-NN Best Params:", knn_gscv.best_params_)
print(f"k-NN Accuracy: {knn_acc:.4f}")
print(classification_report(y_test, knn_pred))

In [None]:
# Side-by-side summary of the held-out accuracy (in percent) of every tuned model.
# The original call was spelled `lrprint`, which is undefined and raised NameError.
print(
    f"Accuracy Scores:\nLogistic Regression: {lr_acc:.4f}\nXGBoost: {xgb_acc:.4f}"
    f"\nRandom Forest: {rf_acc:.4f}\nNeural Network: {mlp_acc:.4f}"
    f"\nSVM: {svm_acc:.4f}\nk-NN: {knn_acc:.4f}"
)

In [None]:
def pickleModel(model, filename):
    """Serialize ``model`` to ``filename`` using pickle.

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str or os.PathLike
        Destination file. Missing parent directories are created, and an
        existing file at this path is overwritten.

    Returns
    -------
    None
    """
    target = Path(filename)
    # A fresh checkout may not contain the output folder yet; create it rather
    # than failing with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)
    # The context manager flushes and closes the handle even if pickling raises;
    # the bare open(...) used before leaked the file handle.
    with target.open('wb') as fh:
        pickle.dump(model, fh)

# Persist every tuned estimator under models/, one pickle file per model.
for fitted_model, target_path in (
    (rf_best, 'models/RF.pkl'),
    (xgb_best, 'models/XGB.pkl'),
    (lr_best, 'models/LR.pkl'),
    (mlp_best, 'models/NN.pkl'),
    (svm_best, 'models/SVM.pkl'),
    (knn_best, 'models/KNN.pkl'),
):
    pickleModel(fitted_model, target_path)

In [None]:
# Training rows labelled with the original (pre-scaling, pre-PCA) column names.
X_train_df = pd.DataFrame(X_train, columns=df.drop(columns=["emotional_mental_health_disorder"]).columns)
features = np.array(X_train_df.columns)

# Read importances from the fitted estimator. `rf_model` is only the unfitted
# template handed to GridSearchCV, so it has no `feature_importances_`.
component_importance = rf_best.feature_importances_

# rf_best was trained on X_train_pca, so each importance value belongs to one
# principal component, not to one original column; indexing the column names
# with it directly would mislabel the bars. Approximate a per-column score by
# spreading each component's importance over the columns in proportion to the
# absolute PCA loadings. This is a heuristic attribution, not an exact one.
loadings = np.abs(pca.components_)  # one row per component, one column per original feature
feature_importance = component_importance @ loadings
total_importance = feature_importance.sum()
if total_importance > 0:
    # Normalise so the scores sum to 1, like ordinary forest importances.
    feature_importance = feature_importance / total_importance

# Column indices ordered from most to least important.
sorted_idx = np.argsort(feature_importance)[::-1]

plt.figure(figsize=(10,6))
plt.barh(features[sorted_idx][:10], feature_importance[sorted_idx][:10], color='teal')
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Top 10 Important Features for emotional mental health disorder")
# barh draws the first item at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.show()

