In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, classification_report
from sklearn.decomposition import PCA

In [2]:
# Load the training table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Discard the 'id', 'Name' and 'City' columns in a single call and rebind the
# result to `data`.
data = data.drop(columns=['id', 'Name', 'City'])

In [4]:
# Names of the columns that are label-encoded further down the notebook.
col = [
    "Gender",
    "Working Professional or Student",
    "Profession",
    "Sleep Duration",
    "Dietary Habits",
    "Degree",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
]

In [5]:
# Replace every column listed in `col` with integer codes.
# A fresh encoder is fitted per column and stored in `label_encoders`, keyed by
# column name, so the code -> category mapping of each column stays available
# (a single shared encoder only remembers the classes of the last column it
# was fitted on). After the loop `label_encoder` still refers to the encoder of
# the last column in `col`.
label_encoders = {}
for column in col:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

In [6]:
# Correlation of every column with the 'Depression' column; kept as the cell's
# last expression so the notebook displays the resulting Series.
data.corr().loc[:, "Depression"]

Gender                                   0.008144
Age                                     -0.564671
Working Professional or Student         -0.520790
Profession                               0.353820
Academic Pressure                        0.475037
Work Pressure                            0.216634
CGPA                                     0.021729
Study Satisfaction                      -0.168014
Job Satisfaction                        -0.168543
Sleep Duration                           0.014454
Dietary Habits                           0.143445
Degree                                  -0.045514
Have you ever had suicidal thoughts ?    0.349066
Work/Study Hours                         0.191746
Financial Stress                         0.227237
Family History of Mental Illness         0.016502
Depression                               1.000000
Name: Depression, dtype: float64

In [7]:
# Drop the five columns whose correlation with 'Depression' in the output above
# is below 0.05 in absolute value. `columns=` already names the axis, so no
# `axis` argument is needed, and the result is rebound to `data` (same pattern
# as the earlier id/Name/City drop) instead of mutating the frame in place.
data = data.drop(
    columns=[
        "CGPA",
        "Gender",
        "Sleep Duration",
        "Degree",
        "Family History of Mental Illness",
    ]
)

In [10]:
# Target: the 'Depression' column.
y = data['Depression']
# Features: every column of `data` except the target.
X = data.drop(columns=['Depression'])

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=88,
)

# Fit the gradient-boosted classifier (n_estimators=100) on the training part.
xgb = XGBClassifier(n_estimators=100, enable_categorical=True)
xgb.fit(X_train, y_train)

# Predicted labels for the held-out rows.
preds = xgb.predict(X_test)

# Hand-computed accuracy expressed as a percentage.
acc_xgb = np.float64((preds == y_test).sum()) / len(preds) * 100

# Evaluation on the held-out rows: matrix and report first, then the scalars.
conf_matrix = confusion_matrix(y_test, preds)
class_report = classification_report(y_test, preds)
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

# Print the scalar metrics one per line, followed by the report and the matrix.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)
print("\nClassification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9384979862591802
Precision: 0.8370410077727151
Recall: 0.8190401258851299
F1 Score: 0.8279427359490986

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96     34584
           1       0.84      0.82      0.83      7626

    accuracy                           0.94     42210
   macro avg       0.90      0.89      0.90     42210
weighted avg       0.94      0.94      0.94     42210

Confusion Matrix:
 [[33368  1216]
 [ 1380  6246]]
