In [None]:
# Initial imports
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from pandas import get_dummies
%matplotlib inline

In [2]:
# Read the student depression dataset from CSV and preview the first rows
dataset_path = "../Resources/student_depression_dataset.csv"
student_df = pd.read_csv(dataset_path)
student_df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [3]:
# Count rows per Depression label, using the "id" column as the counted field
dep_students = student_df.groupby("Depression")[["id"]].count()
dep_students

Unnamed: 0_level_0,id
Depression,Unnamed: 1_level_1
0,11565
1,16336


In [4]:
# How often each profession appears in the dataset
student_df["Profession"].value_counts()

Profession
Student                     27870
Architect                       8
Teacher                         6
'Digital Marketer'              3
'Content Writer'                2
Chef                            2
Doctor                          2
Pharmacist                      2
'Civil Engineer'                1
'UX/UI Designer'                1
'Educational Consultant'        1
Manager                         1
Lawyer                          1
Entrepreneur                    1
Name: count, dtype: int64

In [5]:
# Mean "Academic Pressure" for each profession
ac_pres_avg = student_df.groupby("Profession")[["Academic Pressure"]].mean()
ac_pres_avg

Unnamed: 0_level_0,Academic Pressure
Profession,Unnamed: 1_level_1
'Civil Engineer',5.0
'Content Writer',5.0
'Digital Marketer',3.666667
'Educational Consultant',3.0
'UX/UI Designer',3.0
Architect,3.75
Chef,2.5
Doctor,4.0
Entrepreneur,3.0
Lawyer,4.0


In [6]:
# Distinct values found in the City column.
# The displayed result includes entries that are not city names
# (e.g. 'M.Tech', '3.0', 'City'), so this column contains some dirty values.
student_df["City"].unique()

array(['Visakhapatnam', 'Bangalore', 'Srinagar', 'Varanasi', 'Jaipur',
       'Pune', 'Thane', 'Chennai', 'Nagpur', 'Nashik', 'Vadodara',
       'Kalyan', 'Rajkot', 'Ahmedabad', 'Kolkata', 'Mumbai', 'Lucknow',
       'Indore', 'Surat', 'Ludhiana', 'Bhopal', 'Meerut', 'Agra',
       'Ghaziabad', 'Hyderabad', 'Vasai-Virar', 'Kanpur', 'Patna',
       'Faridabad', 'Delhi', 'Saanvi', 'M.Tech', 'Bhavna', "'Less Delhi'",
       'City', '3.0', "'Less than 5 Kalyan'", 'Mira', 'Harsha', 'Vaanya',
       'Gaurav', 'Harsh', 'Reyansh', 'Kibara', 'Rashi', 'ME', 'M.Com',
       'Nalyan', 'Mihir', 'Nalini', 'Nandini', 'Khaziabad'], dtype=object)

In [7]:
# Feature frame: every column except the target ("Depression") and the "id" column.
# drop() returns a new DataFrame, so student_df itself is left unchanged.
X = student_df.drop(columns=["Depression", "id"])
X.head(20)


Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No
1,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes
2,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes
3,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes
4,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No
5,Male,29.0,Pune,Student,2.0,0.0,5.7,3.0,0.0,'Less than 5 hours',Healthy,PhD,No,4.0,1.0,No
6,Male,30.0,Thane,Student,3.0,0.0,9.54,4.0,0.0,'7-8 hours',Healthy,BSc,No,1.0,2.0,No
7,Female,30.0,Chennai,Student,2.0,0.0,8.04,4.0,0.0,'Less than 5 hours',Unhealthy,'Class 12',No,0.0,1.0,Yes
8,Male,28.0,Nagpur,Student,3.0,0.0,9.79,1.0,0.0,'7-8 hours',Moderate,B.Ed,Yes,12.0,3.0,No
9,Male,31.0,Nashik,Student,2.0,0.0,8.38,3.0,0.0,'Less than 5 hours',Moderate,LLB,Yes,2.0,5.0,No


In [8]:
# Target vector: the "Depression" label column
target_column = "Depression"
y = student_df[target_column]
y.head()

0    1
1    0
2    0
3    1
4    0
Name: Depression, dtype: int64

In [9]:
# One-hot encode the categorical columns; get_dummies produces True/False indicator columns.
# "Financial Stress" is treated as categorical here; the encoded output contains a
# "Financial Stress_?" column, i.e. the raw data holds a "?" placeholder value.
columns_to_dummy = ["Profession", "City", "Gender", "Sleep Duration", "Dietary Habits", "Degree", "Have you ever had suicidal thoughts ?", "Family History of Mental Illness", "Financial Stress"]

# Build the encoded frame from student_df instead of from X itself so that this cell
# is safe to re-run: the previous version overwrote X in place and raised a KeyError
# on a second execution because the categorical columns had already been dropped.
# A single get_dummies call keeps the non-encoded columns first (in their original
# order) and appends the indicator columns in the order given by columns_to_dummy,
# which is the same layout the earlier encode/concat/drop sequence produced.
X = pd.get_dummies(
    student_df.drop(columns=["Depression", "id"]),
    columns=columns_to_dummy,
)
X

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Profession_'Civil Engineer',Profession_'Content Writer',Profession_'Digital Marketer',...,Have you ever had suicidal thoughts ?_No,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_No,Family History of Mental Illness_Yes,Financial Stress_1.0,Financial Stress_2.0,Financial Stress_3.0,Financial Stress_4.0,Financial Stress_5.0,Financial Stress_?
0,33.0,5.0,0.0,8.97,2.0,0.0,3.0,False,False,False,...,False,True,True,False,True,False,False,False,False,False
1,24.0,2.0,0.0,5.90,5.0,0.0,3.0,False,False,False,...,True,False,False,True,False,True,False,False,False,False
2,31.0,3.0,0.0,7.03,5.0,0.0,9.0,False,False,False,...,True,False,False,True,True,False,False,False,False,False
3,28.0,3.0,0.0,5.59,2.0,0.0,4.0,False,False,False,...,False,True,False,True,False,False,False,False,True,False
4,25.0,4.0,0.0,8.13,3.0,0.0,1.0,False,False,False,...,False,True,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,27.0,5.0,0.0,5.75,5.0,0.0,7.0,False,False,False,...,False,True,False,True,True,False,False,False,False,False
27897,27.0,2.0,0.0,9.40,3.0,0.0,0.0,False,False,False,...,True,False,False,True,False,False,True,False,False,False
27898,31.0,3.0,0.0,6.61,4.0,0.0,12.0,False,False,False,...,True,False,True,False,False,True,False,False,False,False
27899,18.0,5.0,0.0,6.88,2.0,0.0,10.0,False,False,False,...,False,True,True,False,False,False,False,False,True,False


In [10]:
# Hold out a test set; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)


In [11]:
# Fit a StandardScaler on the training features only
scaler = StandardScaler()
scaler.fit(X_train)
# fit() returns the estimator itself, so X_scaler refers to the same fitted object
X_scaler = scaler

In [12]:
# Apply the fitted scaler to the training split, then to the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [13]:
# Random forest classifier with 500 trees and a fixed seed
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
)

In [14]:
# Train the forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [15]:
# Predict labels for the scaled test features
predictions = rf_model.predict(X=X_test_scaled)

In [16]:
# Confusion matrix as a labelled DataFrame (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
conf_matrix_df = pd.DataFrame(conf_matrix, index=actual_labels, columns=predicted_labels)

# Overall accuracy on the held-out test set
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [17]:
# Show the confusion matrix, the accuracy score and the per-class report
print("Confusion Matrix")
display(conf_matrix_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2247,615
Actual 1,453,3661


Accuracy Score : 0.8469036697247706
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      2862
           1       0.86      0.89      0.87      4114

    accuracy                           0.85      6976
   macro avg       0.84      0.84      0.84      6976
weighted avg       0.85      0.85      0.85      6976



In [18]:
# Per-feature importances taken from the fitted forest
importances = rf_model.feature_importances_

# Pair each importance with its column name and order from most to least important
ranked_importances = list(zip(importances, X.columns))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.13166102585448206, 'Academic Pressure'),
 (0.11968977369673613, 'Have you ever had suicidal thoughts ?_No'),
 (0.10891603451737061, 'Have you ever had suicidal thoughts ?_Yes'),
 (0.06429242118558383, 'Age'),
 (0.060375577363877005, 'CGPA'),
 (0.05701155378463896, 'Work/Study Hours'),
 (0.03668449226817565, 'Study Satisfaction'),
 (0.030220633612777534, 'Financial Stress_5.0'),
 (0.029148391675321222, 'Financial Stress_1.0'),
 (0.016775021233697906, 'Dietary Habits_Unhealthy'),
 (0.013186246564655446, 'Financial Stress_2.0'),
 (0.013141378814288656, 'Dietary Habits_Healthy'),
 (0.01087553293966067, 'Financial Stress_4.0'),
 (0.010563301569310332, 'Gender_Female'),
 (0.010540650401401192, 'Gender_Male'),
 (0.010306433239859355, "Degree_'Class 12'"),
 (0.01024994156168505, "Sleep Duration_'Less than 5 hours'"),
 (0.00967965479372522, "Sleep Duration_'7-8 hours'"),
 (0.009400272197941472, "Sleep Duration_'More than 8 hours'"),
 (0.009127039559951216, "Sleep Duration_'5-6 hours'"),
 (0

In [None]:
# Save the trained model to file for Flask API use.
# NOTE(review): joblib writes its own pickle-based format, not HDF5, so the ".h5"
# extension is misleading. The path is kept as-is because the API presumably loads
# this exact file; confirm before renaming.
joblib.dump(rf_model, "../api/model.h5")

# The forest was trained on StandardScaler-transformed, one-hot encoded features, so
# whatever serves predictions needs the same fitted scaler and the same column order
# to build valid inputs. Persist both next to the model.
joblib.dump(X_scaler, "../api/scaler.joblib")
joblib.dump(list(X.columns), "../api/feature_columns.joblib")