# Import Libraries

In [1]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import pickle

# Load Data

In [2]:
# Path to the raw CSV, relative to the directory the notebook is run from.
file_path = "../data/external/Mental Health Dataset.csv"

# Load the full dataset into memory and preview the first five rows.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,8/27/2014 11:29,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,8/27/2014 11:31,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,8/27/2014 11:32,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,8/27/2014 11:37,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,8/27/2014 11:43,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


# Data Exploration

In [3]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292364 entries, 0 to 292363
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Timestamp                292364 non-null  object
 1   Gender                   292364 non-null  object
 2   Country                  292364 non-null  object
 3   Occupation               292364 non-null  object
 4   self_employed            287162 non-null  object
 5   family_history           292364 non-null  object
 6   treatment                292364 non-null  object
 7   Days_Indoors             292364 non-null  object
 8   Growing_Stress           292364 non-null  object
 9   Changes_Habits           292364 non-null  object
 10  Mental_Health_History    292364 non-null  object
 11  Mood_Swings              292364 non-null  object
 12  Coping_Struggles         292364 non-null  object
 13  Work_Interest            292364 non-null  object
 14  Social_Weakness     

In [4]:
# Count missing values per column.
df.isna().sum()

Timestamp                     0
Gender                        0
Country                       0
Occupation                    0
self_employed              5202
family_history                0
treatment                     0
Days_Indoors                  0
Growing_Stress                0
Changes_Habits                0
Mental_Health_History         0
Mood_Swings                   0
Coping_Struggles              0
Work_Interest                 0
Social_Weakness               0
mental_health_interview       0
care_options                  0
dtype: int64

In [5]:
# Drop Timestamp and Country; neither is used as a model feature below.
# NOTE: not idempotent - re-running this cell on the already-reduced frame
# raises KeyError because the columns no longer exist. Reload df first.
df = df.drop(columns=['Timestamp', 'Country'])

In [6]:
# Map every remaining column name to the distinct values it takes
# (in order of first appearance, NaN included).
unique_values = {name: df[name].unique() for name in df.columns}

unique_values

{'Gender': array(['Female', 'Male'], dtype=object),
 'Occupation': array(['Corporate', 'Student', 'Business', 'Housewife', 'Others'],
       dtype=object),
 'self_employed': array([nan, 'No', 'Yes'], dtype=object),
 'family_history': array(['No', 'Yes'], dtype=object),
 'treatment': array(['Yes', 'No'], dtype=object),
 'Days_Indoors': array(['1-14 days', 'Go out Every day', 'More than 2 months',
        '15-30 days', '31-60 days'], dtype=object),
 'Growing_Stress': array(['Yes', 'No', 'Maybe'], dtype=object),
 'Changes_Habits': array(['No', 'Yes', 'Maybe'], dtype=object),
 'Mental_Health_History': array(['Yes', 'No', 'Maybe'], dtype=object),
 'Mood_Swings': array(['Medium', 'Low', 'High'], dtype=object),
 'Coping_Struggles': array(['No', 'Yes'], dtype=object),
 'Work_Interest': array(['No', 'Maybe', 'Yes'], dtype=object),
 'Social_Weakness': array(['Yes', 'No', 'Maybe'], dtype=object),
 'mental_health_interview': array(['No', 'Maybe', 'Yes'], dtype=object),
 'care_options': array(['Not

In [7]:
# Columns remaining after Timestamp and Country were dropped.
df.columns

Index(['Gender', 'Occupation', 'self_employed', 'family_history', 'treatment',
       'Days_Indoors', 'Growing_Stress', 'Changes_Habits',
       'Mental_Health_History', 'Mood_Swings', 'Coping_Struggles',
       'Work_Interest', 'Social_Weakness', 'mental_health_interview',
       'care_options'],
      dtype='object')

# Data Preprocessing

## Remove missing values

In [8]:
# Drop every row with a missing value in any column; per the null counts
# shown earlier this only removes rows where self_employed is NaN.
df = df.dropna()

In [9]:
# Confirm that no missing values remain.
df.isna().sum()

Gender                     0
Occupation                 0
self_employed              0
family_history             0
treatment                  0
Days_Indoors               0
Growing_Stress             0
Changes_Habits             0
Mental_Health_History      0
Mood_Swings                0
Coping_Struggles           0
Work_Interest              0
Social_Weakness            0
mental_health_interview    0
care_options               0
dtype: int64

## Remove duplicates

In [10]:
# Row count before de-duplication.
len(df)

287162

In [11]:
# Keep one copy of each distinct row, then report the new row count.
# NOTE(review): Timestamp and Country were dropped earlier, so rows that
# differed only in those columns are treated as duplicates here - confirm
# that collapsing them is intended.
df = df.drop_duplicates()
len(df)

31794

## Feature Encoding

In [12]:
# One-hot encoder for the categorical feature columns. This notebook-level
# instance is fitted in the next cell and is the object pickled to
# '../models/one_hot_encoder.pkl' in the "Save Model" section.
one_hot_encoder = OneHotEncoder()

In [13]:
# Features are every column except the target, Mental_Health_History.
X = df.drop(columns=["Mental_Health_History"])
y = df['Mental_Health_History']
# Learn the categories of each column and encode X in a single step.
encoded_features = one_hot_encoder.fit_transform(X)

In [14]:
# Densify the encoder output and label each column with the
# "<column>_<category>" names generated by the encoder.
features = pd.DataFrame(
    data=encoded_features.toarray(),
    columns=one_hot_encoder.get_feature_names_out(),
)

In [15]:
# Preview the one-hot encoded feature frame (pandas truncates the display).
features

Unnamed: 0,Gender_Female,Gender_Male,Occupation_Business,Occupation_Corporate,Occupation_Housewife,Occupation_Others,Occupation_Student,self_employed_No,self_employed_Yes,family_history_No,...,Work_Interest_Yes,Social_Weakness_Maybe,Social_Weakness_No,Social_Weakness_Yes,mental_health_interview_Maybe,mental_health_interview_No,mental_health_interview_Yes,care_options_No,care_options_Not sure,care_options_Yes
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31789,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
31790,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
31791,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
31792,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Encode target

In [16]:
# Collapse the three survey answers into a binary label:
# 'Yes' and 'Maybe' both become 1, 'No' becomes 0.
target_map = dict(Yes=1, No=0, Maybe=1)

In [17]:
def preprocessing(features, target, encoder=None):
    """One-hot encode the feature columns and binarise the target labels.

    Parameters
    ----------
    features : pandas.DataFrame
        Categorical feature columns; every column is one-hot encoded.
    target : pandas.Series
        Raw labels; each value must be a key of the notebook-level
        ``target_map``.
    encoder : OneHotEncoder, optional
        Encoder instance to fit on ``features``. When omitted, a fresh
        ``OneHotEncoder()`` is created (the previous behaviour). Pass an
        instance to keep hold of the fitted encoder, e.g. for pickling.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The dense one-hot feature frame and the integer-coded target. The
        feature frame keeps the row index of ``features`` so that it stays
        label-aligned with the returned target.

    Raises
    ------
    KeyError
        If ``target`` contains a label that is not a key of ``target_map``.
    """
    if encoder is None:
        encoder = OneHotEncoder()
    encoded = encoder.fit_transform(features)
    # The default encoder returns a sparse matrix; an encoder configured for
    # dense output already returns an array and has no .toarray().
    dense = encoded.toarray() if hasattr(encoded, "toarray") else encoded
    encoded_frame = pd.DataFrame(
        dense,
        columns=encoder.get_feature_names_out(),
        # Without this the frame gets a fresh 0..n-1 index while the target
        # keeps the gaps left by dropna()/drop_duplicates(), so any
        # label-based operation (concat, masks, joins) would misalign them.
        index=getattr(features, "index", None),
    )

    # Series.map silently yields NaN for unmapped labels, so check first and
    # keep failing loudly with KeyError as the original dict lookup did.
    unknown = target[~target.isin(list(target_map))].unique()
    if len(unknown) > 0:
        raise KeyError(f"labels missing from target_map: {list(unknown)}")
    return encoded_frame, target.map(target_map)

X = df.drop(columns=["Mental_Health_History"])
y = df['Mental_Health_History']

# Fit the notebook-level encoder here so that the object pickled in the
# "Save Model" section is exactly the one that produced the training columns.
features, target = preprocessing(X, y, encoder=one_hot_encoder)

# Model Training

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.2,
)

In [19]:
# Seed the forest as well as the split: without random_state the bootstrap
# samples and per-split feature subsets change on every run, so the metrics
# below and the pickled model would not be reproducible.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicted labels for the held-out rows, used in the evaluation section.
y_pred = rf_classifier.predict(X_test)


# Model Evaluation

In [20]:
# Per-class precision / recall / F1 on the held-out test split.
# classification_report returns a preformatted string, so print() is used
# to render its line breaks.
eval_result = classification_report(y_test, y_pred)
print(eval_result)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2219
           1       0.99      0.99      0.99      4140

    accuracy                           0.98      6359
   macro avg       0.98      0.98      0.98      6359
weighted avg       0.98      0.98      0.98      6359



# Save Model

In [21]:
# Persist both inference artifacts as .pkl files: the trained classifier
# first, then the fitted one-hot encoder needed to encode new inputs.
for artifact_path, artifact in (
    ('../models/classifier_model.pkl', rf_classifier),
    ('../models/one_hot_encoder.pkl', one_hot_encoder),
):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

In [22]:
# Reload the classifier written above (presumably a round-trip check).
# pickle.load can execute arbitrary code, so only load files you trust.
with open('../models/classifier_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)