#### Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

#### Loading Cleaned Dataset

In [1482]:
#Loading cleaned dataset
df=pd.read_csv("clean_data.csv")
df.head(5)

Unnamed: 0,Age,Gender,Country,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,female,United States,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,male,United States,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,male,Canada,No,No,Rarely,6-25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,male,United Kingdom,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,male,United States,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


#### Finding Unique Values

In [1483]:
df["family_history"].unique()

array(['No', 'Yes'], dtype=object)

In [1484]:
df["seek_help"].unique()

array(['Yes', "Don't know", 'No'], dtype=object)

In [1485]:
df["care_options"].unique()

array(['Not sure', 'No', 'Yes'], dtype=object)

In [1486]:
df["work_interfere"].unique()

array(['Often', 'Rarely', 'Never', 'Sometimes', 'Unknown'], dtype=object)

In [1487]:
df["remote_work"].unique()

array(['No', 'Yes'], dtype=object)

In [1488]:
df['leave'].unique()

array(['Somewhat easy', "Don't know", 'Somewhat difficult',
       'Very difficult', 'Very easy'], dtype=object)

#### Encoding Categorial Features

In [1489]:
features= ['family_history','work_interfere','mental_vs_physical','obs_consequence','benefits','care_options',
           'wellness_program','seek_help','leave','coworkers','supervisor','mental_health_consequence',
           'phys_health_consequence','no_employees','remote_work','tech_company']
target=['treatment']

data= df[features + target].dropna().copy()

X=data[features]
y=data[target]

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer([('category', ohe, features)])
X_encoded = preprocessor.fit_transform(X)


#### Preprocessing

In [None]:
#standardization
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X_encoded)

#splitting 
X_train, X_test, y_train ,y_test = train_test_split( X_scaled, y, test_size=0.3, random_state=42)
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

#### Logistic Regression

In [1491]:
LR=LogisticRegression(multi_class='multinomial',solver='lbfgs',random_state=42,max_iter=1000)
LR.fit(X_train,y_train)
y_pred=LR.predict(X_test)
y_proba = LR.predict_proba(X_test)[:,1]

accuracy= accuracy_score(y_test,y_pred)
confusion=confusion_matrix(y_test,y_pred)
f1=f1_score(y_test,y_pred, average='weighted')
classification=classification_report(y_test,y_pred)
roc = roc_auc_score(y_test, y_proba)

print("Logistic Regression")
print("Accuracy :",accuracy)    
print("ROC AUC  :",roc)
print("F1 Score :",f1)
print("Confusion Matrix :\n",confusion)
print("Classification Report :\n",classification)

Logistic Regression
Accuracy : 0.8293333333333334
ROC AUC  : 0.9015983301881397
F1 Score : 0.828990530846485
Confusion Matrix :
 [[138  36]
 [ 28 173]]
Classification Report :
               precision    recall  f1-score   support

          No       0.83      0.79      0.81       174
         Yes       0.83      0.86      0.84       201

    accuracy                           0.83       375
   macro avg       0.83      0.83      0.83       375
weighted avg       0.83      0.83      0.83       375



#### Random Forest Classifier

In [1492]:
rf=RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:,1]

accuracy= accuracy_score(y_test,y_pred)
confusion=confusion_matrix(y_test,y_pred)
f1=f1_score(y_test,y_pred, average='weighted')
classification=classification_report(y_test,y_pred)
roc = roc_auc_score(y_test, y_proba)

print("Random Forest Classifier")
print("Accuracy :",accuracy)    
print("ROC AUC  :",roc)
print("F1 Score :",f1)
print("Confusion Matrix :\n",confusion)
print("Classification Report :\n",classification)

Random Forest Classifier
Accuracy : 0.8026666666666666
ROC AUC  : 0.8890461485675073
F1 Score : 0.8026666666666666
Confusion Matrix :
 [[137  37]
 [ 37 164]]
Classification Report :
               precision    recall  f1-score   support

          No       0.79      0.79      0.79       174
         Yes       0.82      0.82      0.82       201

    accuracy                           0.80       375
   macro avg       0.80      0.80      0.80       375
weighted avg       0.80      0.80      0.80       375



## K Neighbors Classifier

In [1493]:
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
y_proba = knn.predict_proba(X_test)[:,1]

accuracy= accuracy_score(y_test,y_pred)
confusion=confusion_matrix(y_test,y_pred)
f1=f1_score(y_test,y_pred, average='weighted')
classification=classification_report(y_test,y_pred)
roc = roc_auc_score(y_test, y_proba)

print("K neighbors Classifier")
print("Accuracy :",accuracy)    
print("ROC AUC  :",roc)
print("F1 Score :",f1)
print("Confusion Matrix :\n",confusion)
print("Classification Report :\n",classification)

K neighbors Classifier
Accuracy : 0.7386666666666667
ROC AUC  : 0.803253845713959
F1 Score : 0.7389269292055544
Confusion Matrix :
 [[135  39]
 [ 59 142]]
Classification Report :
               precision    recall  f1-score   support

          No       0.70      0.78      0.73       174
         Yes       0.78      0.71      0.74       201

    accuracy                           0.74       375
   macro avg       0.74      0.74      0.74       375
weighted avg       0.74      0.74      0.74       375



## Decision Tree classifier

In [1494]:
dfc=DecisionTreeClassifier(random_state=42, max_depth=3)
dfc.fit(X_train,y_train)
y_pred=dfc.predict(X_test)
y_proba = dfc.predict_proba(X_test)[:,1]

accuracy= accuracy_score(y_test,y_pred)
confusion=confusion_matrix(y_test,y_pred)
f1=f1_score(y_test,y_pred, average='weighted')
classification=classification_report(y_test,y_pred)
roc = roc_auc_score(y_test, y_proba)

print("Decision Forest Classifier")
print("Accuracy :",accuracy)    
print("ROC AUC  :",roc)
print("F1 Score :",f1)
print("Confusion Matrix :\n",confusion)
print("Classification Report :\n",classification)

Decision Forest Classifier
Accuracy : 0.8453333333333334
ROC AUC  : 0.8666723852004347
F1 Score : 0.8425325513196481
Confusion Matrix :
 [[126  48]
 [ 10 191]]
Classification Report :
               precision    recall  f1-score   support

          No       0.93      0.72      0.81       174
         Yes       0.80      0.95      0.87       201

    accuracy                           0.85       375
   macro avg       0.86      0.84      0.84       375
weighted avg       0.86      0.85      0.84       375

