In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


In [2]:
df = pd.read_csv('survey.csv')

In [3]:
df.head(15)

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
5,2014-08-27 11:31:22,33,Male,United States,TN,,Yes,No,Sometimes,6-25,No,Yes,Yes,Not sure,No,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No,
6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,Sometimes,1-5,Yes,Yes,No,No,No,No,No,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No,
7,2014-08-27 11:32:05,39,M,Canada,,,No,No,Never,1-5,Yes,Yes,No,Yes,No,No,Yes,Don't know,No,No,No,No,No,No,No,No,
8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,Sometimes,100-500,No,Yes,Yes,Yes,No,No,No,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No,
9,2014-08-27 11:32:43,23,Male,Canada,,,No,No,Never,26-100,No,Yes,Don't know,No,Don't know,Don't know,Don't know,Don't know,No,No,Yes,Yes,Maybe,Maybe,Yes,No,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

In [5]:
df.drop(columns=['Timestamp', 'Country', 'state', 'comments'], inplace = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Age                        1259 non-null   int64 
 1   Gender                     1259 non-null   object
 2   self_employed              1241 non-null   object
 3   family_history             1259 non-null   object
 4   treatment                  1259 non-null   object
 5   work_interfere             995 non-null    object
 6   no_employees               1259 non-null   object
 7   remote_work                1259 non-null   object
 8   tech_company               1259 non-null   object
 9   benefits                   1259 non-null   object
 10  care_options               1259 non-null   object
 11  wellness_program           1259 non-null   object
 12  seek_help                  1259 non-null   object
 13  anonymity                  1259 non-null   object
 14  leave   

In [7]:
df['Gender'].replace(['Male ', 'male', 'M', 'm', 'Male', 'Cis Male',
                     'Man', 'cis male', 'Mail', 'Male-ish', 'Male (CIS)',
                      'Cis Man', 'msle', 'Malr', 'Mal', 'maile', 'Make',], 'Male', inplace = True)

df['Gender'].replace(['Female ', 'female', 'F', 'f', 'Woman', 'Female',
                     'femail', 'Cis Female', 'cis-female/femme', 'Femake', 'Female (cis)',
                     'woman',], 'Female', inplace = True)

df["Gender"].replace(['Female (trans)', 'queer/she/they', 'non-binary',
                     'fluid', 'queer', 'Androgyne', 'Trans-female', 'male leaning androgynous',
                      'Agender', 'A little about you', 'Nah', 'All',
                      'ostensibly male, unsure what that really means',
                      'Genderqueer', 'Enby', 'p', 'Neuter', 'something kinda male?',
                      'Guy (-ish) ^_^', 'Trans woman',], 'Others', inplace = True)

df['Gender'].value_counts()

Male      991
Female    247
Others     21
Name: Gender, dtype: int64

In [8]:
df['work_interfere'] = df['work_interfere'].fillna('Do not know' )
df['self_employed'] = df['self_employed'].fillna('No')

In [9]:
df.shape

(1259, 23)

In [10]:
df.isnull().sum()

Age                          0
Gender                       0
self_employed                0
family_history               0
treatment                    0
work_interfere               0
no_employees                 0
remote_work                  0
tech_company                 0
benefits                     0
care_options                 0
wellness_program             0
seek_help                    0
anonymity                    0
leave                        0
mental_health_consequence    0
phys_health_consequence      0
coworkers                    0
supervisor                   0
mental_health_interview      0
phys_health_interview        0
mental_vs_physical           0
obs_consequence              0
dtype: int64

In [11]:
df['benefits'].fillna(df['benefits'].mode()[0], inplace=True)
df['wellness_program'].fillna(df['wellness_program'].mode()[0], inplace=True)
df['leave'].fillna(df['leave'].mode()[0],inplace=True)

In [12]:
df.isnull().sum()


Age                          0
Gender                       0
self_employed                0
family_history               0
treatment                    0
work_interfere               0
no_employees                 0
remote_work                  0
tech_company                 0
benefits                     0
care_options                 0
wellness_program             0
seek_help                    0
anonymity                    0
leave                        0
mental_health_consequence    0
phys_health_consequence      0
coworkers                    0
supervisor                   0
mental_health_interview      0
phys_health_interview        0
mental_vs_physical           0
obs_consequence              0
dtype: int64

In [13]:
def encode(data):
    le=LabelEncoder()
    for i in data.columns:
        if data[i].dtype=='object':
            data[i]=le.fit_transform(data[i])
    return data

encode(df)

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,0,0,0,1,2,4,0,1,2,1,1,2,2,2,1,1,1,2,1,0,2,0
1,44,1,0,0,0,3,5,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0
2,32,1,0,0,0,3,4,0,1,1,0,1,1,0,1,1,1,2,2,2,2,1,0
3,31,1,0,1,1,2,2,0,1,1,2,1,1,1,1,2,2,1,0,0,0,1,1
4,31,1,0,0,0,1,1,1,1,2,0,0,0,0,0,1,1,1,2,2,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,26,1,0,0,1,0,2,0,1,1,0,1,1,0,2,1,1,1,1,1,1,0,0
1255,32,1,0,1,1,2,2,1,1,2,2,1,1,2,1,1,1,1,2,1,1,2,0
1256,34,1,0,1,1,4,5,0,1,2,2,1,1,0,1,2,2,0,0,1,1,1,0
1257,46,0,0,0,0,0,1,1,1,1,2,1,1,0,0,2,1,0,0,1,1,1,0


In [14]:
scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']])
df.head()

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,1.763e-08,0,0,0,1,2,4,0,1,2,1,1,2,2,2,1,1,1,2,1,0,2,0
1,1.77e-08,1,0,0,0,3,5,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0
2,1.758e-08,1,0,0,0,3,4,0,1,1,0,1,1,0,1,1,1,2,2,2,2,1,0
3,1.757e-08,1,0,1,1,2,2,0,1,1,2,1,1,1,1,2,2,1,0,0,0,1,1
4,1.757e-08,1,0,0,0,1,1,1,1,2,0,0,0,0,0,1,1,1,2,2,2,0,0


In [15]:
y = df['treatment']
x = df.drop(['treatment'], axis=1)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 125)


In [17]:
def acc_report(actual,predicted):
    acc_score=accuracy_score(actual,predicted)
    cm_matrix=confusion_matrix(actual,predicted)
    class_rep=classification_report(actual,predicted)
    print('the accuracy of tha model is ',acc_score)
    print(cm_matrix)
    print(class_rep)

In [18]:
dt= DecisionTreeClassifier()
dt.fit(x_train, y_train)
predttrain = dt.predict(x_train)
predttest = dt.predict(x_test)

In [19]:
acc_report(y_train, predttrain)

the accuracy of tha model is  1.0
[[500   0]
 [  0 507]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       507

    accuracy                           1.00      1007
   macro avg       1.00      1.00      1.00      1007
weighted avg       1.00      1.00      1.00      1007



In [20]:
acc_report(y_test, predttest)

the accuracy of tha model is  0.7301587301587301
[[87 35]
 [33 97]]
              precision    recall  f1-score   support

           0       0.72      0.71      0.72       122
           1       0.73      0.75      0.74       130

    accuracy                           0.73       252
   macro avg       0.73      0.73      0.73       252
weighted avg       0.73      0.73      0.73       252



In [21]:
rf = RandomForestClassifier(criterion='entropy', max_depth=12)
rf.fit(x_train, y_train)
pred_rf_train = rf.predict(x_train)
pred_rf_test = rf.predict(x_test)
acc_report(y_train, pred_rf_train)

the accuracy of tha model is  0.9950347567030785
[[498   2]
 [  3 504]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       500
           1       1.00      0.99      1.00       507

    accuracy                           1.00      1007
   macro avg       1.00      1.00      1.00      1007
weighted avg       1.00      1.00      1.00      1007



In [22]:
acc_report(y_test, pred_rf_test)

the accuracy of tha model is  0.8214285714285714
[[ 90  32]
 [ 13 117]]
              precision    recall  f1-score   support

           0       0.87      0.74      0.80       122
           1       0.79      0.90      0.84       130

    accuracy                           0.82       252
   macro avg       0.83      0.82      0.82       252
weighted avg       0.83      0.82      0.82       252



In [23]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
pred_lr_train = lr.predict(x_train)
pred_lr_test = lr.predict(x_test)
acc_report(y_train, pred_lr_train)

the accuracy of tha model is  0.8192651439920556
[[391 109]
 [ 73 434]]
              precision    recall  f1-score   support

           0       0.84      0.78      0.81       500
           1       0.80      0.86      0.83       507

    accuracy                           0.82      1007
   macro avg       0.82      0.82      0.82      1007
weighted avg       0.82      0.82      0.82      1007



In [24]:
acc_report(y_test, pred_lr_test)

the accuracy of tha model is  0.8174603174603174
[[ 96  26]
 [ 20 110]]
              precision    recall  f1-score   support

           0       0.83      0.79      0.81       122
           1       0.81      0.85      0.83       130

    accuracy                           0.82       252
   macro avg       0.82      0.82      0.82       252
weighted avg       0.82      0.82      0.82       252



In [25]:
adb = AdaBoostClassifier()
adb.fit(x_train, y_train)
pred_adb_train = adb.predict(x_train)
pred_adb_test = adb.predict(x_test)
acc_report(y_train, pred_adb_train)

the accuracy of tha model is  0.8381330685203575
[[384 116]
 [ 47 460]]
              precision    recall  f1-score   support

           0       0.89      0.77      0.82       500
           1       0.80      0.91      0.85       507

    accuracy                           0.84      1007
   macro avg       0.84      0.84      0.84      1007
weighted avg       0.84      0.84      0.84      1007



In [26]:
acc_report(y_test, pred_adb_test)

the accuracy of tha model is  0.8174603174603174
[[ 90  32]
 [ 14 116]]
              precision    recall  f1-score   support

           0       0.87      0.74      0.80       122
           1       0.78      0.89      0.83       130

    accuracy                           0.82       252
   macro avg       0.82      0.82      0.82       252
weighted avg       0.82      0.82      0.82       252

