In [2]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Boosting Models
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [5]:
# ===== Solution 1: Use raw string (r prefix) =====
df = pd.read_csv('survey.csv')
df.head()


Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [None]:
df.drop(['Timestamp','comments'],axis=1,inplace=True)
df.drop_duplicates()


Age                            0
Gender                         0
Country                        0
state                        515
self_employed                 18
family_history                 0
treatment                      0
work_interfere               264
no_employees                   0
remote_work                    0
tech_company                   0
benefits                       0
care_options                   0
wellness_program               0
seek_help                      0
anonymity                      0
leave                          0
mental_health_consequence      0
phys_health_consequence        0
coworkers                      0
supervisor                     0
mental_health_interview        0
phys_health_interview          0
mental_vs_physical             0
obs_consequence                0
dtype: int64

In [7]:
df.isnull().sum()

Age                            0
Gender                         0
Country                        0
state                        515
self_employed                 18
family_history                 0
treatment                      0
work_interfere               264
no_employees                   0
remote_work                    0
tech_company                   0
benefits                       0
care_options                   0
wellness_program               0
seek_help                      0
anonymity                      0
leave                          0
mental_health_consequence      0
phys_health_consequence        0
coworkers                      0
supervisor                     0
mental_health_interview        0
phys_health_interview          0
mental_vs_physical             0
obs_consequence                0
dtype: int64

In [9]:
# Before
print(df[['self_employed', 'work_interfere']].isnull().sum())
# After fill (example)
df['self_employed'] = df['self_employed'].fillna('No')
df['work_interfere'] = df['work_interfere'].fillna('Do Not Know')

# After
print(df[['state', 'self_employed', 'work_interfere']].isnull().sum())

self_employed      18
work_interfere    264
dtype: int64
state             515
self_employed       0
work_interfere      0
dtype: int64


In [10]:
#print(df['Age'].value_counts())

#Age column need cleaning manny age are 1000 or in (-)
# Keep only rows where age is between 18 and 100
df = df[(df['Age'] >= 18) & (df['Age'] <= 100)]
# Check the cleaned Age column
df['Age'].value_counts()

Age
29    85
32    82
26    75
27    71
33    70
28    68
31    67
34    65
30    63
25    61
35    55
23    51
24    46
37    43
38    39
36    37
39    33
40    33
43    28
41    21
22    21
42    20
21    16
45    12
46    12
44    11
19     9
18     7
50     6
20     6
48     6
51     5
56     4
49     4
55     3
57     3
54     3
47     2
60     2
58     1
62     1
65     1
53     1
61     1
72     1
Name: count, dtype: int64

In [11]:
df['Gender'].value_counts()
# Step 1: Clean formatting
df['Gender'] = df['Gender'].str.lower().str.strip()

male_terms = [
    'male', 'm', 'man', 'cis male', 'cis man', 'make', 'malr', 'msle', 'mail',
    'guy (-ish) ^^', 'male (cis)', 'male-ish', 'maile', 'mal', 'ostensibly male, unsure what that really means'
]

female_terms = [
    'female', 'f', 'cis female', 'woman', 'femail', 'femake', 'female (cis)', 'cis-female/femme', 
    'female (trans)', 'trans-female', 'woman'
]

other_terms = [
    'trans woman', 'male leaning androgynous', 'neuter', 'queer', 'enby', 'agender', 'something kinda male?',
    'non-binary', 'queer/she/they', 'androgyne', 'fluid', 'genderqueer', 'nah'
]

def clean_gender(gender):
    if gender in male_terms:
        return 'Male'
    elif gender in female_terms:
        return 'Female'
    elif gender in other_terms:
        return 'Other'


df['Gender'] = df['Gender'].apply(clean_gender)

In [12]:
#after that we take top 4 country and mark all other country as "other"
top_4 = ['United States', 'United Kingdom', 'Canada', 'Germany']

df['Country'] = df['Country'].apply(lambda x: x if x in top_4 else 'Other')

print(df['Country'].value_counts())

Country
United States     746
Other             204
United Kingdom    184
Canada             72
Germany            45
Name: count, dtype: int64


In [13]:
# Drop 'state' due to high missing values
df.drop(columns=['state'], inplace=True)

In [14]:
#To check there is not any 'NaN' values in the dataset
for col in df.columns:
    print(f"\n{col}: {df[col].unique()}")


Age: [37 44 32 31 33 35 39 42 23 29 36 27 46 41 34 30 40 38 50 24 18 28 26 22
 19 25 45 21 43 56 60 54 55 48 20 57 58 47 62 51 65 49 53 61 72]

Gender: ['Female' 'Male' 'Other' None]

Country: ['United States' 'Canada' 'United Kingdom' 'Other' 'Germany']

self_employed: ['No' 'Yes']

family_history: ['No' 'Yes']

treatment: ['Yes' 'No']

work_interfere: ['Often' 'Rarely' 'Never' 'Sometimes' 'Do Not Know']

no_employees: ['6-25' 'More than 1000' '26-100' '100-500' '1-5' '500-1000']

remote_work: ['No' 'Yes']

tech_company: ['Yes' 'No']

benefits: ['Yes' "Don't know" 'No']

care_options: ['Not sure' 'No' 'Yes']

wellness_program: ['No' "Don't know" 'Yes']

seek_help: ['Yes' "Don't know" 'No']

anonymity: ['Yes' "Don't know" 'No']

leave: ['Somewhat easy' "Don't know" 'Somewhat difficult' 'Very difficult'
 'Very easy']

mental_health_consequence: ['No' 'Maybe' 'Yes']

phys_health_consequence: ['No' 'Yes' 'Maybe']

coworkers: ['Some of them' 'No' 'Yes']

supervisor: ['Yes' 'No' 'Some of t

In [15]:
# Identify categorical columns
cat_cols = df.select_dtypes(include='object').columns.tolist()

In [18]:
# Create a copy of the dataset
df_encoded = df.copy()
from sklearn.preprocessing import LabelEncoder



In [19]:
le = LabelEncoder()
for col in cat_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col])

In [20]:
# Preview encoded data
df_encoded.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,0,4,0,0,1,2,4,0,1,...,2,2,1,1,1,2,1,0,2,0
1,44,1,4,0,0,0,3,5,0,0,...,0,0,0,1,0,0,1,1,0,0
2,32,1,0,0,0,0,3,4,0,1,...,0,1,1,1,2,2,2,2,1,0
3,31,1,3,0,1,1,2,2,0,1,...,1,1,2,2,1,0,0,0,1,1
4,31,1,4,0,0,0,1,1,1,1,...,0,0,1,1,1,2,2,2,0,0


In [22]:
X = df_encoded.drop('treatment', axis=1)
y = df_encoded['treatment']
# Train a random forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
# Create feature importance DataFrame
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.featureimportances
}).sort_values(by='Importance', ascending=False)

AttributeError: 'RandomForestClassifier' object has no attribute 'featureimportances'