# Data Cleaning 
### Mental Health Survey Data 2016-2021

Refrence : https://www.kaggle.com/code/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy

In [10]:
#installing packages for ETL processes
import pandas as pd #for manupulating dataframes for analysis
import matplotlib.pyplot as plt #for numerical analysis
import seaborn as sns #for visulaisation
import numpy as np #mathematical functions and managing large dataset
from subprocess import check_output
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
pd.options.mode.chained_assignment = None 

In [11]:
# import dataset
df = pd.read_csv("./combined_surveys_2016-2021/mental_health_data_2016-2021.csv")

In [12]:
# drop duplicate rows
arp_data = df.drop_duplicates()

In [13]:
#Viewing dataset
arp_data

Unnamed: 0.1,Unnamed: 0,age,sex,self-employed,working_country,living_country,mental_illness_family_history,have_mental_illness,mental_disorder_in_the_past,number_of_employees_your_company_has,mental_health_benefits_from_previous_employers,mental_illness_diagnosis,mental_health_benefits_from_current_employer,discussing_mental_health_with_coworkers,sought_mental_treatment,discussing_mental_health_with_supervisor,mental_illness_medical_coverage
0,0,39.0,Male,0,United Kingdom,United Kingdom,No,No,Yes,26-100,"No, none did",Yes,Not eligible for coverage / N/A,Maybe,0,Yes,
1,1,29.0,male,0,United States of America,United States of America,Yes,Yes,Yes,6-25,"Yes, they all did",Yes,No,Maybe,1,Yes,
2,2,38.0,Male,0,United Kingdom,United Kingdom,No,No,Maybe,6-25,"No, none did",No,No,Maybe,1,Maybe,
3,3,43.0,male,1,United Kingdom,United Kingdom,No,Yes,Yes,,Some did,Yes,,,1,,1.0
4,4,43.0,Female,0,United States of America,United States of America,Yes,Yes,Yes,6-25,I don't know,Yes,Yes,Maybe,1,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3264,126,33.0,Male,0,Germany,Germany,I don't know,Yes,Yes,26-100,"No, none did",Yes,No,No,1,Yes,
3265,127,49.0,Male,0,Portugal,Portugal,Yes,No,Possibly,100-500,"No, none did",,No,Maybe,0,Maybe,
3266,128,28.0,,1,Switzerland,Pakistan,No,No,Don't Know,,,,,,0,,0.0
3267,129,26.0,Male,1,India,India,No,Don't Know,Don't Know,,,,,,0,,0.0


In [14]:
#dropping unnamed column 
arp_data.drop('Unnamed: 0', 1)

  arp_data.drop('Unnamed: 0', 1)


Unnamed: 0,age,sex,self-employed,working_country,living_country,mental_illness_family_history,have_mental_illness,mental_disorder_in_the_past,number_of_employees_your_company_has,mental_health_benefits_from_previous_employers,mental_illness_diagnosis,mental_health_benefits_from_current_employer,discussing_mental_health_with_coworkers,sought_mental_treatment,discussing_mental_health_with_supervisor,mental_illness_medical_coverage
0,39.0,Male,0,United Kingdom,United Kingdom,No,No,Yes,26-100,"No, none did",Yes,Not eligible for coverage / N/A,Maybe,0,Yes,
1,29.0,male,0,United States of America,United States of America,Yes,Yes,Yes,6-25,"Yes, they all did",Yes,No,Maybe,1,Yes,
2,38.0,Male,0,United Kingdom,United Kingdom,No,No,Maybe,6-25,"No, none did",No,No,Maybe,1,Maybe,
3,43.0,male,1,United Kingdom,United Kingdom,No,Yes,Yes,,Some did,Yes,,,1,,1.0
4,43.0,Female,0,United States of America,United States of America,Yes,Yes,Yes,6-25,I don't know,Yes,Yes,Maybe,1,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3264,33.0,Male,0,Germany,Germany,I don't know,Yes,Yes,26-100,"No, none did",Yes,No,No,1,Yes,
3265,49.0,Male,0,Portugal,Portugal,Yes,No,Possibly,100-500,"No, none did",,No,Maybe,0,Maybe,
3266,28.0,,1,Switzerland,Pakistan,No,No,Don't Know,,,,,,0,,0.0
3267,26.0,Male,1,India,India,No,Don't Know,Don't Know,,,,,,0,,0.0


In [16]:
#Distribution of the data
print(arp_data.describe())
print(arp_data.shape)

# Assign default values for each data type
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

#Returns the first few rows from dataframe
arp_data.head()


#Check for missing data
total = arp_data.isnull().sum().sort_values(ascending=False)
percent = (arp_data.isnull().sum()/arp_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print(missing_data)

#overview of dataset
print(arp_data.info())

'''
#correlation matrix
corrmat =arp_data.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
plt.show()
'''
'''
#MH6 correlation matrix
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'MH6')['MH6'].index
cm = np.corrcoef(arp_data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
'''

        Unnamed: 0          age  self-employed  sought_mental_treatment
count  3269.000000  3267.000000    3269.000000              3269.000000
mean    454.131233    34.580961       0.167024                 0.587641
std     387.473648    10.441880       0.373054                 0.492334
min       0.000000     0.000000       0.000000                 0.000000
25%     137.000000    28.000000       0.000000                 0.000000
50%     330.000000    33.000000       0.000000                 1.000000
75%     685.000000    39.000000       0.000000                 1.000000
max    1432.000000   323.000000       1.000000                 1.000000
(3269, 17)
                                                Total   Percent
mental_illness_medical_coverage                  2723  0.832976
mental_illness_diagnosis                         1080  0.330376
number_of_employees_your_company_has              546  0.167024
discussing_mental_health_with_supervisor          546  0.167024
discussing_mental_hea

"\n#MH6 correlation matrix\nk = 10 #number of variables for heatmap\ncols = corrmat.nlargest(k, 'MH6')['MH6'].index\ncm = np.corrcoef(arp_data[cols].values.T)\nsns.set(font_scale=1.25)\nhm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)\nplt.show()\n"

In [17]:
#checking for missing data
arp_data.isna().sum() 

Unnamed: 0                                           0
age                                                  2
sex                                                 28
self-employed                                        0
working_country                                      2
living_country                                       2
mental_illness_family_history                        0
have_mental_illness                                  0
mental_disorder_in_the_past                         23
number_of_employees_your_company_has               546
mental_health_benefits_from_previous_employers     459
mental_illness_diagnosis                          1080
mental_health_benefits_from_current_employer       546
discussing_mental_health_with_coworkers            546
sought_mental_treatment                              0
discussing_mental_health_with_supervisor           546
mental_illness_medical_coverage                   2723
dtype: int64

In [38]:
arp_data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past
0,0,39.0,Male,No,Yes,0,No,Yes
1,1,29.0,male,Yes,Yes,1,Yes,Yes
2,2,38.0,Male,No,No,1,No,Maybe
3,3,43.0,male,Yes,Yes,1,No,Yes
4,4,43.0,Female,Yes,Yes,1,Yes,Yes
...,...,...,...,...,...,...,...,...
3264,126,33.0,Male,Yes,Yes,1,I don't know,Yes
3265,127,49.0,Male,No,,0,Yes,Possibly
3266,128,28.0,,No,,0,No,Don't Know
3267,129,26.0,Male,Don't Know,,0,No,Don't Know


In [39]:
# Assign default values for each data type
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

# Create lists by data type
intFeatures = ['age', 'sought_mental_treatment', 'self-employed', 'mental_illness_medical_coverage']
stringFeatures = ['sex', 'have_mental_illness', 'mental_illness_diagnosis', 'mental_disorder_in_the_past', 'working_country', 'living_country', 'mental_illness_family_history', 'number_of_employees_your_company_has', 'mental_health_benefits_from_previous_employers', 'mental_health_benefits_from_current_employer', 'discussing_mental_health_with_coworkers', 'discussing_mental_health_with_supervisor']
floatFeatures = ['age'] 

In [40]:
# Clean the NaN's
for feature in arp_data:
    if feature in intFeatures:
        arp_data[feature] = arp_data[feature].fillna(defaultInt)
    elif feature in stringFeatures:
        arp_data[feature] = arp_data[feature].fillna(defaultString)
    elif feature in floatFeatures:
        arp_data[feature] = arp_data[feature].fillna(defaultFloat)
    else:
        print('Error: %s not recognized.' % feature)
arp_data.head()

Error: Unnamed: 0 not recognized.


Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past
0,0,39.0,Male,No,Yes,0,No,Yes
1,1,29.0,male,Yes,Yes,1,Yes,Yes
2,2,38.0,Male,No,No,1,No,Maybe
3,3,43.0,male,Yes,Yes,1,No,Yes
4,4,43.0,Female,Yes,Yes,1,Yes,Yes


In [41]:
#normalizing 'gender' values
gender = arp_data['sex'].str.lower()
print(gender)

#Select unique values
gender = arp_data['sex'].unique()

#grouping
male_str = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "Cis Male", "cis male", "male.", "sex is male", "dude", 
"i'm a man why didn't you make this a drop down question. you should of asked sex? and I would of answered yes please. seriously how much text can this take? ", "m|", 
"male (hey this is the tech industry you're talking about)","ostensibly male", 'male, born with xy chromosoms','malel','let\'s keep it simple and say "male"', 'identify as male',
'masculine','i have a penis', 'masculino']
trans_str = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", 
"guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means", "bigender", "trans", "transitioned, m2f", "genderfluid (born female)", 
"other/transfeminine", "androgynous", "male 9:1 female, roughly", "other", "nb masculine", "cisgender female", "none of your business", "human", "genderfluid", "genderqueer woman", "mtf", 
"male/genderqueer", "nonbinary", "unicorn", "male (trans, ftm)", 'cis-woman', 'cisdude', 'genderflux demi-girl', 'female-bodied; no feelings about gender','afab', 'transgender woman', 
'cis-male', 'male/androgynous ','cis hetero male', 'uhhhhhhhhh fem genderqueer?', 'god king of the valajar', 'cis-male','male, cis', 'cis male ', 'agender/genderfluid', 'sometimes',
'female (cisgender)', 'female (cis) ', 'contextual', 'non binary', 'genderqueer demigirl', 'genderqueer/non-binary', 'cis-female', 'nonbinary', 'f, cisgender', 'female-ish', '\\-', 
'transfeminine', 'none','male (or female, or both)', 'trans man', 'transgender','non binary', 'female/gender non-binary.', 'cis woman','female (cisgender)', 'cis-female', 'cisgendered woman',
'genderfluid', 'demiguy', 'none', 'trans female', 'cisgender male','she/her/they/them', 'swm', 'cisgender female', 'nb','nonbinary/femme', 'gender non-conforming woman', 'cishet male', 
'female-identified', 'questioning','rr', 'cis woman', 'agender trans woman','femmina', '43','i am a wookie','trans non-binary/genderfluid', 'non-binary and gender fluid', 'mostly male', 
'cisgender male','afab non-binary', 'b', 'male/he/him', 'homem cis', 'cis-het male','non-binary/agender']   
female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail", "i identify as female.", "female assigned at birth ", "fm", 
"female or multi-gender femme", "female/woman", "fem", "female (props for making this a freeform field, though)", " female", 'femalw', 'my sex is female.', 'woman-identified', 
'i identify as female','shrug emoji (f)','femile', 'female, she/her']

for (row, col) in arp_data.iterrows():

    if str.lower(col.sex) in male_str:
        arp_data['sex'].replace(to_replace=col.sex, value='male', inplace=True)

    if str.lower(col.sex) in female_str:
        arp_data['sex'].replace(to_replace=col.sex, value='female', inplace=True)

    if str.lower(col.sex) in trans_str:
        arp_data['sex'].replace(to_replace=col.sex, value='trans', inplace=True)

#getting unique values
stk_list = ['A little about you', 'p']
arp_data = arp_data[~arp_data['sex'].isin(stk_list)]
arp_data['sex'].unique()

0         male
1         male
2        male 
3         male
4       female
         ...  
3264      male
3265      male
3266       nan
3267      male
3268      male
Name: Sex, Length: 3265, dtype: object


array(['male', 'female', 'I identify as female.', 'Bigender', 'trans',
       'Female assigned at birth ', 'fm', 'Cis female ',
       'Transitioned, M2F', 'Genderfluid (born female)',
       'Other/Transfeminine', 'Female or Multi-Gender Femme',
       'female/woman', 'Male.', 'Androgynous', 'male 9:1 female, roughly',
       'NaN', 'Other', 'nb masculine', 'Cisgender Female', 'Sex is male',
       'none of your business', 'Human', 'Genderfluid',
       'genderqueer woman', 'mtf', 'Dude',
       "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
       'M|', 'Male/genderqueer', 'fem', 'Nonbinary', 'human',
       'Female (props for making this a freeform field, though)',
       ' Female', 'Unicorn', 'Male (trans, FtM)', 'Cis-woman', 'cisdude',
       'Genderflux demi-girl', 'female-bodied; no feelings about gender',
       'AFAB', 'Transgender woman', 'cis-male', 'male/androgy

In [42]:
#filling missing values in Age column by taking median
arp_data['age'].fillna(arp_data['age'].median(), inplace = True)

# Fill with median() values < 18 and > 120
s = pd.Series(arp_data['age'])
s[s<18] = arp_data['age'].median()
arp_data['age'] = s
s = pd.Series(arp_data['age'])
s[s>120] = arp_data['age'].median()
arp_data['age'] = s

#Age groups
arp_data['age_range'] = pd.cut(arp_data['age'], [0,20,30,65,100], labels=["0-20", "21-30", "31-65", "66-100"], include_lowest=True)

In [43]:
#Encoding data
labelDict = {}
oneHotEncodedDict = {}
oneHotEncodeNeededColumns = ['sought_mental_treatment', 'mental_illness_medical_coverage', 'mental_illness_diagnosis', 'mental_health_benefits_from_previous_employers', 'mental_health_benefits_from_current_employer', 'discussing_mental_health_with_coworkers', 'discussing_mental_health_with_supervisor']

for feature in arp_data:
    le = preprocessing.LabelEncoder()
    le.fit(arp_data[feature])
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    arp_data[feature] = le.transform(arp_data[feature])
    # Get labels
    labelKey = 'label_' + feature
    labelValue = [*le_name_mapping]
    labelDict[labelKey] =labelValue
    if feature in oneHotEncodeNeededColumns:
        oneHotEncodedDict[feature] = pd.get_dummies(arp_data[feature]).values
for key, value in labelDict.items():     
    print(key, value)
print(oneHotEncodedDict)

#Get rid of 'Country'
#arp_data = arp_data.drop(['Country'], axis= 1)
arp_data

label_Unnamed: 0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 21

Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past,age_range
0,0,21,103,2,2,0,1,5,2
1,1,11,103,4,2,1,2,5,1
2,2,20,103,2,1,1,1,1,2
3,3,25,103,4,2,1,1,5,2
4,4,25,90,4,2,1,2,5,2
...,...,...,...,...,...,...,...,...,...
3264,126,15,103,4,2,1,0,5,2
3265,127,31,103,2,0,0,2,4,2
3266,128,10,54,2,0,0,1,0,1
3267,129,8,103,0,0,0,1,0,1


In [44]:
#checking for null values
total = arp_data.isnull().sum().sort_values(ascending=False)
percent = (arp_data.isnull().sum()/arp_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print(missing_data)

                               Total  Percent
Unnamed: 0                         0      0.0
Age                                0      0.0
Sex                                0      0.0
Have_mental_illness                0      0.0
Mental_illness_diagnosis           0      0.0
Sought_mental_treatment            0      0.0
Mental_illness_family_history      0      0.0
Mental_disorder_in_the_past        0      0.0
age_range                          0      0.0


In [45]:
arp_data.isna().sum()

Unnamed: 0                       0
Age                              0
Sex                              0
Have_mental_illness              0
Mental_illness_diagnosis         0
Sought_mental_treatment          0
Mental_illness_family_history    0
Mental_disorder_in_the_past      0
age_range                        0
dtype: int64

In [50]:
arp_data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past,age_range
0,0,21,103,2,2,0,1,5,2
1,1,11,103,4,2,1,2,5,1
2,2,20,103,2,1,1,1,1,2
3,3,25,103,4,2,1,1,5,2
4,4,25,90,4,2,1,2,5,2
...,...,...,...,...,...,...,...,...,...
3264,126,15,103,4,2,1,0,5,2
3265,127,31,103,2,0,0,2,4,2
3266,128,10,54,2,0,0,1,0,1
3267,129,8,103,0,0,0,1,0,1


In [47]:
arp_data['sought_mental_treatment'].value_counts()

1    1917
0    1348
Name: Sought_mental_treatment, dtype: int64

In [54]:
# Drop 'dont know' and 'NAN' values from each column
#arp_data = arp_data[arp_data['Mental_disorder_in_the_past'] != 0]
#arp_data = arp_data[arp_data['Mental_disorder_in_the_past'] != 2]
#arp_data = arp_data[arp_data['Mental_illness_diagnosis'] != 0]
#arp_data = arp_data[arp_data['Sought_mental_treatment'] != 0]
#arp_data = arp_data[arp_data['Mental_illness_family_history'] != 0]
arp_data = arp_data[arp_data['have_mental_illness'] != 0]
arp_data = arp_data[arp_data['have_mental_illness'] != 1]
arp_data = arp_data[arp_data['have_mental_illness'] != 3]

In [55]:
print(arp_data['mental_illness_family_history'].value_counts())
print(arp_data['sought_mental_treatment'].value_counts())
print(arp_data['have_mental_illness'].value_counts())
print(arp_data['mental_disorder_in_the_past'].value_counts())
print(arp_data['mental_illness_diagnosis'].value_counts())

2    1159
1     820
0     425
Name: Mental_illness_family_history, dtype: int64
1    1507
0     897
Name: Sought_mental_treatment, dtype: int64
4    1327
2    1077
Name: Have_mental_illness, dtype: int64
5    1283
3     819
4     153
1      91
0      35
2      23
Name: Mental_disorder_in_the_past, dtype: int64
2    1321
0     546
1     537
Name: Mental_illness_diagnosis, dtype: int64


In [56]:
# save cleaned data to csv
arp_data.to_csv("./cleaned_data/mental_health_data_2016-2021_ready_for_ML.csv")