# Data Cleaning 
### Mental Health Survey Data 2016-2021

In [1]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from subprocess import check_output
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
pd.options.mode.chained_assignment = None 

In [3]:
# import dataset
df = pd.read_csv("mental_health_data_2016-2021_cleaned_yes_no.csv")

In [4]:
# drop duplicate rows
arp_data = df.drop_duplicates()

In [5]:
arp_data

Unnamed: 0.1,Unnamed: 0,What is your age?,What is your gender?,Do you *currently* have a mental health disorder?,Have you ever been *diagnosed* with a mental health disorder?,Have you ever sought treatment for a mental health disorder from a mental health professional?,Do you have a family history of mental illness?,Have you had a mental health disorder in the past?
0,0,39.0,Male,No,Yes,0,No,Yes
1,2,38.0,Male,No,No,1,No,Maybe
2,6,30.0,M,No,No,0,No,No
3,17,34.0,Male,No,No,0,No,No
4,20,26.0,Male,No,No,0,Yes,Maybe
...,...,...,...,...,...,...,...,...
2403,3254,45.0,Male,Yes,Yes,0,Yes,Yes
2404,3255,30.0,Female,Yes,Yes,1,Yes,Yes
2405,3260,35.0,Male,Yes,Yes,1,I don't know,Possibly
2406,3264,33.0,Male,Yes,Yes,1,I don't know,Yes


In [6]:
arp_data.drop('Unnamed: 0', 1)

  arp_data.drop('Unnamed: 0', 1)


Unnamed: 0,What is your age?,What is your gender?,Do you *currently* have a mental health disorder?,Have you ever been *diagnosed* with a mental health disorder?,Have you ever sought treatment for a mental health disorder from a mental health professional?,Do you have a family history of mental illness?,Have you had a mental health disorder in the past?
0,39.0,Male,No,Yes,0,No,Yes
1,38.0,Male,No,No,1,No,Maybe
2,30.0,M,No,No,0,No,No
3,34.0,Male,No,No,0,No,No
4,26.0,Male,No,No,0,Yes,Maybe
...,...,...,...,...,...,...,...
2403,45.0,Male,Yes,Yes,0,Yes,Yes
2404,30.0,Female,Yes,Yes,1,Yes,Yes
2405,35.0,Male,Yes,Yes,1,I don't know,Possibly
2406,33.0,Male,Yes,Yes,1,I don't know,Yes


In [7]:
#Distribution of the data
print(arp_data.describe())
print(arp_data.shape)

# Assign default values for each data type
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

arp_data.head()


#Check for missing data
total = arp_data.isnull().sum().sort_values(ascending=False)
percent = (arp_data.isnull().sum()/arp_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print(missing_data)

print(arp_data.info())

'''
#correlation matrix
corrmat =arp_data.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
plt.show()
'''
'''
#MH6 correlation matrix
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'MH6')['MH6'].index
cm = np.corrcoef(arp_data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
'''

        Unnamed: 0  What is your age?  \
count  2408.000000        2406.000000   
mean   1591.806063          34.808811   
std     941.018110          11.097265   
min       0.000000           0.000000   
25%     770.750000          28.000000   
50%    1565.000000          33.000000   
75%    2393.500000          39.000000   
max    3268.000000         323.000000   

       Have you ever sought treatment for a mental health disorder from a mental health professional?  
count                                        2408.000000                                               
mean                                            0.627492                                               
std                                             0.483573                                               
min                                             0.000000                                               
25%                                             0.000000                                               
50%   

"\n#MH6 correlation matrix\nk = 10 #number of variables for heatmap\ncols = corrmat.nlargest(k, 'MH6')['MH6'].index\ncm = np.corrcoef(arp_data[cols].values.T)\nsns.set(font_scale=1.25)\nhm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)\nplt.show()\n"

In [8]:
arp_data.isna().sum() 

Unnamed: 0                                                                                          0
What is your age?                                                                                   2
What is your gender?                                                                               22
Do you *currently* have a mental health disorder?                                                   0
Have you ever been *diagnosed* with a mental health disorder?                                     546
Have you ever sought treatment for a mental health disorder from a mental health professional?      0
Do you have a family history of mental illness?                                                     0
Have you had a mental health disorder in the past?                                                 23
dtype: int64

In [9]:
#Renaming columns
arp_data.rename(columns = {
    'What is your age?': 'Age',
    'What is your gender?': 'Sex', 
    'Do you *currently* have a mental health disorder?': 'Have_mental_illness',
    'Have you ever been *diagnosed* with a mental health disorder?': 'Mental_illness_diagnosis',
    'Have you ever sought treatment for a mental health disorder from a mental health professional?': 'Sought_mental_treatment',
    'Do you have a family history of mental illness?': 'Mental_illness_family_history',
    'Have you had a mental health disorder in the past?': 'Mental_disorder_in_the_past'
}, inplace = True)


In [10]:
arp_data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past
0,0,39.0,Male,No,Yes,0,No,Yes
1,2,38.0,Male,No,No,1,No,Maybe
2,6,30.0,M,No,No,0,No,No
3,17,34.0,Male,No,No,0,No,No
4,20,26.0,Male,No,No,0,Yes,Maybe
...,...,...,...,...,...,...,...,...
2403,3254,45.0,Male,Yes,Yes,0,Yes,Yes
2404,3255,30.0,Female,Yes,Yes,1,Yes,Yes
2405,3260,35.0,Male,Yes,Yes,1,I don't know,Possibly
2406,3264,33.0,Male,Yes,Yes,1,I don't know,Yes


In [11]:
# Assign default values for each data type
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

# Create lists by data type
intFeatures = ['Age', 'Sought_mental_treatment']
stringFeatures = ['Sex', 'Have_mental_illness', 'Mental_illness_diagnosis', 'Mental_illness_family_history', 'Mental_disorder_in_the_past']
floatFeatures = ['Age'] 

In [12]:
# Clean the NaN's
for feature in arp_data:
    if feature in intFeatures:
        arp_data[feature] = arp_data[feature].fillna(defaultInt)
    elif feature in stringFeatures:
        arp_data[feature] = arp_data[feature].fillna(defaultString)
    elif feature in floatFeatures:
        arp_data[feature] = arp_data[feature].fillna(defaultFloat)
    else:
        print('Error: %s not recognized.' % feature)
arp_data.head()

Error: Unnamed: 0 not recognized.


Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past
0,0,39.0,Male,No,Yes,0,No,Yes
1,2,38.0,Male,No,No,1,No,Maybe
2,6,30.0,M,No,No,0,No,No
3,17,34.0,Male,No,No,0,No,No
4,20,26.0,Male,No,No,0,Yes,Maybe


In [13]:
#normalizing 'gender' values
gender = arp_data['Sex'].str.lower()
print(gender)

#Select unique values
gender = arp_data['Sex'].unique()

#grouping
male_str = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "Cis Male", "cis male"]
trans_str = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means"]           
female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]

for (row, col) in arp_data.iterrows():

    if str.lower(col.Sex) in male_str:
        arp_data['Sex'].replace(to_replace=col.Sex, value='male', inplace=True)

    if str.lower(col.Sex) in female_str:
        arp_data['Sex'].replace(to_replace=col.Sex, value='female', inplace=True)

    if str.lower(col.Sex) in trans_str:
        arp_data['Sex'].replace(to_replace=col.Sex, value='trans', inplace=True)
  
#getting unique values
stk_list = ['A little about you', 'p']
arp_data = arp_data[~arp_data['Sex'].isin(stk_list)]
arp_data['Sex'].unique()

0         male
1        male 
2            m
3         male
4         male
         ...  
2403      male
2404    female
2405      male
2406      male
2407      male
Name: Sex, Length: 2408, dtype: object


array(['male', 'female', 'fm', 'Genderfluid (born female)',
       'female/woman', 'Male.', 'Sex is male', 'Dude', 'M|', 'fem',
       ' Female', 'NaN',
       "male (hey this is the tech industry you're talking about)",
       'God King of the Valajar', 'cis male ', 'Contextual', 'Malel',
       'Male (or female, or both)', 'transgender',
       'Female/gender non-binary.', 'none', 'Cisgender male', 'SWM',
       'Identify as male', 'Masculine', 'I have a penis', 'femmina', '43',
       'Trans non-binary/genderfluid', 'Femile', 'cisgender male',
       'I identify as female.', 'Bigender', 'trans',
       'Female assigned at birth ', 'Cis female ', 'Transitioned, M2F',
       'Female or Multi-Gender Femme', 'Other', 'nb masculine',
       'Cisgender Female', 'Human', 'Genderfluid', 'genderqueer woman',
       'mtf', 'Male/genderqueer', 'Nonbinary', 'Unicorn',
       'Male (trans, FtM)', 'Cis-woman', 'cisdude',
       'Genderflux demi-girl', 'AFAB', 'Transgender woman', 'cis-male',
    

In [14]:
#filling missing values in Age column by taking median
arp_data['Age'].fillna(arp_data['Age'].median(), inplace = True)

# Fill with median() values < 18 and > 120
s = pd.Series(arp_data['Age'])
s[s<18] = arp_data['Age'].median()
arp_data['Age'] = s
s = pd.Series(arp_data['Age'])
s[s>120] = arp_data['Age'].median()
arp_data['Age'] = s

#Age groups
arp_data['age_range'] = pd.cut(arp_data['Age'], [0,20,30,65,100], labels=["0-20", "21-30", "31-65", "66-100"], include_lowest=True)

In [15]:
#Encoding data
labelDict = {}
for feature in arp_data:
    le = preprocessing.LabelEncoder()
    le.fit(arp_data[feature])
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    arp_data[feature] = le.transform(arp_data[feature])
    # Get labels
    labelKey = 'label_' + feature
    labelValue = [*le_name_mapping]
    labelDict[labelKey] =labelValue
    
for key, value in labelDict.items():     
    print(key, value)

#Get rid of 'Country'
#arp_data = arp_data.drop(['Country'], axis= 1)
arp_data

label_Unnamed: 0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 29, 30, 31, 33, 34, 35, 36, 39, 40, 41, 42, 43, 44, 45, 46, 49, 50, 51, 52, 53, 55, 59, 62, 64, 65, 66, 67, 68, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 87, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 107, 108, 109, 111, 113, 114, 115, 116, 117, 118, 121, 122, 123, 124, 127, 128, 129, 132, 133, 134, 135, 137, 138, 139, 140, 141, 143, 144, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 171, 172, 174, 175, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 190, 192, 194, 195, 196, 197, 198, 199, 201, 203, 205, 206, 207, 208, 209, 210, 211, 212, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 237, 239, 241, 242, 243, 244, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 257, 260, 261, 262, 263, 265, 266, 267, 268, 269, 270, 271, 2

Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past,age_range
0,0,20,81,0,2,0,1,5,2
1,2,19,81,0,1,1,1,1,2
2,6,11,81,0,1,0,1,3,1
3,16,15,81,0,1,0,1,3,2
4,19,7,81,0,1,0,2,1,1
...,...,...,...,...,...,...,...,...,...
2403,2397,26,81,1,2,0,2,5,2
2404,2398,11,72,1,2,1,2,5,1
2405,2401,16,81,1,2,1,0,4,2
2406,2404,14,81,1,2,1,0,5,2


In [16]:
#checking for null values
total = arp_data.isnull().sum().sort_values(ascending=False)
percent = (arp_data.isnull().sum()/arp_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print(missing_data)

                               Total  Percent
Unnamed: 0                         0      0.0
Age                                0      0.0
Sex                                0      0.0
Have_mental_illness                0      0.0
Mental_illness_diagnosis           0      0.0
Sought_mental_treatment            0      0.0
Mental_illness_family_history      0      0.0
Mental_disorder_in_the_past        0      0.0
age_range                          0      0.0


In [17]:
arp_data.isna().sum()

Unnamed: 0                       0
Age                              0
Sex                              0
Have_mental_illness              0
Mental_illness_diagnosis         0
Sought_mental_treatment          0
Mental_illness_family_history    0
Mental_disorder_in_the_past      0
age_range                        0
dtype: int64

In [59]:
arp_data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Have_mental_illness,Mental_illness_diagnosis,Sought_mental_treatment,Mental_illness_family_history,Mental_disorder_in_the_past,age_range
0,0,20,81,0,2,0,1,5,2
1,2,19,81,0,1,1,1,1,2
2,6,11,81,0,1,0,1,3,1
3,16,15,81,0,1,0,1,3,2
4,19,7,81,0,1,0,2,1,1
...,...,...,...,...,...,...,...,...,...
2402,2391,17,47,1,2,1,2,5,2
2403,2397,26,81,1,2,0,2,5,2
2404,2398,11,72,1,2,1,2,5,1
2405,2401,16,81,1,2,1,0,4,2


In [60]:
arp_data['Sought_mental_treatment'].value_counts()

1    1342
0     475
Name: Sought_mental_treatment, dtype: int64

In [57]:
# Drop 'dont know' and 'NAN' values from each column
arp_data = arp_data[arp_data['Mental_disorder_in_the_past'] != 0]
arp_data = arp_data[arp_data['Mental_disorder_in_the_past'] != 2]
arp_data = arp_data[arp_data['Mental_illness_diagnosis'] != 0]
arp_data = arp_data[arp_data['Sought_mental_treatment'] != 0]



In [18]:
# save cleaned data to csv
arp_data.to_csv("mental_health_data_2016-2021_ready_for_ML.csv")