In [41]:
import pandas as pd
import numpy as np

In [42]:
raw_data = pd.read_csv("../data/raw/mental-heath-in-tech-2016_20161114.csv", encoding="utf-8")
feature_names = pd.read_csv("../data/processed/features_list.csv", encoding="utf-8")
feature_names.head(2)

Unnamed: 0,questions,variables,variables2,variables3
0,Are you self-employed?,self_employed,Are you self-employed?,Self employed
1,How many employees does your company or organi...,num_employees,How many employees?,Number of employees


In [43]:
old_names = list(feature_names["questions"])

In [44]:
raw_data[old_names]

KeyError: "['Do you have medical coverage (private insurance or state-provided) which includes treatment of ¬†mental health issues?'] not in index"

In [45]:
for idx, col in enumerate(raw_data.columns):
    if col.startswith("Do you have medical coverage (private insurance or state-provided) which includes treatment of"):
        print(idx)

16


In [46]:
for idx, col in enumerate(old_names):
    if col.startswith("Do you have medical coverage (private insurance or state-provided) which includes treatment of"):
        print(idx)

27


In [47]:
old_names[27]

'Do you have medical coverage (private insurance or state-provided) which includes treatment of ¬†mental health issues?'

In [48]:
raw_data.columns[16]

'Do you have medical coverage (private insurance or state-provided) which includes treatment of \xa0mental health issues?'

In [49]:
old_names[27] = raw_data.columns[16]

In [50]:
old_names[27]

'Do you have medical coverage (private insurance or state-provided) which includes treatment of \xa0mental health issues?'

In [51]:
raw_data_col = raw_data[old_names]

In [52]:
feature_map = {old_names[i]:feature_names.iloc[i, 1] for i in range(len(old_names))}

In [53]:
feature_map2 = {old_names[i]:feature_names.iloc[i, 2] for i in range(len(old_names))}

In [54]:
feature_map

{'Are you self-employed?': 'self_employed',
 'How many employees does your company or organization have?': 'num_employees',
 'Is your employer primarily a tech company/organization?': 'tech_org',
 'Does your employer provide mental health benefits as part of healthcare coverage?': 'mental_health_benefits_healthcare',
 'Does your employer offer resources to learn more about mental health concerns and options for seeking help?': 'mental_health_resources',
 'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:': 'mental_health_leave',
 'Do you think that discussing a mental health disorder with your employer would have negative consequences?': 'mental_disorder_discuss',
 'Do you think that discussing a physical health issue with your employer would have negative consequences?': 'health_disorder_discuss',
 'Would you feel comfortable discussing a mental health disorder with your coworkers?': 'discuss_coworker',
 'Would you feel comfort

In [55]:
feature_map2

{'Are you self-employed?': 'Are you self-employed?',
 'How many employees does your company or organization have?': 'How many employees?',
 'Is your employer primarily a tech company/organization?': 'Is employer primarily a tech company?',
 'Does your employer provide mental health benefits as part of healthcare coverage?': 'Are mental health benefits provided as part of healthcare?',
 'Does your employer offer resources to learn more about mental health concerns and options for seeking help?': 'Does your employer provide mental health resources?',
 'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:': 'How difficult would it be to ask for mental health leave?',
 'Do you think that discussing a mental health disorder with your employer would have negative consequences?': 'Discussing mental health issues have negative consequences?',
 'Do you think that discussing a physical health issue with your employer would have negative cons

In [56]:
clean_data = raw_data_col.rename(columns=feature_map)

In [57]:
clean_data_questions = raw_data_col.rename(columns=feature_map2)

In [58]:
clean_data.head(2)

Unnamed: 0,self_employed,num_employees,tech_org,mental_health_benefits_healthcare,mental_health_resources,mental_health_leave,mental_disorder_discuss,health_disorder_discuss,discuss_coworker,discuss_supervisor,...,country,is_remote,tech_role,mental_health_benefits_employer,formal_discuss,anonymity,mental_vs_physical,medical_coverage,career_effect,family_history
0,0,26-100,1.0,Not eligible for coverage / N/A,No,Very easy,No,No,Maybe,Yes,...,United Kingdom,Sometimes,,,No,I don't know,I don't know,,Maybe,No
1,0,6-25,1.0,No,Yes,Somewhat easy,No,No,Maybe,Yes,...,United States of America,Never,,Yes,Yes,Yes,Yes,,"No, I don't think it would",Yes


In [59]:
clean_data_questions.head(2)

Unnamed: 0,Are you self-employed?,How many employees?,Is employer primarily a tech company?,Are mental health benefits provided as part of healthcare?,Does your employer provide mental health resources?,How difficult would it be to ask for mental health leave?,Discussing mental health issues have negative consequences?,Discussing physical health issues have negative consequences?,Comfortable discussing mental health issues with co-workers?,Comfortable discussing mental health issues with your supervisor?,...,What country do you live in?,Do you work remotely?,Is your primary role within your company related to tech/IT?,Aware of mental health options care under employer-provided coverage?,Has your employer ever formally discussed mental health?,Is anonymity by employer protected for treatment of mental health issues?,Do employer treat mental health and physical health equally?,Do you have private/state medical coverage for mental health issues?,Does Identified as having mental health issue would hurt career?,Do you have a family history of mental illness?
0,0,26-100,1.0,Not eligible for coverage / N/A,No,Very easy,No,No,Maybe,Yes,...,United Kingdom,Sometimes,,,No,I don't know,I don't know,,Maybe,No
1,0,6-25,1.0,No,Yes,Somewhat easy,No,No,Maybe,Yes,...,United States of America,Never,,Yes,Yes,Yes,Yes,,"No, I don't think it would",Yes


In [60]:
clean_data.shape

(1433, 30)

In [61]:
clean_data_questions.shape

(1433, 30)

## Clean up `gender` column:
- all similiar to male as male
- all similiar to female as female
- all others as other
- only 3 'nulls', for simplicity to other

In [62]:
clean_data['gender'].unique()

array(['Male', 'male', 'Male ', 'Female', 'M', 'female', 'm',
       'I identify as female.', 'female ', 'Bigender', 'non-binary',
       'Female assigned at birth ', 'F', 'Woman', 'man', 'fm', 'f',
       'Cis female ', 'Transitioned, M2F', 'Genderfluid (born female)',
       'Other/Transfeminine', 'Female or Multi-Gender Femme', 'Female ',
       'woman', 'female/woman', 'Cis male', 'Male.', 'Androgynous',
       'male 9:1 female, roughly', nan, 'Male (cis)', 'Other',
       'nb masculine', 'Cisgender Female', 'Man', 'Sex is male',
       'none of your business', 'genderqueer', 'cis male', 'Human',
       'Genderfluid', 'Enby', 'Malr', 'genderqueer woman', 'mtf', 'Queer',
       'Agender', 'Dude', 'Fluid',
       "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
       'mail', 'M|', 'Male/genderqueer', 'fem', 'Nonbinary', 'male ',
       'human', 'Female (props for making th

In [63]:
clean_data.loc[clean_data['gender'].str.contains(r'(^\s*[Ff]emale)', na=False),'gender'] = "Female"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Ww]oman)', na=False),'gender'] = "Female"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Ff]$)', na=False),'gender'] = "Female"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]ale)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]ALE)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]$)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]an)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]ail)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^nan)', na=False),'gender'] = np.NaN
clean_data.loc[clean_data['gender'].str.contains('M\|', na=False),'gender'] = "Male"


#all remaining as `Other`
clean_data.loc[clean_data['gender'].str.contains(r'[^Male|Female]', na=False),'gender'] = "Other"
clean_data.loc[clean_data['gender'].isnull(),'gender'] = "Other"
#check output contains 3 possible values
clean_data['gender'].unique()

  return func(self, *args, **kwargs)


array(['Male', 'Female', 'Other'], dtype=object)

In [64]:

clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^\s*[Ff]emale)', na=False),'What is your gender?'] = "Female"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^[Ww]oman)', na=False),'What is your gender?'] = "Female"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^[Ff]$)', na=False),'What is your gender?'] = "Female"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^[Mm]ale)', na=False),'What is your gender?'] = "Male"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^[Mm]ALE)', na=False),'What is your gender?'] = "Male"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^[Mm]$)', na=False),'What is your gender?'] = "Male"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^[Mm]an)', na=False),'What is your gender?'] = "Male"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^[Mm]ail)', na=False),'What is your gender?'] = "Male"
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'(^nan)', na=False),'What is your gender?'] = np.NaN
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains('M\|', na=False),'What is your gender?'] = "Male"

#all remaining as `Other`
clean_data_questions.loc[clean_data_questions['What is your gender?'].str.contains(r'[^Male|Female]', na=False),'What is your gender?'] = "Other"
clean_data_questions.loc[clean_data_questions['What is your gender?'].isnull(),'What is your gender?'] = "Other"
#check output contains 3 possible values
clean_data_questions['What is your gender?'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

Re-word response category in `mental_health_benefits_healthcare` question to shorten:

In [65]:
clean_data.loc[clean_data['mental_health_benefits_healthcare'].str.contains('Not eligible for coverage / N/A', na=False),'mental_health_benefits_healthcare'] = "Not eligible for coverage"
clean_data_questions.loc[clean_data_questions['Are mental health benefits provided as part of healthcare?'].str.contains('Not eligible for coverage / N/A', na=False),
                         'Are mental health benefits provided as part of healthcare?'] = "Not eligible for coverage"

In [66]:
clean_data

Unnamed: 0,self_employed,num_employees,tech_org,mental_health_benefits_healthcare,mental_health_resources,mental_health_leave,mental_disorder_discuss,health_disorder_discuss,discuss_coworker,discuss_supervisor,...,country,is_remote,tech_role,mental_health_benefits_employer,formal_discuss,anonymity,mental_vs_physical,medical_coverage,career_effect,family_history
0,0,26-100,1.0,Not eligible for coverage,No,Very easy,No,No,Maybe,Yes,...,United Kingdom,Sometimes,,,No,I don't know,I don't know,,Maybe,No
1,0,6-25,1.0,No,Yes,Somewhat easy,No,No,Maybe,Yes,...,United States of America,Never,,Yes,Yes,Yes,Yes,,"No, I don't think it would",Yes
2,0,6-25,1.0,No,No,Neither easy nor difficult,Maybe,No,Maybe,Maybe,...,United Kingdom,Always,,,No,I don't know,I don't know,,Maybe,No
3,1,,,,,,,,,,...,United Kingdom,Sometimes,,,,,,1.0,"Yes, I think it would",No
4,0,6-25,0.0,Yes,No,Neither easy nor difficult,Yes,Maybe,Maybe,No,...,United States of America,Sometimes,1.0,Yes,No,No,No,,"Yes, I think it would",Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428,1,,,,,,,,,,...,United States of America,Sometimes,,,,,,1.0,Maybe,Yes
1429,1,,,,,,,,,,...,United States of America,Sometimes,,,,,,1.0,"No, it has not",Yes
1430,0,100-500,1.0,Yes,Yes,Somewhat difficult,Maybe,Maybe,Yes,Yes,...,United States of America,Sometimes,,Yes,Yes,I don't know,I don't know,,"Yes, it has",Yes
1431,0,100-500,0.0,I don't know,Yes,Somewhat difficult,Maybe,No,Maybe,Yes,...,United States of America,Sometimes,1.0,I am not sure,No,I don't know,No,,"No, I don't think it would",Yes


In [67]:
clean_data_reformat = clean_data.replace({1:"Yes", 0:"No"})
clean_data_reformat

Unnamed: 0,self_employed,num_employees,tech_org,mental_health_benefits_healthcare,mental_health_resources,mental_health_leave,mental_disorder_discuss,health_disorder_discuss,discuss_coworker,discuss_supervisor,...,country,is_remote,tech_role,mental_health_benefits_employer,formal_discuss,anonymity,mental_vs_physical,medical_coverage,career_effect,family_history
0,No,26-100,Yes,Not eligible for coverage,No,Very easy,No,No,Maybe,Yes,...,United Kingdom,Sometimes,,,No,I don't know,I don't know,,Maybe,No
1,No,6-25,Yes,No,Yes,Somewhat easy,No,No,Maybe,Yes,...,United States of America,Never,,Yes,Yes,Yes,Yes,,"No, I don't think it would",Yes
2,No,6-25,Yes,No,No,Neither easy nor difficult,Maybe,No,Maybe,Maybe,...,United Kingdom,Always,,,No,I don't know,I don't know,,Maybe,No
3,Yes,,,,,,,,,,...,United Kingdom,Sometimes,,,,,,Yes,"Yes, I think it would",No
4,No,6-25,No,Yes,No,Neither easy nor difficult,Yes,Maybe,Maybe,No,...,United States of America,Sometimes,Yes,Yes,No,No,No,,"Yes, I think it would",Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428,Yes,,,,,,,,,,...,United States of America,Sometimes,,,,,,Yes,Maybe,Yes
1429,Yes,,,,,,,,,,...,United States of America,Sometimes,,,,,,Yes,"No, it has not",Yes
1430,No,100-500,Yes,Yes,Yes,Somewhat difficult,Maybe,Maybe,Yes,Yes,...,United States of America,Sometimes,,Yes,Yes,I don't know,I don't know,,"Yes, it has",Yes
1431,No,100-500,No,I don't know,Yes,Somewhat difficult,Maybe,No,Maybe,Yes,...,United States of America,Sometimes,Yes,I am not sure,No,I don't know,No,,"No, I don't think it would",Yes


In [68]:
clean_data_reformat.to_csv("../data/processed/mental_health_clean_reformat.csv", index=0)

In [69]:
#Check:
clean_data['mental_health_benefits_healthcare'].unique()

array(['Not eligible for coverage', 'No', nan, 'Yes', "I don't know"],
      dtype=object)

In [70]:
clean_data.to_csv("../data/processed/mental_health_clean.csv", index=0)

In [71]:
# clean_data_questions.to_csv("../data/processed/mental_health_clean2.csv", index=0)

In [8]:
data_check2 = pd.read_csv("../data/processed/mental_health_clean2.csv")

In [9]:
data_check = pd.read_csv("../data/processed/mental_health_clean.csv")

In [10]:
data_check.shape

(1433, 30)

In [11]:
data_check2.shape

(1433, 30)

In [12]:
clean_data_questions

NameError: name 'clean_data_questions' is not defined