In [253]:
import pandas as pd
import numpy as np

In [254]:
raw_data = pd.read_csv("../data/raw/mental-heath-in-tech-2016_20161114.csv", encoding="utf-8")
feature_names = pd.read_csv("../data/processed/features_list.csv", encoding="utf-8")
feature_names.head(2)

Unnamed: 0,questions,variables
0,Are you self-employed?,self_employed
1,How many employees does your company or organi...,num_employees


In [255]:
old_names = list(feature_names["questions"])

In [256]:
raw_data[old_names]

KeyError: "['Do you have medical coverage (private insurance or state-provided) which includes treatment of ¬†mental health issues?'] not in index"

In [257]:
for idx, col in enumerate(raw_data.columns):
    if col.startswith("Do you have medical coverage (private insurance or state-provided) which includes treatment of"):
        print(idx)

16


In [258]:
for idx, col in enumerate(old_names):
    if col.startswith("Do you have medical coverage (private insurance or state-provided) which includes treatment of"):
        print(idx)

27


In [259]:
old_names[27]

'Do you have medical coverage (private insurance or state-provided) which includes treatment of ¬†mental health issues?'

In [260]:
raw_data.columns[16]

'Do you have medical coverage (private insurance or state-provided) which includes treatment of \xa0mental health issues?'

In [261]:
old_names[27] = raw_data.columns[16]

In [262]:
old_names[27]

'Do you have medical coverage (private insurance or state-provided) which includes treatment of \xa0mental health issues?'

In [263]:
raw_data_col = raw_data[old_names]

In [264]:
feature_map = {old_names[i]:feature_names.iloc[i, 1] for i in range(len(old_names))}

In [265]:
feature_map

{'Are you self-employed?': 'self_employed',
 'How many employees does your company or organization have?': 'num_employees',
 'Is your employer primarily a tech company/organization?': 'tech_org',
 'Does your employer provide mental health benefits as part of healthcare coverage?': 'mental_health_benefits_healthcare',
 'Does your employer offer resources to learn more about mental health concerns and options for seeking help?': 'mental_health_resources',
 'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:': 'mental_health_leave',
 'Do you think that discussing a mental health disorder with your employer would have negative consequences?': 'mental_disorder_discuss',
 'Do you think that discussing a physical health issue with your employer would have negative consequences?': 'health_disorder_discuss',
 'Would you feel comfortable discussing a mental health disorder with your coworkers?': 'discuss_coworker',
 'Would you feel comfort

In [266]:
clean_data = raw_data_col.rename(columns=feature_map)

In [267]:
clean_data.head(2)

Unnamed: 0,self_employed,num_employees,tech_org,mental_health_benefits_healthcare,mental_health_resources,mental_health_leave,mental_disorder_discuss,health_disorder_discuss,discuss_coworker,discuss_supervisor,...,country,is_remote,tech_role,mental_health_benefits_employer,formal_discuss,anonymity,mental_vs_physical,medical_coverage,career_effect,family_history
0,0,26-100,1.0,Not eligible for coverage / N/A,No,Very easy,No,No,Maybe,Yes,...,United Kingdom,Sometimes,,,No,I don't know,I don't know,,Maybe,No
1,0,6-25,1.0,No,Yes,Somewhat easy,No,No,Maybe,Yes,...,United States of America,Never,,Yes,Yes,Yes,Yes,,"No, I don't think it would",Yes


In [268]:
clean_data.shape

(1433, 30)

## Clean up `gender` column:
- all similiar to male as male
- all similiar to female as female
- all others as other

In [269]:
clean_data['gender'].unique()

array(['Male', 'male', 'Male ', 'Female', 'M', 'female', 'm',
       'I identify as female.', 'female ', 'Bigender', 'non-binary',
       'Female assigned at birth ', 'F', 'Woman', 'man', 'fm', 'f',
       'Cis female ', 'Transitioned, M2F', 'Genderfluid (born female)',
       'Other/Transfeminine', 'Female or Multi-Gender Femme', 'Female ',
       'woman', 'female/woman', 'Cis male', 'Male.', 'Androgynous',
       'male 9:1 female, roughly', nan, 'Male (cis)', 'Other',
       'nb masculine', 'Cisgender Female', 'Man', 'Sex is male',
       'none of your business', 'genderqueer', 'cis male', 'Human',
       'Genderfluid', 'Enby', 'Malr', 'genderqueer woman', 'mtf', 'Queer',
       'Agender', 'Dude', 'Fluid',
       "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
       'mail', 'M|', 'Male/genderqueer', 'fem', 'Nonbinary', 'male ',
       'human', 'Female (props for making th

In [270]:
clean_data.loc[clean_data['gender'].str.contains(r'(^\s*[Ff]emale)', na=False),'gender'] = "Female"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Ww]oman)', na=False),'gender'] = "Female"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Ff]$)', na=False),'gender'] = "Female"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]ale)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]ALE)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]$)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]an)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^[Mm]ail)', na=False),'gender'] = "Male"
clean_data.loc[clean_data['gender'].str.contains(r'(^nan)', na=False),'gender'] = np.NaN
clean_data.loc[clean_data['gender'].str.contains('M\|', na=False),'gender'] = "Male"

#all remaining as `Other`
clean_data.loc[clean_data['gender'].str.contains(r'[^Male|Female]', na=False),'gender'] = "Other"
#check output contains 3 possible values
clean_data['gender'].unique()

  return func(self, *args, **kwargs)


array(['Male', 'Female', 'Other', nan], dtype=object)

In [21]:
clean_data.to_csv("../data/processed/mental_health_clean.csv", index=0)

In [22]:
data_check = pd.read_csv("../data/processed/mental_health_clean.csv")

In [26]:
data_check.shape

(1433, 30)