Project 1 - Mental Health in Tech

By: Matthew Idle, Chad Fletcher, Lori Vitaioli, Brady Ogega, Cindy Hansel, Heather Shoberg, Jennifer White

In [21]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from pprint import pprint
from pathlib import Path

# Locations of the raw survey exports, one CSV per survey year
survey_2014 = "Resources/2014-survey.csv"
survey_2016 = "Resources/2016-survey.csv"
survey_2017 = "Resources/2017-survey.csv"
survey_2018 = "Resources/2018-survey.csv"
survey_2019 = "Resources/2019-survey.csv"

# Load each survey into its own DataFrame; the targets on the left are in the
# same year order as the paths iterated on the right
s_2014_data, s_2016_data, s_2017_data, s_2018_data, s_2019_data = (
    pd.read_csv(csv_path)
    for csv_path in (survey_2014, survey_2016, survey_2017, survey_2018, survey_2019)
)


In [22]:
# Survey questions (exact 2016 column headers) that are kept for the analysis
desired_column_list = [
    'What is your age?',
    'What is your gender?',
    'What country do you work in?',
    'Do you work remotely?',
    'How many employees does your company or organization have?',
    'Have you had a mental health disorder in the past?',
    'Do you currently have a mental health disorder?',
    'Would you be willing to bring up a physical health issue with a potential employer in an interview?',
    'Would you bring up a mental health issue with a potential employer in an interview?',
    'Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?',
    'Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?',
    'Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?',
    'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?',
    'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:',
]

# create a new dataframe for further cleaning
# An explicit copy is taken because later cells modify this frame in place
# (renaming its columns, inserting a column); working on an independent copy
# keeps s_2016_data untouched and avoids copy-vs-view ambiguity.
narrowed_2016 = s_2016_data[desired_column_list].copy()
narrowed_2016.head()

Unnamed: 0,What is your age?,What is your gender?,What country do you work in?,Do you work remotely?,How many employees does your company or organization have?,Have you had a mental health disorder in the past?,Do you currently have a mental health disorder?,Would you be willing to bring up a physical health issue with a potential employer in an interview?,Would you bring up a mental health issue with a potential employer in an interview?,Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?,Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?,Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?,Would you have been willing to discuss a mental health issue with your direct supervisor(s)?,"If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:"
0,39,Male,United Kingdom,Sometimes,26-100,Yes,No,Maybe,Maybe,No,,Yes,Some of my previous employers,Very easy
1,29,male,United States of America,Never,6-25,Yes,Yes,Maybe,No,No,,Yes,Some of my previous employers,Somewhat easy
2,38,Male,United Kingdom,Always,6-25,Maybe,No,Yes,Yes,Maybe/Not sure,Yes,Maybe,I don't know,Neither easy nor difficult
3,43,male,United Kingdom,Sometimes,,Yes,Yes,Yes,Maybe,No,,,Some of my previous employers,
4,43,Female,United States of America,Sometimes,6-25,Yes,Yes,Maybe,No,"Yes, I experienced",Yes,No,Some of my previous employers,Neither easy nor difficult


In [23]:
# Replace the long survey questions with short snake_case column names.
# The assignment is positional, so the names below must stay in the same
# order (and be the same count) as the columns selected into narrowed_2016.
new_column_names = [
    # respondent and employer details
    'age', 'original_gender', 'country', 'remote', 'num_employees',
    # disorder questions
    'disorder_past', 'disorder_current',
    # interview questions
    'bring_up_phys_issue_in_interview', 'bring_up_mh_issue_in_interview',
    # workplace questions
    'obs_exper_bad_resp_cur_prev', 'obs_of_other_inhibit_discuss_curr',
    'comf_discuss_mh_disorder_cur_superv', 'willing_discuss_mh_issue_prev_superv',
    'request_med_leave',
]
narrowed_2016.columns = new_column_names
narrowed_2016.head()

Unnamed: 0,age,original_gender,country,remote,num_employees,disorder_past,disorder_current,bring_up_phys_issue_in_interview,bring_up_mh_issue_in_interview,obs_exper_bad_resp_cur_prev,obs_of_other_inhibit_discuss_curr,comf_discuss_mh_disorder_cur_superv,willing_discuss_mh_issue_prev_superv,request_med_leave
0,39,Male,United Kingdom,Sometimes,26-100,Yes,No,Maybe,Maybe,No,,Yes,Some of my previous employers,Very easy
1,29,male,United States of America,Never,6-25,Yes,Yes,Maybe,No,No,,Yes,Some of my previous employers,Somewhat easy
2,38,Male,United Kingdom,Always,6-25,Maybe,No,Yes,Yes,Maybe/Not sure,Yes,Maybe,I don't know,Neither easy nor difficult
3,43,male,United Kingdom,Sometimes,,Yes,Yes,Yes,Maybe,No,,,Some of my previous employers,
4,43,Female,United States of America,Sometimes,6-25,Yes,Yes,Maybe,No,"Yes, I experienced",Yes,No,Some of my previous employers,Neither easy nor difficult


In [24]:
########## clean gender column

# Gender was answered free-form, so gather every distinct value that was entered
raw_gender_answers = narrowed_2016["original_gender"]
gender_list = raw_gender_answers.unique()
gender_list

array(['Male', 'male', 'Male ', 'Female', 'M', 'female', 'm',
       'I identify as female.', 'female ', 'Bigender', 'non-binary',
       'Female assigned at birth ', 'F', 'Woman', 'man', 'fm', 'f',
       'Cis female ', 'Transitioned, M2F', 'Genderfluid (born female)',
       'Other/Transfeminine', 'Female or Multi-Gender Femme', 'Female ',
       'woman', 'female/woman', 'Cis male', 'Male.', 'Androgynous',
       'male 9:1 female, roughly', nan, 'Male (cis)', 'Other',
       'nb masculine', 'Cisgender Female', 'Man', 'Sex is male',
       'none of your business', 'genderqueer', 'cis male', 'Human',
       'Genderfluid', 'Enby', 'Malr', 'genderqueer woman', 'mtf', 'Queer',
       'Agender', 'Dude', 'Fluid',
       "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
       'mail', 'M|', 'Male/genderqueer', 'fem', 'Nonbinary', 'male ',
       'human', 'Female (props for making th

In [25]:
# create gender categories of male, female, other_varied, non_answer
# (note that this does not fill in blanks)
male_list = ['Male',
             'male',
             'Male ',
             'M',
             'm',
             'man',
             'Cis male',
             'Male.',
             'Male (cis)',
             'Man',
             'Sex is male',
             'cis male',
             'Malr',
             'Dude',
             "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
             'mail',
             'male ',
             'Cis Male',
             'cisdude',
             'cis man',
             'MALE']

female_list = ['Female',
               'female',
               'I identify as female.',
               'female ',
               'Female assigned at birth ',
               'F',
               'Woman',
               'f',
               'Cis female ',
               'Female ',
               'woman',
               'female/woman',
               'Cisgender Female',
               'fem',
               'Female (props for making this a freeform field, though)',
               ' Female',
               'Cis-woman',
               'female-bodied; no feelings about gender',
               'AFAB',
               'fm']

other_gender_list = ['Bigender',
                     'non-binary',
                     'Transitioned, M2F',
                     'Genderfluid (born female)',
                     'Other/Transfeminine',
                     'Female or Multi-Gender Femme',
                     'Androgynous',
                     'male 9:1 female, roughly',
                     'Other',
                     'nb masculine',
                     'genderqueer',
                     'Human',
                     'Genderfluid',
                     'Enby',
                     'genderqueer woman',
                     'mtf',
                     'Queer',
                     'Agender',
                     'Fluid',
                     'Male/genderqueer',
                     'Nonbinary',
                     'human',
                     'Unicorn',
                     'Male (trans, FtM)',
                     'Genderqueer',
                     'Genderflux demi-girl',
                     'Transgender woman']

# 'nan' is the placeholder for a missing answer; the data itself holds a real
# NaN (a float), which is normalised to this placeholder in the check below.
no_answer_list = ['nan', 'none of your business', 'M|']

# check that all values are accounted for with counts
category_lists = {
    "male": male_list,
    "female": female_list,
    "other gender": other_gender_list,
    "no answer": no_answer_list,
}
print(f"There are {len(gender_list)} unique original gender values")
for category_label, category_values in category_lists.items():
    print(f"There are {len(category_values)} unique {category_label} values")

# Comparing list lengths alone can pass by coincidence (e.g. one value listed
# twice while another is missing), so compare the actual values as well.
observed_values = {"nan" if pd.isna(value) else value for value in gender_list}
categorised_values = [value for values in category_lists.values() for value in values]
categorised_set = set(categorised_values)

uncategorised = observed_values - categorised_set   # in the data, in no list
not_in_data = categorised_set - observed_values     # in a list, never observed
listed_twice = {value for value in categorised_set if categorised_values.count(value) > 1}

if uncategorised or not_in_data or listed_twice or len(gender_list) != len(categorised_values):
    print('Counts do not add up - check gender lists')
    if uncategorised:
        print("Values missing from every list:", sorted(uncategorised))
    if not_in_data:
        print("Listed values that never occur in the data:", sorted(not_in_data))
    if listed_twice:
        print("Values listed more than once:", sorted(listed_twice))
else:
    print("Counts look good")


There are 71 unique original gender values
There are 21 unique male values
There are 20 unique female values
There are 27 unique other gender values
There are 3 unique no answer values
Counts look good


In [26]:
# create new column called 'gender' and assign values based on the gender categories

# Build a raw-answer -> category lookup. setdefault keeps the first category
# an answer is found in, i.e. the precedence male, then female, then other.
gender_lookup = {}
for category, raw_values in (("male", male_list),
                             ("female", female_list),
                             ("other", other_gender_list)):
    for raw_value in raw_values:
        gender_lookup.setdefault(raw_value, category)

# create a list for new gender values; anything not listed (including missing
# answers) becomes an empty string
new_gender_list = [gender_lookup.get(answer, "") for answer in narrowed_2016["original_gender"]]

# copy df and assign values to the gender column
# The column is inserted into the copy only, so narrowed_2016 is left
# unmodified and this cell can be re-run without DataFrame.insert raising
# "cannot insert gender, already exists".
clean_2016 = narrowed_2016.copy()
if 'gender' in clean_2016.columns:
    # left over from an earlier in-place insert in the same kernel session
    clean_2016 = clean_2016.drop(columns='gender')
clean_2016.insert(loc=2, column='gender', value=new_gender_list)

print(clean_2016['gender'].unique())
clean_2016.head()

['male' 'female' 'other' '']


Unnamed: 0,age,original_gender,gender,country,remote,num_employees,disorder_past,disorder_current,bring_up_phys_issue_in_interview,bring_up_mh_issue_in_interview,obs_exper_bad_resp_cur_prev,obs_of_other_inhibit_discuss_curr,comf_discuss_mh_disorder_cur_superv,willing_discuss_mh_issue_prev_superv,request_med_leave
0,39,Male,male,United Kingdom,Sometimes,26-100,Yes,No,Maybe,Maybe,No,,Yes,Some of my previous employers,Very easy
1,29,male,male,United States of America,Never,6-25,Yes,Yes,Maybe,No,No,,Yes,Some of my previous employers,Somewhat easy
2,38,Male,male,United Kingdom,Always,6-25,Maybe,No,Yes,Yes,Maybe/Not sure,Yes,Maybe,I don't know,Neither easy nor difficult
3,43,male,male,United Kingdom,Sometimes,,Yes,Yes,Yes,Maybe,No,,,Some of my previous employers,
4,43,Female,female,United States of America,Sometimes,6-25,Yes,Yes,Maybe,No,"Yes, I experienced",Yes,No,Some of my previous employers,Neither easy nor difficult


In [27]:
# Remove any ages below 18 and above 98
MIN_AGE = 18
MAX_AGE = 98
is_valid_age = (clean_2016["age"] >= MIN_AGE) & (clean_2016["age"] <= MAX_AGE)

# .copy() makes the filtered frame independent of the unfiltered one, so that
# adding columns to it later does not trigger SettingWithCopyWarning
clean_2016 = clean_2016.loc[is_valid_age, :].copy()
clean_2016["age"].value_counts()

age
30    94
31    82
29    79
35    74
28    74
32    72
33    69
34    69
26    64
27    63
37    59
39    55
38    54
36    50
25    44
24    42
40    36
22    32
44    31
43    30
42    29
45    27
41    24
23    24
46    22
21    15
47    14
49    13
55    12
48     9
50     9
51     7
54     7
52     7
20     6
56     5
19     4
63     4
57     4
53     3
61     2
59     2
62     1
58     1
66     1
65     1
74     1
70     1
Name: count, dtype: int64

In [28]:
# Create bins for ages
# pd.cut builds right-closed intervals from consecutive edges:
# (0, 24], (24, 34], (34, 44], (44, 54], (54, 64], (64, 74], (74, inf].
# The last edge is open-ended so that every age above 74 lands in "75+";
# with an upper edge of 75 only age 75 itself was binned and older
# respondents received NaN as their age group.
bins = [0, 24, 34, 44, 54, 64, 74, float("inf")]
age_groups = ["18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+"]

# Slice the data and place it into bins
age_groups_column = pd.cut(clean_2016["age"], bins, labels=age_groups)

In [29]:
# Attach the binned ages to the frame as the "age groups" column
clean_2016 = clean_2016.assign(**{"age groups": age_groups_column})
clean_2016.head()

Unnamed: 0,age,original_gender,gender,country,remote,num_employees,disorder_past,disorder_current,bring_up_phys_issue_in_interview,bring_up_mh_issue_in_interview,obs_exper_bad_resp_cur_prev,obs_of_other_inhibit_discuss_curr,comf_discuss_mh_disorder_cur_superv,willing_discuss_mh_issue_prev_superv,request_med_leave,age groups
0,39,Male,male,United Kingdom,Sometimes,26-100,Yes,No,Maybe,Maybe,No,,Yes,Some of my previous employers,Very easy,35-44
1,29,male,male,United States of America,Never,6-25,Yes,Yes,Maybe,No,No,,Yes,Some of my previous employers,Somewhat easy,25-34
2,38,Male,male,United Kingdom,Always,6-25,Maybe,No,Yes,Yes,Maybe/Not sure,Yes,Maybe,I don't know,Neither easy nor difficult,35-44
3,43,male,male,United Kingdom,Sometimes,,Yes,Yes,Yes,Maybe,No,,,Some of my previous employers,,35-44
4,43,Female,female,United States of America,Sometimes,6-25,Yes,Yes,Maybe,No,"Yes, I experienced",Yes,No,Some of my previous employers,Neither easy nor difficult,35-44


In [36]:
# Fill blank or nan cells with "No Response"
NO_RESPONSE = 'No Response'
columns_to_fill = [
    "num_employees",
    "gender",
    "obs_exper_bad_resp_cur_prev",
    "obs_of_other_inhibit_discuss_curr",
    "comf_discuss_mh_disorder_cur_superv",
    "willing_discuss_mh_issue_prev_superv",
    "request_med_leave",
]

# Uncategorised genders are stored as empty strings, which fillna does not
# treat as missing - replace those blanks explicitly so they are labelled too.
clean_2016 = clean_2016.assign(gender=clean_2016["gender"].replace("", NO_RESPONSE))

# Remaining gaps in the listed columns are real NaN values
clean_2016 = clean_2016.fillna({column: NO_RESPONSE for column in columns_to_fill})
clean_2016.count()

age                                     1428
original_gender                         1425
gender                                  1428
country                                 1428
remote                                  1428
num_employees                           1428
disorder_past                           1428
disorder_current                        1428
bring_up_phys_issue_in_interview        1428
bring_up_mh_issue_in_interview          1428
obs_exper_bad_resp_cur_prev             1428
obs_of_other_inhibit_discuss_curr       1428
comf_discuss_mh_disorder_cur_superv     1428
willing_discuss_mh_issue_prev_superv    1428
request_med_leave                       1428
age groups                              1428
dtype: int64

In [37]:
# Write the cleaned 2016 survey to a new CSV in the Resources folder
# (the DataFrame index is written as the first, unnamed column)
filepath = Path('Resources/2016-cleaned.csv')
clean_2016.to_csv(path_or_buf=filepath)