In [24]:
import numpy as np
import pandas as pd
df_2016= pd.read_csv("survey_2016.csv")

df_2016.columns


Index(['Are you self-employed?',
       'How many employees does your company or organization have?',
       'Is your employer primarily a tech company/organization?',
       'Is your primary role within your company related to tech/IT?',
       'Does your employer provide mental health benefits as part of healthcare coverage?',
       'Do you know the options for mental health care available under your employer-provided coverage?',
       'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
       'Does your employer offer resources to learn more about mental health concerns and options for seeking help?',
       'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?',
       'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:',
       'Do you think that dis

In [25]:

# Dictionary mapping old column names to new column names
column_mapping = {
    'Are you self-employed?': 'self_employed',
    'How many employees does your company or organization have?': 'no_employees',
    'Is your employer primarily a tech company/organization?': 'tech_company',
    'Is your primary role within your company related to tech/IT?': 'tech_role',
    'Does your employer provide mental health benefits as part of healthcare coverage?': 'benefits',
    'Do you know the options for mental health care available under your employer-provided coverage?': 'care_options',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?': 'wellness_program',
    'Does your employer offer resources to learn more about mental health concerns and options for seeking help?': 'seek_help',
    'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?': 'anonymity',
    'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:': 'medical_leave',
    'Do you think that discussing a mental health disorder with your employer would have negative consequences?': 'mental_health_consequence',
    'Do you think that discussing a physical health issue with your employer would have negative consequences?': 'phys_health_consequence',
    'Would you feel comfortable discussing a mental health disorder with your coworkers?': 'coworkers',
    'Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?': 'supervisor',
    'Do you feel that your employer takes mental health as seriously as physical health?': 'mental_vs_physical',
    'Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?': 'obs_consequence',
    'Do you have medical coverage (private insurance or state-provided) which includes treatment of  mental health issues?': 'mental_health_coverage',
    'Do you know local or online resources to seek help for a mental health disorder?': 'local_resources',
    'If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts?': 'reveal_to_clients',
    'If you have revealed a mental health issue to a client or business contact, do you believe this has impacted you negatively?': 'client_impact',
    'If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?': 'reveal_to_coworkers',
    'If you have revealed a mental health issue to a coworker or employee, do you believe this has impacted you negatively?': 'coworker_impact',
    'Do you believe your productivity is ever affected by a mental health issue?': 'productivity_impact',
    'If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?': 'work_time_impact',
    'Do you have previous employers?': 'previous_employers',
    'Have your previous employers provided mental health benefits?': 'previous_benefits',
    'Were you aware of the options for mental health care provided by your previous employers?': 'previous_care_options',
    'Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?': 'previous_wellness_program',
    'Did your previous employers provide resources to learn more about mental health issues and how to seek help?': 'previous_seek_help',
    'Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?': 'previous_anonymity',
    'Do you think that discussing a mental health disorder with previous employers would have negative consequences?': 'previous_mental_health_consequence',
    'Do you think that discussing a physical health issue with previous employers would have negative consequences?': 'previous_phys_health_consequence',
    'Would you have been willing to discuss a mental health issue with your previous co-workers?': 'previous_coworkers',
    'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?': 'previous_supervisor',
    'Did you feel that your previous employers took mental health as seriously as physical health?': 'previous_mental_vs_physical',
    'Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?': 'previous_obs_consequence',
    'Would you be willing to bring up a physical health issue with a potential employer in an interview?': 'phys_health_interview',
    'Why or why not?': 'phys_health_interview_reason',
    'Would you bring up a mental health issue with a potential employer in an interview?': 'mental_health_interview',
    'Why or why not?.1': 'mental_health_interview_reason',
    'Do you feel that being identified as a person with a mental health issue would hurt your career?': 'mental_health_career_impact',
    'Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?': 'coworker_perception',
    'How willing would you be to share with friends and family that you have a mental illness?': 'share_with_family',
    'Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?': 'observed_bad_response',
    'Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?': 'observed_impact',
    'Do you have a family history of mental illness?': 'family_history',
    'Have you had a mental health disorder in the past?': 'past_mental_health_disorder',
    'Do you currently have a mental health disorder?': 'current_mental_health_disorder',
    'If yes, what condition(s) have you been diagnosed with?': 'current_conditions_diagnosed',
    'If maybe, what condition(s) do you believe you have?': 'suspected_conditions',
    'Have you been diagnosed with a mental health condition by a medical professional?': 'diagnosed_conditions',
    'If so, what condition(s) were you diagnosed with?': 'diagnosed_conditions_detail',
    'Have you ever sought treatment for a mental health issue from a mental health professional?': 'sought_treatment',
    'If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?': 'work_interfere_treated',
    'If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?': 'work_interfere_untreated',
    'What is your age?': 'age',
    'What is your gender?': 'sex',
    'What country do you live in?': 'country',
    'What US state or territory do you live in?': 'state',
    'What country do you work in?': 'work_country',
    'What US state or territory do you work in?': 'work_state',
    'Which of the following best describes your work position?': 'work_position',
    'Do you work remotely?': 'remote_work'
}

# Renaming the columns
df_2016 = df_2016.rename(columns=column_mapping)

# Now you can check the new column names
print(df_2016.columns)


Index(['self_employed', 'no_employees', 'tech_company', 'tech_role',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'medical_leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_vs_physical', 'obs_consequence',
       'Do you have medical coverage (private insurance or state-provided) which includes treatment of  mental health issues?',
       'local_resources', 'reveal_to_clients', 'client_impact',
       'reveal_to_coworkers', 'coworker_impact', 'productivity_impact',
       'work_time_impact', 'previous_employers', 'previous_benefits',
       'previous_care_options', 'previous_wellness_program',
       'previous_seek_help', 'previous_anonymity',
       'previous_mental_health_consequence',
       'previous_phys_health_consequence', 'previous_coworkers',
       'previous_supervisor', 'previous_mental_vs_physical',
       'previous_obs_consequence', 'phys_health_interview',
       'phy

In [26]:
df_2016.isna().sum()

self_employed       0
no_employees      287
tech_company      287
tech_role        1170
benefits          287
                 ... 
state             593
work_country        0
work_state        582
work_position       0
remote_work         0
Length: 63, dtype: int64

In [27]:
# work on: Age, Gender, Country, benefits, wellness_program, past_mental_health_disorder', current_mental_health_disorder', 'current_conditions_diagnosed', no_employees
#questions: % of females and males with mh disorder (present + past), most frequent country, what age?
# How it relates to the company size? more cases in bigger companies?
# do company offer treatment and benefits? Less cases?

In [28]:
df_2016["country"].value_counts()

country
United States of America    840
United Kingdom              180
Canada                       78
Germany                      58
Netherlands                  48
Australia                    35
Sweden                       19
France                       16
Ireland                      15
Switzerland                  10
Brazil                       10
Russia                        9
India                         9
New Zealand                   9
Bulgaria                      7
Finland                       7
Denmark                       7
Belgium                       5
Italy                         5
Poland                        4
Spain                         4
Austria                       4
South Africa                  4
Romania                       4
Chile                         3
Czech Republic                3
Pakistan                      3
Norway                        3
Lithuania                     2
Japan                         2
Mexico                        2


In [29]:
df_2016.country.isna().sum()

#Columns is clean from Nan and can be used for analysis

np.int64(0)

In [30]:
import pycountry
print(pycountry.countries)

<pycountry.ExistingCountries object at 0x11ac8aab0>


In [31]:
import pycountry

# List of European countries
european_countries = [
    country.name for country in pycountry.countries
    if country.alpha_2 in {
        'AL', 'AD', 'AM', 'AT', 'AZ', 'BY', 'BE', 'BA', 'BG', 'HR', 'CY', 'CZ', 'DK', 'EE', 'FI', 'FR', 'GE', 'DE', 
        'GR', 'HU', 'IS', 'IE', 'IT', 'KZ', 'XK', 'LV', 'LI', 'LT', 'LU', 'MT', 'MD', 'MC', 'ME', 'NL', 'MK', 'NO', 
        'PL', 'PT', 'RO', 'RU', 'SM', 'RS', 'SK', 'SI', 'ES', 'SE', 'CH', 'UA', 'GB', 'VA'
    }
]

print(european_countries)

def grouped_countries(country):
    if country in european_countries:
        return "Europe"
    else:
        return country

df_2016["country"] = df_2016["country"].apply(grouped_countries)

df_2016["country"].value_counts()

#European countries grouped into Europe

['Albania', 'Andorra', 'Armenia', 'Austria', 'Azerbaijan', 'Belgium', 'Bulgaria', 'Bosnia and Herzegovina', 'Belarus', 'Switzerland', 'Cyprus', 'Czechia', 'Germany', 'Denmark', 'Spain', 'Estonia', 'Finland', 'France', 'United Kingdom', 'Georgia', 'Greece', 'Croatia', 'Hungary', 'Ireland', 'Iceland', 'Italy', 'Kazakhstan', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Latvia', 'Monaco', 'Moldova, Republic of', 'North Macedonia', 'Malta', 'Montenegro', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russian Federation', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Sweden', 'Ukraine', 'Holy See (Vatican City State)']


country
United States of America    840
Europe                      406
Canada                       78
Australia                    35
Brazil                       10
Russia                        9
India                         9
New Zealand                   9
South Africa                  4
Czech Republic                3
Chile                         3
Pakistan                      3
Colombia                      2
Mexico                        2
Other                         2
Israel                        2
Japan                         2
Afghanistan                   2
Vietnam                       1
Costa Rica                    1
Argentina                     1
Venezuela                     1
Brunei                        1
Algeria                       1
Bangladesh                    1
Iran                          1
Ecuador                       1
China                         1
Guatemala                     1
Taiwan                        1
Name: count, dtype: int64

In [32]:
df_2016.sex.isna().sum()
# columns has 3 Nan: drop them

np.int64(3)

In [33]:
df_2016.dropna(subset = ["sex"], inplace = True)
df_2016.sex.isna().sum()

#Nan removed from sex

np.int64(0)

In [34]:
df_2016["sex"].unique()

array(['Male', 'male', 'Male ', 'Female', 'M', 'female', 'm',
       'I identify as female.', 'female ', 'Bigender', 'non-binary',
       'Female assigned at birth ', 'F', 'Woman', 'man', 'fm', 'f',
       'Cis female ', 'Transitioned, M2F', 'Genderfluid (born female)',
       'Other/Transfeminine', 'Female or Multi-Gender Femme', 'Female ',
       'woman', 'female/woman', 'Cis male', 'Male.', 'Androgynous',
       'male 9:1 female, roughly', 'Male (cis)', 'Other', 'nb masculine',
       'Cisgender Female', 'Man', 'Sex is male', 'none of your business',
       'genderqueer', 'cis male', 'Human', 'Genderfluid', 'Enby', 'Malr',
       'genderqueer woman', 'mtf', 'Queer', 'Agender', 'Dude', 'Fluid',
       "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
       'mail', 'M|', 'Male/genderqueer', 'fem', 'Nonbinary', 'male ',
       'human', 'Female (props for making this a freefor

In [35]:
#Assigning F or M values to sex column:

stand_gender = {"MALE":"M", "Male":"M", "M|":"M", "Dude":"M", "m":"M", "male ":"M", "M":"M", "male":"M", "Make":"M", "cisdude":"M", "maile": "M", "Mal":"M", "Malr":"M", "Male.":"M", "Mail":"M", "cis man":"M", "cis male":"M", "Cis Man":"M", "msale":"M", "Androgyne":"M", "Male-ish":"M", "Man":"M", "Male (CIS)":"M", "Cis Male":"M", "Sex is male":"M",
                "F":"F", "female":"F", "Female":"F", "f":"F", "female/woman":"F", "fem":"F", "Femal":"F", "femail":"F", "I identify as female":"F", "Woman":"F", "Cisgender Female":"F","cis-female/femme":"F","Femake":"F", "woman":"F", "Cis-woman":"F", "Cis Female":"F", "Female assigned at birth": "F", "fm":"F", "fem":"F", "Female (props for making this a freeform field, though)":"F"}

df_2016["sex"] = df_2016["sex"].map(stand_gender).str.strip().str.upper()

df_2016["sex"].unique()
df_2016.sex.isna().sum()

#This generated 70 Nan: will drop them

np.int64(70)

In [36]:
df_2016.dropna(subset = ["sex"], inplace = True)
df_2016.sex.isna().sum()
#NAN removed

np.int64(0)

In [37]:
df_2016["sex"].value_counts()

sex
M    1038
F     322
Name: count, dtype: int64

In [38]:
df_2016.age.isna().sum()
#No NaN here

np.int64(0)

In [39]:
df_2016.age.unique()
#some values make no sense: 323, 3, 15: will drop them

array([ 39,  29,  43,  42,  30,  37,  44,  28,  34,  35,  52,  31,  26,
        25,  33,  38,  27,  36,  40,  41,  32,  45,  46,  19,  21,  24,
        17,  23,  22,  51,  48,  55,  50,  49,  20,  54,  47,  56,  57,
        63,  61, 323,  62,  53,  58,   3,  66,  59,  15,  65,  70])

In [40]:
df_2016 = df_2016[(df_2016['age'] >= 17) & (df_2016['age'] <= 70)]
df_2016.age.unique()

#Column is clean

array([39, 29, 43, 42, 30, 37, 44, 28, 34, 35, 52, 31, 26, 25, 33, 38, 27,
       36, 40, 41, 32, 45, 46, 19, 21, 24, 17, 23, 22, 51, 48, 55, 50, 49,
       20, 54, 47, 56, 57, 63, 61, 62, 53, 58, 66, 59, 65, 70])

In [41]:
df_2016["no_employees"].unique()

array(['26-100', '6-25', nan, 'More than 1000', '100-500', '500-1000',
       '1-5'], dtype=object)

In [42]:
df_2016['no_employees'].value_counts()

no_employees
26-100            279
More than 1000    243
100-500           235
6-25              199
500-1000           76
1-5                57
Name: count, dtype: int64

In [43]:
df_2016["no_employees"].isna().sum()

np.int64(268)

In [44]:
# Distribute the Nan into the different categories without affecting their proportions:

categories = df_2016['no_employees'].dropna().value_counts(normalize=True)

num_missing = df_2016['no_employees'].isna().sum()

# Generate missing values based on the proportions
missing_values = np.random.choice(
    categories.index,
    size=num_missing,
    p=categories.values
)

# Replace NaN values directly in the DataFrame
df_2016.loc[df_2016['no_employees'].isna(), 'no_employees'] = missing_values

# Check if there are any NaNs left
print(df_2016['no_employees'].unique())  # Should show no NaNs
print(df_2016['no_employees'].isna().sum())  # Should be 0


['26-100' '6-25' '100-500' 'More than 1000' '500-1000' '1-5']
0


In [45]:
df_2016['no_employees'].value_counts()

no_employees
26-100            351
More than 1000    307
100-500           291
6-25              248
500-1000           90
1-5                70
Name: count, dtype: int64

In [52]:
df_2016["current_mental_health_disorder"].value_counts()

current_mental_health_disorder
Yes      532
No       514
Maybe    311
Name: count, dtype: int64

In [53]:
df_2016["past_mental_health_disorder"].value_counts()

past_mental_health_disorder
Yes      683
No       438
Maybe    236
Name: count, dtype: int64

In [56]:
df_2016["diagnosed_conditions_detail"].unique()

array(['Anxiety Disorder (Generalized, Social, Phobia, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       nan,
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Substance Use Disorder',
       'Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Obsessive-Compulsive Disorder',
       'Mood Disorder (Depression, Bipolar Disorder, etc)|Attention Deficit Hyperactivity Disorder|Post-traumatic Stress Disorder',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)|Substance Use Disorder|Addictive Disorder',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Attention Deficit Hyperactivity Disorder',
       'Mood Disorder (Depression, Bipolar Disorder, etc)|Attention Deficit Hyperactivity Disorder',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, 

In [55]:
df_2016["diagnosed_conditions"].isna().sum()

np.int64(0)