In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn import mixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
#set environment variables
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set_style('whitegrid')
sns.set_palette('Set2')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Load the raw survey export; df_original is kept as the untouched reference.
df_original = pd.read_csv("mental-heath-in-tech-2016_20161114.csv")
# Work on an independent copy. A plain assignment would only alias the same
# object, so the in-place rename/cleaning below would also alter df_original.
df_work = df_original.copy()

In [None]:
#datatypes: count how many columns share each dtype and show it as a bar chart
dtype_frame = df_work.dtypes.to_frame()
dtype_counts = dtype_frame.groupby(0).size().to_frame()
dtype_counts = dtype_counts.rename(columns={0:'count'})
dtype_counts.plot.bar()

In [3]:
# rename columns
# The survey export uses the full question text as column header. The mapping
# below replaces each question with a short identifier. Prefixes group the
# questions: emp_ (current employer), se_ (self-employed), prevemp_ (previous
# employers), mh_/mhi_/mhdisorder_ (mental health), ph_ (physical health).

# Snapshot of the original (long) column names; not used again in this cell.
li_or_col = list(df_work.columns)

di_col = {
'Are you self-employed?':'self-employed',
'How many employees does your company or organization have?':'num_employees',
'Is your employer primarily a tech company/organization?':'tech_comp',
'Is your primary role within your company related to tech/IT?':'tech_role',
'Does your employer provide mental health benefits as part of healthcare coverage?':'emp_benefits',
'Do you know the options for mental health care available under your employer-provided coverage?':'emp_know_options',
'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?':'emp_discussed',
'Does your employer offer resources to learn more about mental health concerns and options for seeking help?':'emp_ressources',
'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?':'emp_anonymity',
'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:':'emp_medicalleave',
'Do you think that discussing a mental health disorder with your employer would have negative consequences?':'emp_disc_mhdisorder_negcons',
# NOTE(review): this is the *physical* health question, yet the short name says
# "mhissue". The name is referenced by later cells, so it is left unchanged here.
'Do you think that discussing a physical health issue with your employer would have negative consequences?':'emp_disc_mhissue_negcons',
'Would you feel comfortable discussing a mental health disorder with your coworkers?':'emp_disc_mhdisorder_coworkers',
'Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?':'emp_disc_mhdisorder_supervisor',
'Do you feel that your employer takes mental health as seriously as physical health?':'emp_mh_ph_serious',
'Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?':'emp_observed_negcons',
'Do you have medical coverage (private insurance or state-provided) which includes treatment of mental health issues?':'se_coverage_mh',
'Do you know local or online resources to seek help for a mental health disorder?':'se_ressources',
'If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts?':'se_reveal_clients',
'If you have revealed a mental health issue to a client or business contact, do you believe this has impacted you negatively?':'se_reveal_clients_negcons',
'If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?':'se_reveal_coworkers',
'If you have revealed a mental health issue to a coworker or employee, do you believe this has impacted you negatively?':'se_reveal_coworkers_negcons',
'Do you believe your productivity is ever affected by a mental health issue?':'productivity_affected',
'If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?':'productivity_affected_percentage',
'Do you have previous employers?':'prev_emp',
'Have your previous employers provided mental health benefits?':'prevemp_benefits',
'Were you aware of the options for mental health care provided by your previous employers?':'prevemp_know_options',
'Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?':'prevemp_discussed',
'Did your previous employers provide resources to learn more about mental health issues and how to seek help?':'prevemp_ressources',
'Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?':'prevemp_anonymity',
'Do you think that discussing a mental health disorder with previous employers would have negative consequences?':'prevemp_mh_disc_negcons',
'Do you think that discussing a physical health issue with previous employers would have negative consequences?':'prevemp_ph_disc_negcons',
'Would you have been willing to discuss a mental health issue with your previous co-workers?':'prevemp_disc_mhissues_coworkers',
'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?':'prevemp_disc_mhissues_supervisor',
'Did you feel that your previous employers took mental health as seriously as physical health?':'prevemp_mh_ph_serious',
'Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?':'prevemp_negcons',
'Would you be willing to bring up a physical health issue with a potential employer in an interview?':'ph_jobinterview',
'Why or why not?':'why',
'Would you bring up a mental health issue with a potential employer in an interview?':'mh_jobinterview',
# presumably the '.1' suffix is what read_csv assigned to the second, identically
# worded "Why or why not?" header - TODO confirm against the CSV
'Why or why not?.1':'why.1',
'Do you feel that being identified as a person with a mental health issue would hurt your career?':'mh_career',
'Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?':'mh_coworkers_perspective_neg',
'How willing would you be to share with friends and family that you have a mental illness?':'mh_sharewfriends',
'Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?':'mhi_badresponse_work',
'Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?':'observations_otherindividual_less_likely_reveal',
'Do you have a family history of mental illness?':'family_history',
'Have you had a mental health disorder in the past?':'mhdisorder_past',
'Do you currently have a mental health disorder?':'mhdisorder_now',
'If yes, what condition(s) have you been diagnosed with?':'mhdisorder_what_diagnosed',
'If maybe, what condition(s) do you believe you have?':'mhdisorder_what_believe',
'Have you been diagnosed with a mental health condition by a medical professional?':'mh_diagnosed_medprof',
'If so, what condition(s) were you diagnosed with?':'medprof_what_diagnosed',
'Have you ever sought treatment for a mental health issue from a mental health professional?':'sought_treatment',
'If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?':'mhi_treated_inferencework',
'If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?':'mhi_NOTtreated_inferencework',
'What is your age?':'age',
'What is your gender?':'gender',
'What country do you live in?':'country_live',
'What US state or territory do you live in?':'usstate_live',
'What country do you work in?':'country_work',
'What US state or territory do you work in?':'usstate_work',
'Which of the following best describes your work position?':'work_position',
'Do you work remotely?':'remotework'
}

# In-place rename: this mutates the object df_work refers to (and therefore any
# other name bound to the same object). Headers missing from di_col are kept.
df_work.rename(columns=di_col, inplace=True)
# Last expression of the cell: display the resulting column index.
df_work.columns

Index(['self-employed', 'num_employees', 'tech_comp', 'tech_role',
       'emp_benefits', 'emp_know_options', 'emp_discussed', 'emp_ressources',
       'emp_anonymity', 'emp_medicalleave', 'emp_disc_mhdisorder_negcons',
       'emp_disc_mhissue_negcons', 'emp_disc_mhdisorder_coworkers',
       'emp_disc_mhdisorder_supervisor', 'emp_mh_ph_serious',
       'emp_observed_negcons', 'se_coverage_mh', 'se_ressources',
       'se_reveal_clients', 'se_reveal_clients_negcons', 'se_reveal_coworkers',
       'se_reveal_coworkers_negcons', 'productivity_affected',
       'productivity_affected_percentage', 'prev_emp', 'prevemp_benefits',
       'prevemp_know_options', 'prevemp_discussed', 'prevemp_ressources',
       'prevemp_anonymity', 'prevemp_mh_disc_negcons',
       'prevemp_ph_disc_negcons', 'prevemp_disc_mhissues_coworkers',
       'prevemp_disc_mhissues_supervisor', 'prevemp_mh_ph_serious',
       'prevemp_negcons', 'ph_jobinterview', 'why', 'mh_jobinterview', 'why.1',
       'mh_career', 

**Let us start with some exploration of the data set.**

In [None]:
# (rows, columns) of the working frame
n_rows, n_cols = df_work.shape
print((n_rows, n_cols))

In [None]:
#some exploration: preview, summary statistics (numeric and object columns),
#column names, dtypes and the number of columns per dtype
exploration_views = [
    df_work.head(),
    df_work.describe(),
    df_work.describe(include=['O']),
    df_work.columns,
    df_work.dtypes,
    df_work.dtypes.to_frame().value_counts(),
]
for view in exploration_views:
    print(view)

In [None]:
#exploring nan values
# per-column NaN counts, then the grand total and its share of all cells
nan_per_column = df_work.isnull().sum()
print(nan_per_column)
total_nan = df_work.isnull().sum().sum()
print(total_nan)
print(total_nan/df_work.size*100,'%')

# NaN percentage of every column, collected for the histogram in the next cell
li_percentages = []

for col in df_work.columns:
    n_missing = df_work[col].isnull().sum()
    pct_missing = n_missing/df_work[col].size*100
    print(col, ':', n_missing, ':', pct_missing,'%')
    li_percentages.append(pct_missing)
    print('')

In [None]:
#bin the percentages
# include_lowest=True closes the first interval on the left. Without it the
# first bin is (0, 10], so every column with exactly 0 % NaN falls outside all
# bins and silently disappears from the chart.
li_percentages_bins = pd.cut(
    li_percentages,
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    include_lowest=True,
)

# Categorical.value_counts() keeps the bins in interval order
ax = li_percentages_bins.value_counts().plot.bar(rot=0, color="b", figsize=(6,4))
plt.title('Percentage of NaN values per column')
plt.xlabel('Percentage of NaN values')
plt.ylabel('Number of columns')
plt.show()


In [None]:
#list of columns with value counts including nan values
for col, column_values in df_work.items():
    print(col)
    print(column_values.value_counts(dropna=False))
    print('')

In [None]:
#list columns with more than 80% of nan values
nan_share_by_col = [
    (col, df_work[col].isnull().sum()/df_work[col].size*100)
    for col in df_work.columns
]
for col, nan_share in nan_share_by_col:
    if nan_share > 80:
        print(col)

In [None]:
#value count on the whole dataframe
# number of distinct values per column, NaN counted as its own value
unique_per_column = df_work.nunique(dropna=False)
print(unique_per_column.sum())

# summary statistics of the per-column cardinalities (displayed as cell output)
unique_per_column.to_frame().describe()

In [None]:
#exploratory plots of emp-fields
# one count plot per column whose name starts with 'emp_'
emp_columns = [name for name in df_work.columns if name.startswith('emp_')]
for col in emp_columns:
    fig_emp, ax_emp = plt.subplots(figsize=(10,5))
    sns.countplot(x=col, data=df_work, ax=ax_emp)
    ax_emp.set_title(col)
    plt.show()

In [None]:
#exploratory plots of se-fields
# one count plot per column whose name starts with 'se_'
se_columns = [name for name in df_work.columns if name.startswith('se_')]
for col in se_columns:
    fig_se, ax_se = plt.subplots(figsize=(10,5))
    sns.countplot(x=col, data=df_work, ax=ax_se)
    ax_se.set_title(col)
    plt.show()

In [None]:
#exploratory plots of prevemp-fields
# the 'prev' prefix matches the prevemp_* columns and also 'prev_emp'
prev_columns = [name for name in df_work.columns if name.startswith('prev')]
for col in prev_columns:
    fig_prev, ax_prev = plt.subplots(figsize=(10,5))
    sns.countplot(x=col, data=df_work, ax=ax_prev)
    ax_prev.set_title(col)
    plt.show()

In [None]:
# relative answer frequencies for the current / past disorder questions
disorder_columns = ('mhdisorder_now', 'mhdisorder_past')
for disorder_col in disorder_columns:
    print(df_work[disorder_col].value_counts(normalize=True))

#plot mhdisorder_now and mhdisorder_past
# side-by-side count plots, also written to disk
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for panel, disorder_col in zip(ax, disorder_columns):
    sns.countplot(x=disorder_col, data=df_work, ax=panel)
plt.savefig('mhdisorder_now_past.png')
plt.show()




In [None]:
#compare top 5 countries work and live with most respondents
live_counts = df_work['country_live'].value_counts()
work_counts = df_work['country_work'].value_counts()

print('Country live')
print(live_counts.head(5))
print(' ')
print('Country work')
print(work_counts.head(5))

# plot top 5 countries live against countries work in grouped bar chart
# Two separate Series.plot(kind='bar') calls draw bars by position, so a
# different top-5 ranking would stack bars of different countries under one
# tick label. Aligning both counts on a shared country index gives a genuinely
# grouped chart with correct labels.
top_countries = live_counts.head(5).index.union(work_counts.head(5).index, sort=False)
df_top_countries = pd.DataFrame({
    'live': live_counts.reindex(top_countries, fill_value=0),
    'work': work_counts.reindex(top_countries, fill_value=0),
})

ax = df_top_countries.plot(kind='bar', color=['blue', 'green'], alpha=0.5)
ax.legend(loc='upper right')
ax.set_title('Top 5 countries live vs work')
plt.show()

**After some exploratory analysis, we will now start to clean the columns**

In [None]:
#as we are investigating employees only, drop all rows where self-employed is yes
print(df_work.shape)
# The guard makes the cell safe to re-run after the column has been removed.
if 'self-employed' in df_work.columns:
    # .copy() detaches the filtered rows from the source frame, so later column
    # assignments do not raise SettingWithCopyWarning; drop() returns a new
    # frame instead of mutating a slice in place.
    df_work = (
        df_work.loc[df_work['self-employed'] == 0]
        .copy()
        .drop(columns=['self-employed'])
    )
print(df_work.shape)

In [None]:
#detect columns with more than 70% of nan values and drop these columns
li_sparse_cols = [
    col for col in df_work.columns
    if df_work[col].isnull().sum()/df_work[col].size*100 > 70
]
for col in li_sparse_cols:
    print(col)
# remove all detected columns in a single call
df_work.drop(li_sparse_cols, axis=1, inplace=True)

In [None]:
#Check if variables are dependent, so one can be removed
# Each pair is tested with a chi-square test of independence on its
# contingency table.
li_test_pairs = [
                ["country_live","country_work"],
                ["usstate_live","usstate_work"],
                ["medprof_what_diagnosed","mhdisorder_what_diagnosed"],
                ["medprof_what_diagnosed","mh_diagnosed_medprof"],
                ["emp_disc_mhdisorder_negcons","emp_disc_mhissue_negcons"],
                ["emp_disc_mhdisorder_coworkers","emp_disc_mhissue_negcons"],
                ["emp_disc_mhdisorder_supervisor","emp_disc_mhissue_negcons"],
                ["emp_observed_negcons","emp_disc_mhissue_negcons"],
                ["prevemp_ph_disc_negcons","prevemp_mh_disc_negcons"],
                ["prevemp_disc_mhissues_coworkers","prevemp_mh_disc_negcons"],
                ["prevemp_disc_mhissues_supervisor","prevemp_mh_disc_negcons"],
                ["prevemp_negcons","prevemp_mh_disc_negcons"],
                ["ph_jobinterview","mh_jobinterview"],
                ["mh_career","mh_coworkers_perspective_neg"],
                ["mh_career","mh_sharewfriends"],
                ["mh_career","mhi_badresponse_work"],
                ["mh_career","observations_otherindividual_less_likely_reveal"],
                ["mh_diagnosed_medprof","sought_treatment"],
                ["mhi_treated_inferencework","mhi_NOTtreated_inferencework"]
            ]

# significance level, shared by all tests
alpha = 0.05

for x, y in li_test_pairs:

    # named `contingency` so the builtin input() is not shadowed
    contingency = pd.crosstab(df_work[x], df_work[y], margins = False)

    try:
        stat, p, dof, expected = stats.chi2_contingency(contingency)
    except ValueError as err:
        # chi2_contingency rejects tables it cannot evaluate (e.g. an expected
        # frequency of zero). Report the pair instead of skipping it silently.
        print(x + " & " + y + " skipped: " + str(err))
        continue

    if p <= alpha:
        print(x + " & " + y + " are Dependent. p value =" + str(p))
    else:
        print(x + " & " + y + " are Independent. p value =" + str(p))

#drop columns that are dependent
# NOTE: the list is fixed by hand; it is not derived from the test results.
li_drop_fields = [
    "emp_disc_mhissue_negcons", "emp_disc_mhdisorder_coworkers",
    "emp_disc_mhdisorder_supervisor", "emp_observed_negcons",
    "prevemp_ph_disc_negcons", "prevemp_disc_mhissues_coworkers",
    "prevemp_disc_mhissues_supervisor", "prevemp_negcons",
    "mh_jobinterview", "mh_coworkers_perspective_neg", "mh_sharewfriends",
    "mhi_badresponse_work", "observations_otherindividual_less_likely_reveal",
    "mhdisorder_what_diagnosed", "sought_treatment",
    "mhi_NOTtreated_inferencework", "country_live", "usstate_live",
    "usstate_work",
]

# remove every listed column at once, then report each of them
df_work.drop(li_drop_fields, axis=1, inplace=True)
for f in li_drop_fields:
    print (f + " deleted")

In [None]:
#cleaning age column

#print range of age column
print("range of age column")
print(df_work['age'].min())
print(df_work['age'].max())

#detect outliers in age column
sns.boxplot(x=df_work['age'])
plt.show()

#count number of outliers, excluding nan values
# comparisons with NaN are False, so missing ages are never counted as outliers
too_old = df_work['age'] > 90
too_young = df_work['age'] < 15
print("number of outliers in age column")
print("over 90 years:" + str(too_old.sum()))
print("under 15 years:" + str(too_young.sum()))

#delete these outliers
df_work = df_work.loc[~(too_old | too_young)].copy()

# Bin age into the categories expected by the later categorical conversion.
# The thresholds are inclusive so each age lands in the bin its label names:
# the previous "> 55" / "> 25" put 55-year-olds into "54-25" and 25-year-olds
# into "0-24". The labels (including the reversed "54-25") are kept
# byte-for-byte because a later cell lists them as categories.
# np.select takes the first matching condition; rows matching none (including
# NaN ages) fall through to "0-24", as before.
age = df_work["age"]
df_work["age_cat"] = np.select(
    [age > 64, age >= 55, age >= 25],
    ["Older than 64", "55-64", "54-25"],
    default="0-24",
)
df_work = df_work.drop(columns=["age"])

In [None]:
#cleaning gender column
# free-text answers are normalised (lower case, no surrounding whitespace)
# before being matched against the spelling lists below
gender_clean = df_work["gender"].str.lower().str.strip()

li_g_male = ["cisdude", "cis male","m","man","maile", "Make", "Mal", "Cis Man","cis man", "man", "gender is male", "Male", "male", "ostensibly male, unsure what that really means", "male.", "M", "Malr", "malr", "Cis Male", "m", "mail","male (cis)", "msle", "Mail", "male ", "dude", "i'm a man why didn't you make this a drop down question. you should of asked sex? and i would of answered yes please. seriously how much text can this take?"]
li_g_female = ["fem", "cis female", "cis-female/femme", "i identify as female.", "F", "Woman", "female (props for making this a freeform field, though)", "female assigned at birth", "female", "woman", "femail", "Female", "Femake", "Female (cis)", "cis-woman", "cisgender female", "female/woman", "f", "female-bodied; no feelings about gender", "fm", " female", "cis female"]
li_g_diverse = ["non-binary", "agender", "human", "genderqueer", "nonbinary", "m|", "fluid", "male/genderqueer", "mtf", "unicorn", "male (trans, ftm)", "genderflux demi-girl", "afab", "queer", "enby", "genderqueer woman", "genderfluid", "none of your business", "sex is male", "nb masculine", "other", "male 9:1 female, roughly", "androgynous", "female or multi-gender femme", "other/transfeminine", "genderfluid (born female)", "transitioned, m2f", "bigender", "transgender woman"]

# Build one lookup from the three lists, normalising the list entries exactly
# like the data. Capitalised entries such as "Make", "Mal", "Femake" or
# "Female (cis)" could otherwise never match the lower-cased answers.
# setdefault keeps the first group an entry appears in (male, female, diverse),
# mirroring the original replacement order.
gender_lookup = {}
for gender_label, spellings in (
    ("male", li_g_male),
    ("female", li_g_female),
    ("diverse", li_g_diverse),
):
    for spelling in spellings:
        gender_lookup.setdefault(spelling.lower().strip(), gender_label)

# Answers without a lookup entry keep their normalised text; missing answers
# are assigned to 'diverse'. Assigning the result back (instead of an in-place
# fillna on the column selection) also works under pandas copy-on-write.
gender_mapped = gender_clean.map(gender_lookup)
df_work["gender"] = gender_mapped.fillna(gender_clean).fillna("diverse")

In [None]:
#count unique values in each value to identify columns with high variety
li_uniques = [[col, df_work[col].nunique()] for col in df_work.columns]

df_uniques = pd.DataFrame(li_uniques, columns=["Col","NUnique"])

# columns whose cardinality lies above the 95 % quantile of all cardinalities
nunique_cutoff = df_uniques["NUnique"].quantile(0.95)
li_high_variety = df_uniques[(df_uniques["NUnique"] > nunique_cutoff)]["Col"].to_list()
print(li_high_variety)

#drop columns with high variety
df_work.drop(li_high_variety, axis=1, inplace=True)

In [None]:
# cleaning medprof_what_diagnosed column
# Missing answers become an explicit "no diagnosis" label; the text is then
# normalised. The result is assigned back instead of using an in-place fillna
# on the column selection, which is unreliable under pandas copy-on-write.
df_work["medprof_what_diagnosed"] = (
    df_work["medprof_what_diagnosed"]
    .fillna("No diagnosis")
    .str.lower()
    .str.strip()
)

li_f10 = ["substance use disorder","addictive disorder"]
li_f20 = ["schizotypal personality disorder", "psychotic disorder (schizophrenia, schizoaffective, etc)","schizotypal personality disorder 'autism spectrum disorder", "schizotypal personality disorder 'autism spectrum disorder'", "suicidal ideation"]
li_f30 = ["depression", "seasonal affective disorder", "seasonal affective disorder 'burn out", "mood disorder (depression, bipolar disorder, etc)"]
li_f40 = ["burn out", "obsessive-compulsive disorder", "post-traumatic stress disorder", "posttraumatic stress disourder", "stress response syndromes", "anxiety disorder (generalized, social, phobia, etc)", "dissociative disorder"]
li_f50 = ["eating disorder (anorexia, bulimia, etc)"]
li_f60 = ["gender dysphoria", "intimate disorder", "personality disorder (borderline, antisocial, paranoid, etc)", "gender identity disorder"]
li_f80 = ['"autism (asperger\'s)",', "autism (asperger's)", "asperger", "aspergers", "asperger syndrome", "attention deficit disorder (but not the hyperactive version)", "autism", 'autism - while not a "mental illness", still greatly affects how i handle anxiety', "autism spectrum disorder", "pdd-nos", "pdd-nos (see above)"]
li_f90 = ["mcd (when it was diagnosed, the ultra-mega \"disorder\" adhd didn\'t exist yet)", "attention deficit disorder", "attention deficit hyperactivity disorder", "add (w/o hyperactivity)"]

# diagnosis text -> group code; the first group listing a text wins, matching
# the order of the original chained replacements
diag_lookup = {}
for diag_code, diag_labels in (
    ("F10", li_f10), ("F20", li_f20), ("F30", li_f30), ("F40", li_f40),
    ("F50", li_f50), ("F60", li_f60), ("F80", li_f80), ("F90", li_f90),
):
    for diag_label in diag_labels:
        diag_lookup.setdefault(diag_label, diag_code)

#replace and one hot encode
# One row per (respondent, diagnosis): answers hold several diagnoses joined by "|".
df_diag_long = (
    df_work["medprof_what_diagnosed"]
    .str.split("|")
    .explode()
    .str.strip()
    .replace(diag_lookup)
    .to_frame()
)
# explode() repeats the respondent's index label once per diagnosis. Collapse
# the dummies back to one row per respondent (max = "has this diagnosis")
# before joining; otherwise the column-wise concat meets a duplicated index.
df_diag_dummies = (
    pd.get_dummies(df_diag_long['medprof_what_diagnosed'], prefix='diag', prefix_sep='_')
    .groupby(level=0)
    .max()
)
df_work = pd.concat([df_work, df_diag_dummies], axis=1)
df_work = df_work.drop(columns=["medprof_what_diagnosed"])

In [None]:
#clean up work_position column
# normalise the text; missing answers count as "other"
wp_clean = (
    df_work["work_position"]
    .str.lower()
    .str.strip()
    .fillna("other")
)

# Define dictionary to map original values to new values
wp_dict = {'other': 'other', 'one-person shop': 'sole_proprietorship', 'hr': 'business_role', 'sales': 'business_role', 'support': 'business_role', 'back-end developer': 'developer', 'front-end developer': 'developer', 'executive leadership': 'leadership_role', 'supervisor/team lead': 'leadership_role', 'designer': 'non_dev_tech_role', 'dev evangelist/advocate': 'non_dev_tech_role', 'devops/sysadmin': 'non_dev_tech_role'}

# Map the values in the "work_position" column and one-hot encode
# Split the "|"-joined multi-select answer FIRST, then map each single role.
# Mapping the unsplit string turns every multi-role answer into NaN, because
# only single roles are keys of wp_dict. Roles without an entry stay NaN and
# therefore get no dummy column.
wp_long = wp_clean.str.split("|").explode().str.strip().map(wp_dict)
# one row per respondent again: a role group is set if any listed role maps to it
wp_dummies = (
    pd.get_dummies(wp_long, prefix="wp", prefix_sep="_")
    .groupby(level=0)
    .max()
)
df_work = pd.concat([df_work.drop(columns=["work_position"]), wp_dummies], axis=1)

In [None]:
#clean up mh_diagnosed_medprof column
# "Yes" is rewritten via the string accessor, then whole-value "No" is replaced;
# the column still holds strings ("1"/"0") afterwards
diagnosed_flag = df_work["mh_diagnosed_medprof"].str.replace("Yes", "1")
df_work["mh_diagnosed_medprof"] = diagnosed_flag.replace("No", "0")

In [None]:
#lower and strip all text in all columns
li_text_cols = [col for col in df_work.columns if df_work[col].dtype == "object"]
for col in li_text_cols:
    df_work[col] = df_work[col].str.lower().str.strip()

In [None]:
#save dataframe
# independent (deep) copy: the employee-specific cleaning below leaves df_work untouched
df_emp = df_work.copy(deep=True)

In [None]:
#clean emp-dataframe

#drop columns with 100% nan values
df_emp = df_emp.dropna(axis=1, how="all")

# fill nan values, selecting an appropriate value for each column
# A single DataFrame.fillna with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` calls. Those operate on a column
# selection and do not reliably write back under pandas copy-on-write.
fill_values = {
    "emp_know_options": "i am not sure",
    "prevemp_benefits": "i don't know",
    "prevemp_know_options": "n/a (not currently aware)",
    "prevemp_discussed": "i don't know",
    # no neutral answer exists for this question: use the most frequent one
    "prevemp_ressources": df_emp["prevemp_ressources"].mode()[0],
    "prevemp_anonymity": "i don't know",
    "prevemp_mh_disc_negcons": "i don't know",
    "prevemp_mh_ph_serious": "i don't know",
}
df_emp = df_emp.fillna(value=fill_values)

# remaining NaN per column (displayed as cell output)
df_emp.isna().sum()

In [None]:
#in emp-dataframe, set nominal columns to category type and set categories
# `Series.cat.set_categories(..., inplace=True)` no longer exists in pandas 2.x.
# Casting to an explicit unordered CategoricalDtype works on every version and
# yields the same result: values outside the listed categories become NaN.
nominal_categories = {
    "emp_benefits": ["yes", "i don't know", "no", "not eligible for coverage / n/a"],
    "emp_know_options": ["yes", "no", "i am not sure"],
    "emp_discussed": ["yes", "no", "i don't know"],
    "emp_ressources": ["yes", "no", "i don't know"],
    "emp_anonymity": ["yes", "no", "i don't know"],
    "emp_disc_mhdisorder_negcons": ["yes", "no", "maybe"],
    "emp_mh_ph_serious": ["yes", "no", "i don't know"],
    "ph_jobinterview": ["yes", "no", "maybe"],
    "family_history": ["yes", "no", "i don't know"],
    "mhdisorder_past": ["yes", "no", "maybe"],
    "mhdisorder_now": ["yes", "no", "maybe"],
    "gender": ["male", "female", "diverse"],
}

for col, categories in nominal_categories.items():
    df_emp[col] = df_emp[col].astype(CategoricalDtype(categories=categories, ordered=False))

# countries: categories are taken from the values present in the data
df_emp["country_work"] = df_emp["country_work"].astype("category")


#in emp-dataframe, set ordinal columns to category type and set categories
# Each list is given in its intended rank order (first = lowest code).
# Values outside a list become NaN in the cast.
ordinal_categories = {
    "num_employees": ["1-5","6-25","26-100","100-500","500-1000", "more than 1000"],
    "emp_medicalleave": ["very easy","somewhat easy","neither easy nor difficult","somewhat difficult","very difficult","i don't know"],
    "prevemp_benefits": ["yes, they all did","some did","no, none did","i don't know"],
    "prevemp_know_options": ["yes, i was aware of all of them","i was aware of some","no, i only became aware later","n/a (not currently aware)"],
    "prevemp_discussed": ["yes, they all did","some did","none did","i don't know"],
    "prevemp_ressources": ["yes, they all did","some did","none did"],
    "prevemp_anonymity": ["yes, always","sometimes","no","i don't know"],
    "prevemp_mh_disc_negcons": ["yes, all of them","some of them","none of them","i don't know"],
    "prevemp_mh_ph_serious": ["yes, they all did","some did","none did","i don't know"],
    "mh_career": ["yes, it has","yes, i think it would","maybe","no, i don't think it would","no, it has not"],
    # frequency scale in descending order; "rarely" must precede "never"
    # (the two were swapped before, breaking the ordinal ranking)
    "mhi_treated_inferencework": ["often","sometimes","rarely","never","not applicable to me"],
    "remotework": ["always","sometimes","never"],
    # labels must match the lower-cased age_cat values created during cleaning
    "age_cat": ["0-24","54-25","55-64","older than 64"],
}

for col, categories in ordinal_categories.items():
    df_emp[col] = df_emp[col].astype(CategoricalDtype(categories=categories, ordered=True))


#set mh_diagnosed_medprof to int64
df_emp["mh_diagnosed_medprof"] = df_emp["mh_diagnosed_medprof"].astype("int64")

#df_emp dtypes
print(df_emp.dtypes)
print(df_emp.isna().sum())

**After the data is cleaned, we will now start encoding**

In [None]:
# Fitted encoder per categorical column (kept for later decoding), plus the
# names of the ordered / unordered categorical columns.
dict_encoders = {}
li_ordered = []
li_not_ordered = []

#encode categorical columns
for col in df_emp.columns:
    if df_emp[col].dtype.name != "category":
        continue

    if df_emp[col].cat.ordered:
        print(col + " ordered")
        li_ordered.append(col)
        # Pass the declared category order explicitly. With the default
        # categories='auto' the encoder sorts the values lexicographically, so
        # the ordinal ranking of the pandas dtype would be discarded.
        # NOTE: with explicit categories, NaN values in the column are rejected
        # at fit time, so the column must be fully filled before encoding.
        encoder = OrdinalEncoder(categories=[list(df_emp[col].cat.categories)])
    else:
        print(col + " not ordered")
        li_not_ordered.append(col)
        # nominal column: codes follow the sorted category values
        encoder = OrdinalEncoder()

    dict_encoders[col] = encoder
    # fit_transform returns an (n, 1) array; flatten it for the column assignment
    df_emp[col] = encoder.fit_transform(df_emp[[col]]).ravel()

#set all columns to int64
df_emp = df_emp.astype("int64")
df_emp.info()

**Now we start clustering.**

In [None]:
#investigate PCA on df_emp data
# full PCA (all components) to inspect how much variance each one explains
pca = PCA()
pca.fit(df_emp)
print(pca.explained_variance_ratio_)

cumulative_variance = pca.explained_variance_ratio_.cumsum()

plt.figure(figsize=(10,10))
# The x axis is derived from the number of fitted components. A hard-coded
# range(1, 45) only works for exactly 44 components and otherwise raises a
# shape mismatch.
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')
plt.title('Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.savefig("PCA.png")

In [None]:
#conduct PCA on df_emp data
# project the encoded frame onto its first three principal components
pca = PCA(n_components=3, random_state=0).fit(df_emp)
pca_data_emp = pca.transform(df_emp)
df_pca_emp = pd.DataFrame(pca_data_emp, columns=["pca1", "pca2", "pca3"])
print(df_pca_emp.head())

#plot pca data in 3d
fig_pca = plt.figure(figsize=(10,10))
ax = fig_pca.add_subplot(projection="3d")
ax.scatter(df_pca_emp["pca1"], df_pca_emp["pca2"], df_pca_emp["pca3"])
ax.set_xlabel("pca1")
ax.set_ylabel("pca2")
ax.set_zlabel("pca3")
plt.savefig("PCA_3d.png")
plt.show()

In [None]:
# plot scores for different number of clusters
# One fit per k feeds all three diagnostics (silhouette, Davies-Bouldin, WCSS).
# Separate loops with identical settings and seed fitted every k twice for the
# same result.
silhouette_scores = []
davies_bouldin_scores = []
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init="k-means++", random_state=0)
    kmeans.fit(df_emp)
    wcss.append(kmeans.inertia_)
    if i < 2:
        # silhouette / Davies-Bouldin are only computed for two or more clusters
        continue
    labels = kmeans.labels_
    silhouette_scores.append(silhouette_score(df_emp, labels, metric='euclidean'))
    davies_bouldin_scores.append(davies_bouldin_score(df_emp, labels))

plt.figure(figsize=(10,10))
plt.plot(range(2, 11), silhouette_scores, label="Silhouette score")
plt.plot(range(2, 11), davies_bouldin_scores, label="Davies-Bouldin score")
plt.xlabel("Number of clusters")
plt.ylabel("Score")
plt.legend()
plt.show()

#show the scores per number of clusters (k = 2..10)
print(silhouette_scores)
print(davies_bouldin_scores)

# silhouette plots for k = 2..5 on a 2x2 grid
fig, ax = plt.subplots(2, 2, figsize=(15,8))
for n_clusters in [2, 3, 4, 5]:
    kmeans_vis = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=100, random_state=0)
    # divmod maps k=2,3,4,5 to grid cells (0,0), (0,1), (1,0), (1,1)
    q, mod = divmod(n_clusters, 2)
    visualizer = SilhouetteVisualizer(kmeans_vis, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(df_emp)

# elbow plot of the within-cluster sum of squares
plt.figure(figsize=(10,10))
plt.plot(range(1, 11), wcss, marker="o", linestyle="--")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()


In [None]:
# plot scores for different number of clusters using pca data
# One fit per k feeds all three diagnostics (silhouette, Davies-Bouldin, WCSS).
# Separate loops with identical settings and seed fitted every k twice for the
# same result.
silhouette_scores = []
davies_bouldin_scores = []
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init="k-means++", random_state=0)
    kmeans.fit(df_pca_emp)
    wcss.append(kmeans.inertia_)
    if i < 2:
        # silhouette / Davies-Bouldin are only computed for two or more clusters
        continue
    labels = kmeans.labels_
    silhouette_scores.append(silhouette_score(df_pca_emp, labels, metric='euclidean'))
    davies_bouldin_scores.append(davies_bouldin_score(df_pca_emp, labels))

plt.figure(figsize=(10,10))
plt.plot(range(2, 11), silhouette_scores, label="Silhouette score")
plt.plot(range(2, 11), davies_bouldin_scores, label="Davies-Bouldin score")
plt.xlabel("Number of clusters")
plt.ylabel("Score")
plt.legend()
plt.show()

#show the scores per number of clusters (k = 2..10)
print(silhouette_scores)
print(davies_bouldin_scores)

# silhouette plots for k = 2..5 on a 2x2 grid
fig, ax = plt.subplots(2, 2, figsize=(15,8))
for n_clusters in [2, 3, 4, 5]:
    kmeans_vis = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=100, random_state=0)
    # divmod maps k=2,3,4,5 to grid cells (0,0), (0,1), (1,0), (1,1)
    q, mod = divmod(n_clusters, 2)
    visualizer = SilhouetteVisualizer(kmeans_vis, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(df_pca_emp)      # Fit the data to the visualizer

# elbow plot of the within-cluster sum of squares
plt.figure(figsize=(10,10))
plt.plot(range(1, 11), wcss, marker="o", linestyle="--")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()

In [None]:
#cluster data using kmeans, using optimal number of clusters
#fit on df_pca_emp: the same data the model selection cells and the silhouette score below use
#NOTE(review): the original fitted on pca_data_emp - confirm it holds the same values as df_pca_emp
kmeans = KMeans(n_clusters=2, init="k-means++", random_state=0)
kmeans.fit(df_pca_emp)
emp_labels = kmeans.labels_
print(emp_labels)

#silhouette score
print(silhouette_score(df_pca_emp, emp_labels, metric='euclidean'))

#put results into a dataframe
df_emp_C = pd.concat([df_emp.reset_index(drop=True), df_pca_emp.reset_index(drop=True)], axis=1)
#rename the last three columns by assigning a fresh column list;
#writing into columns.values mutates the Index buffer in place, which pandas does not support
renamed_columns = list(df_emp_C.columns)
renamed_columns[-3:] = ["pca1", "pca2", "pca3"]
df_emp_C.columns = renamed_columns
#assign the relabelled series directly; a chained replace(..., inplace=True) on a
#column selection is not guaranteed to write back into the frame
df_emp_C['Cluster'] = pd.Series(emp_labels, index=df_emp_C.index).replace({0:'Cluster 1', 1:'Cluster 2'})
#explicit display: a bare expression in the middle of a cell is not rendered
display(df_emp_C.head())

#drop trained models to save space
del pca
del kmeans

In [None]:
#try agglomerative clustering on pca data
agg = AgglomerativeClustering(n_clusters=2)
labels = agg.fit_predict(df_pca_emp)

#silhouette score
print(silhouette_score(df_pca_emp, labels, metric='euclidean'))

#put results into a dataframe
df_emp_CAGG = pd.concat([df_emp.reset_index(drop=True), df_pca_emp.reset_index(drop=True)], axis=1)
#rename the last three columns by assigning a fresh column list;
#writing into columns.values mutates the Index buffer in place, which pandas does not support
renamed_columns = list(df_emp_CAGG.columns)
renamed_columns[-3:] = ["pca1", "pca2", "pca3"]
df_emp_CAGG.columns = renamed_columns
#assign the relabelled series directly; a chained replace(..., inplace=True) on a
#column selection is not guaranteed to write back into the frame
df_emp_CAGG['Cluster'] = pd.Series(labels, index=df_emp_CAGG.index).replace({0:'Cluster 1', 1:'Cluster 2'})

#drop trained models to save space
del agg

In [None]:
#try gaussian mixture model on pca data
#random_state pins the initialisation so the component labels are reproducible across runs
gmm = mixture.GaussianMixture(n_components=2, covariance_type='full', random_state=0)
gmm.fit(df_pca_emp)
labels = gmm.predict(df_pca_emp)

#silhouette score
print(silhouette_score(df_pca_emp, labels, metric='euclidean'))

#put results into a dataframe
df_emp_CGMM = pd.concat([df_emp.reset_index(drop=True), df_pca_emp.reset_index(drop=True)], axis=1)
#rename the last three columns by assigning a fresh column list;
#writing into columns.values mutates the Index buffer in place, which pandas does not support
renamed_columns = list(df_emp_CGMM.columns)
renamed_columns[-3:] = ["pca1", "pca2", "pca3"]
df_emp_CGMM.columns = renamed_columns
#assign the relabelled series directly; a chained replace(..., inplace=True) on a
#column selection is not guaranteed to write back into the frame
df_emp_CGMM['Cluster'] = pd.Series(labels, index=df_emp_CGMM.index).replace({0:'Cluster 1', 1:'Cluster 2'})

#drop trained models to save space
del gmm

In [None]:
#visualize df_emp_C, df_emp_CAGG, df_emp_CGMM side by side on the first two PCA components
fig, ax = plt.subplots(1, 3, figsize=(20,10))
panels = [
    ("KMeans", df_emp_C),
    ("Agglomerative clustering", df_emp_CAGG),
    ("Gaussian mixture", df_emp_CGMM),
]
for panel_ax, (panel_title, panel_data) in zip(ax, panels):
    sns.scatterplot(x="pca1", y="pca2", data=panel_data, hue="Cluster", hue_order=["Cluster 1", "Cluster 2"], palette=["red", "green"], ax=panel_ax)
    #title each panel so the figure can be read without the code
    panel_ax.set_title(panel_title)
#save into 'Plots/' like every other figure in this notebook (the directory name is case-sensitive on some filesystems)
plt.savefig("Plots/cluster_comparison.png")
plt.show()


**Start interpreting the results**

In [None]:
#number of rows per k-means cluster
cluster_sizes = df_emp_C['Cluster'].value_counts()
print(cluster_sizes)

#same counts as a pie chart with percentage labels
fig, ax = plt.subplots(figsize=(10,10))
cluster_sizes.plot.pie(autopct='%1.1f%%', ax=ax)
plt.savefig('Plots/emp_cluster_pie.png')
plt.show()

In [None]:
#separate clusters: one frame per cluster, without the cluster label and PCA columns
helper_columns = ['Cluster', 'pca1', 'pca2', 'pca3']
in_cluster_1 = df_emp_C['Cluster'] == 'Cluster 1'
in_cluster_2 = df_emp_C['Cluster'] == 'Cluster 2'
df_emp_C1 = df_emp_C.loc[in_cluster_1].drop(columns=helper_columns)
df_emp_C2 = df_emp_C.loc[in_cluster_2].drop(columns=helper_columns)

In [None]:
#per-cluster column means in long format (one row per cluster/variable pair)
df_polar = (
    df_emp_C
    .groupby("Cluster")
    .mean()
    .reset_index()
    .melt(id_vars=["Cluster"])
)
#polar line chart: one closed line per cluster, one spoke per variable
fig4 = px.line_polar(
    df_polar,
    r="value",
    theta="variable",
    color="Cluster",
    line_close=True,
    height=600,
    width=1000,
)
fig4.show()

In [None]:
#C1: share of each answer value per column
#value_counts per column -> transpose so each row is one column of df_emp_C1 -> normalise every row to sum to 1
#computed once and reused (previously the same pipeline ran three times and two results were discarded)
c1_value_shares = df_emp_C1.apply(pd.Series.value_counts).fillna(0).T.apply(lambda x: x/x.sum(), axis=1)

#plot value shares as horizontal stacked bars
c1_value_shares.plot(kind='barh', stacked=True, figsize=(20,10))
plt.savefig('Plots/emp_cluster1.png')

In [None]:
#C2: share of each answer value per column
#value_counts per column -> transpose so each row is one column of df_emp_C2 -> normalise every row to sum to 1
#computed once and reused (previously the same pipeline ran three times and two results were discarded)
c2_value_shares = df_emp_C2.apply(pd.Series.value_counts).fillna(0).T.apply(lambda x: x/x.sum(), axis=1)

#plot value shares as horizontal stacked bars
c2_value_shares.plot(kind='barh', stacked=True, figsize=(20,10))
plt.savefig('Plots/emp_cluster2.png')

In [None]:
#per cluster and per variable: percentage of rows taking each value
df_emp_C_comp = (
    df_emp_C
    .drop(columns=['pca1', 'pca2', 'pca3'])
    .groupby('Cluster')
    #value counts of every column within the cluster, one row per variable
    .apply(lambda x: x.apply(pd.Series.value_counts).fillna(0).T)
    #remove the columns produced by counting the cluster labels themselves
    .drop(columns=['Cluster 1', 'Cluster 2'])
    #turn each row of counts into percentages
    .apply(lambda x: x/x.sum()*100, axis=1)
    .round(2)
    .fillna(0)
    .reset_index()
    .rename(columns={'level_1':'variable'})
    .sort_values(by=['variable'])
)
df_emp_C_comp

In [None]:
#one horizontally stacked bar chart per variable, with one bar per cluster
for variable_name in df_emp_C_comp['variable'].unique():
    variable_rows = df_emp_C_comp[df_emp_C_comp['variable'] == variable_name]
    shares_by_cluster = variable_rows.drop(columns=['variable']).set_index('Cluster')
    shares_by_cluster.plot(kind='barh', stacked=True, figsize=(20,10))
    plt.title(variable_name)
    #file name is derived from the variable name
    plt.savefig('Plots/emp_' + variable_name + '.png')

**Start asking questions about the data**

In [None]:
#map the encoded columns of df_emp back through their fitted encoders
#NOTE(review): this overwrites df_emp in place, so the cell is presumably not safe to run twice - confirm

for encoded_col, encoder in dict_encoders.items():
    df_emp[encoded_col] = encoder.inverse_transform(df_emp[[encoded_col]])

In [None]:
# show the column names of df_emp (rendered as the cell output)
df_emp.columns

In [None]:
#percentage of 'mhdisorder_now' / 'mhdisorder_past' answers per value of each work-position column,
#restricted to rows where tech_comp == 1.0

def print_disorder_share_by_role(df, role_col, disorder_col):
    """Print the row-wise percentage split of `disorder_col` for each value of `role_col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    role_col : str
        Column whose values form the rows of the printed table.
    disorder_col : str
        Column whose values form the columns of the printed table.
    """
    shares = (
        df.groupby(role_col)[disorder_col]
        .value_counts()
        .unstack()
        #each row is normalised on its own so it sums to 100
        .apply(lambda x: x/x.sum()*100, axis=1)
        .round(2)
        #combinations that never occur become 0 instead of NaN
        .fillna(0)
        .astype(str)
        + "%"
    )
    print(shares)


#work-position column -> wording used in the printed heading
role_headings = {
    'wp_business_role': 'Business Roles',
    'wp_developer': 'Developer Roles',
    'wp_leadership_role': 'Leadership Roles',
    'wp_non_dev_tech_role': 'Non-Dev Tech Roles',
    'wp_other': 'Other Roles',
    'wp_sole_proprietorship': 'Sole Proprietorship',
}

#filter once instead of once per printed table
df_emp_tech = df_emp[df_emp["tech_comp"] == 1.0]

for role_col, role_heading in role_headings.items():
    print("Current mental health issue in " + role_heading)
    print_disorder_share_by_role(df_emp_tech, role_col, 'mhdisorder_now')
    print("Past mental health issue in " + role_heading)
    print_disorder_share_by_role(df_emp_tech, role_col, 'mhdisorder_past')


In [None]:
#Percentage split of current and past mental health disorder answers, rows with tech_comp == 1.0 only
is_tech_company = df_emp["tech_comp"] == 1.0
for disorder_col in ("mhdisorder_now", "mhdisorder_past"):
    answer_shares = df_emp[is_tech_company][disorder_col].value_counts(normalize=True)
    print(answer_shares.mul(100).round(2).astype(str) + "%")

In [None]:
#Percentage split of current and past mental health disorder answers per gender, rows with tech_comp == 1.0 only
df_tech = df_emp[df_emp["tech_comp"] == 1.0]
for disorder_col in ("mhdisorder_now", "mhdisorder_past"):
    answers_with_gender = df_tech[[disorder_col, "gender"]]
    #normalize=True yields shares within each gender group
    print(answers_with_gender.groupby(by="gender").value_counts(normalize=True).mul(100).round(2))


In [None]:
#number of rows per gender with each diagnosis flag set, rows with tech_comp == 1 only

diagnosis_columns = ['diag_F10', 'diag_F20', 'diag_F30', 'diag_F40', 'diag_F50', 'diag_F60', 'diag_F80', 'diag_F90', 'diag_no diagnosis']
in_tech_company = df_emp['tech_comp'] == 1
for diag in diagnosis_columns:
    has_diagnosis = df_emp[diag] == 1
    flagged_rows = df_emp.loc[has_diagnosis & in_tech_company, [diag, 'gender']]
    print(flagged_rows.groupby(by='gender').count())


In [None]:
#supervised learning, target label mhdisorder_now

#fetch new dataframe
#.copy() so the column assignments below write to an independent frame and not to a slice of df_emp
df_tech = df_emp[df_emp["tech_comp"] == 1.0].copy()

#ordinal encoding, one independent encoder per column
for col in df_tech.columns:
    #fit_transform returns an (n, 1) array; flatten it to a 1-D column
    df_tech[col] = OrdinalEncoder().fit_transform(df_tech[[col]]).ravel()

#drop irrelevant columns
df_tech = df_tech.drop(['diag_F10', 'diag_F20', 'diag_F30', 'diag_F40', 'diag_F50', 'diag_F60', 'diag_F80', 'diag_F90', 'diag_no diagnosis', 'tech_comp', 'prev_emp', 'mh_diagnosed_medprof'], axis=1)

#creating the feature and target variables
y = df_tech['mhdisorder_now']
x = df_tech.drop(['mhdisorder_now', 'mhdisorder_past'], axis=1)

#splitting the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

#fitting the model
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

#predicting the test set
y_pred = model.predict(x_test)

#evaluating the model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

#confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

#feature importance: absolute coefficients scaled so the largest is 100
#NOTE(review): only the first row of coef_ is used; if the target has more than two classes
#this reflects a single class only - confirm this is intended
feature_importance = np.abs(model.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

featfig = plt.figure(figsize=(10, 10))

#reorder names and values with the same index so they stay aligned
feats = x.columns[sorted_idx]
feature_importance = feature_importance[sorted_idx]
plt.barh(pos, feature_importance, align='center')
plt.yticks(pos, feats)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.savefig('Plots/feature_importance.png', bbox_inches='tight')
plt.show()

#make dataframe of feature importance
#use the reordered names (feats): pairing x.columns with the already sorted values
#would attach every importance to the wrong feature
df_feat_imp = pd.DataFrame({'feature': feats, 'importance': feature_importance})
df_feat_imp = df_feat_imp.sort_values(by='importance', ascending=False)
print(df_feat_imp)