In [70]:
# import libraries
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import lightgbm as ltb
import xgboost as xgb
import catboost as catb
import matplotlib.pyplot as plt

In [71]:
# import dataset
df = pd.read_csv("./combined_surveys_2016-2021/mental_health_data_2016-2021.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,age,sex,self-employed,working_country,living_country,mental_illness_family_history,have_mental_illness,mental_disorder_in_the_past,number_of_employees_your_company_has,mental_health_benefits_from_previous_employers,mental_illness_diagnosis,mental_health_benefits_from_current_employer,discussing_mental_health_with_coworkers,sought_mental_treatment,discussing_mental_health_with_supervisor,mental_illness_medical_coverage
0,0,39.0,Male,0,United Kingdom,United Kingdom,No,No,Yes,26-100,"No, none did",Yes,Not eligible for coverage / NA,Maybe,0,Yes,
1,1,29.0,male,0,United States of America,United States of America,Yes,Yes,Yes,0-25,"Yes, they all did",Yes,No,Maybe,1,Yes,
2,2,38.0,Male,0,United Kingdom,United Kingdom,No,No,Maybe,0-25,"No, none did",No,No,Maybe,1,Maybe,


## Grouping

In [72]:
# Grouping Countries (for working_country & living_country columns)
europe_rest = [ 'Latvia', 'Lithuania', 'Estonia',   'Austria', 'Switzerland', 'Germany']

scandinavia = ['Denmark', 'Norway', 'Finland', 'Iceland','Sweden']

east_europe = ['Belarus', 'Bulgaria', 'Czech Republic', 'Hungary', 'Poland', 'Ukraine', 'Slovakia', 'Romania']

west_europe = ['Belgium', 'France', 'Ireland', 'Netherlands', 'Luxembourg', 'United Kingdom']

south_europe = ['Albania', 'Bosnia and Herzegovina', 'Greece', 'Croatia', 'Italy', 
                'Portugal', 'Macedonia', 'Serbia', 'Slovenia', 'Spain', 'Turkey', 'Georgia']

asia_subregion = [ 'Singapore', 'Malaysia', 'Brunei', 'Vietnam', 'Philippines', 'Indonesia']

north_east_asia = ['China', 'Japan', 'Russia', 'Mongolia', 'Taiwan', 'Hong Kong']

south_asia = ['Pakistan', 'Sri Lanka', 'India', 'Bangladesh', 'Afghanistan',]

west_asia_middle_east = ['Saudi Arabia', 'Iran', 'Egypt', 'Israel', 'Jordan', 'United Arab Emirates',]

north_america = ['Mexico', 'Canada', 'United States of America']

south_america = ['Ecuador', 'Guatemala', 'Uruguay', 'Costa Rica', 'Venezuela', 'Kenya', 
                 'Nigeria', 'Argentina', 'Chile', 'Colombia', 'Brazil']

africa = ['Eritrea', 'Swaziland', 'Mauritius', 'Ethiopia', 'Ghana', 
          'Botswana', 'Algeria', 'Cameroon', 'Kenya', 'South Africa', 'Sao Tome and Principe']

australia = ['New Zealand', 'Australia']

In [73]:
pd.set_option("display.max_rows", None)
df['working_country'].value_counts(ascending=True)

Iran                           1
Ecuador                        1
Guatemala                      1
Jordan                         1
Eritrea                        1
Swaziland                      1
Singapore                      1
Luxembourg                     1
Latvia                         1
Mauritius                      1
Saudi Arabia                   1
Uruguay                        1
Ethiopia                       1
Georgia                        1
Ghana                          1
Botswana                       1
Algeria                        1
Cameroon                       1
Taiwan                         1
Egypt                          1
Malaysia                       1
Belarus                        1
Brunei                         1
Costa Rica                     1
Albania                        1
Venezuela                      1
Lithuania                      1
United Arab Emirates           1
Kenya                          2
Nigeria                        2
Mongolia  

In [74]:
country_columns = ['working_country', 'living_country']
for column_name in country_columns:
    for value in df[column_name]:
        if value in europe_rest:
            df[column_name].replace(value, 'europe_rest', inplace=True)
        elif value in scandinavia:
            df[column_name].replace(value, 'scandinavia', inplace=True)
        elif value in east_europe:
            df[column_name].replace(value, 'east_europe', inplace=True)
        elif value in west_europe:
            df[column_name].replace(value, 'west_europe', inplace=True)
        elif value in south_europe:
            df[column_name].replace(value, 'south_europe', inplace=True)   
        elif value in asia_subregion:
            df[column_name].replace(value, 'asia_subregion', inplace=True)
        elif value in north_east_asia:
            df[column_name].replace(value, 'north_east_asia', inplace=True)
        elif value in south_asia:
            df[column_name].replace(value, 'south_asia', inplace=True)
        elif value in west_asia_middle_east:
            df[column_name].replace(value, 'west_asia_middle_east', inplace=True)       
        elif value in north_america:
            df[column_name].replace(value, 'north_america', inplace=True)
        elif value in south_america:
            df[column_name].replace(value, 'south_america', inplace=True)
        elif value in africa:
            df[column_name].replace(value, 'africa', inplace=True)
        elif value in australia:
            df[column_name].replace(value, 'australia', inplace=True)

In [75]:
print(df['working_country'].value_counts(ascending=True))
print(df['living_country'].value_counts(ascending=True))

Other                       3
west_asia_middle_east      11
asia_subregion             15
africa                     21
north_east_asia            28
east_europe                45
scandinavia                58
south_america              66
australia                  73
south_europe              104
south_asia                110
europe_rest               143
west_europe               457
north_america            2133
Name: working_country, dtype: int64
Other                       2
west_asia_middle_east      10
asia_subregion             15
africa                     20
north_east_asia            32
east_europe                51
scandinavia                56
south_america              69
australia                  74
south_europe              106
south_asia                113
europe_rest               142
west_europe               461
north_america            2116
Name: living_country, dtype: int64


In [76]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,self-employed,working_country,living_country,mental_illness_family_history,have_mental_illness,mental_disorder_in_the_past,number_of_employees_your_company_has,mental_health_benefits_from_previous_employers,mental_illness_diagnosis,mental_health_benefits_from_current_employer,discussing_mental_health_with_coworkers,sought_mental_treatment,discussing_mental_health_with_supervisor,mental_illness_medical_coverage
0,0,39.0,Male,0,west_europe,west_europe,No,No,Yes,26-100,"No, none did",Yes,Not eligible for coverage / NA,Maybe,0,Yes,
1,1,29.0,male,0,north_america,north_america,Yes,Yes,Yes,0-25,"Yes, they all did",Yes,No,Maybe,1,Yes,
2,2,38.0,Male,0,west_europe,west_europe,No,No,Maybe,0-25,"No, none did",No,No,Maybe,1,Maybe,
3,3,43.0,male,1,west_europe,west_europe,No,Yes,Yes,,Some did,Yes,,,1,,1.0
4,4,43.0,Female,0,north_america,north_america,Yes,Yes,Yes,0-25,I don't know,Yes,Yes,Maybe,1,No,


In [77]:
# Grouping age column and creating age-range column

for value in df['age']:
    if value < 0:
        pass
    elif value <= 10:
        df['age'].replace(value, 'child', inplace=True)
    elif value <= 20:
        df['age'].replace(value, 'teenager', inplace=True)
    elif value <= 30:
        df['age'].replace(value, 'adult', inplace=True)
    elif value <= 50:
        df['age'].replace(value, 'middle-aged', inplace=True)
    elif value <= 60:
        df['age'].replace(value, 'old', inplace=True)
    elif value > 60:
        df['age'].replace(value, 'very_old', inplace=True)

In [78]:
df['age'].value_counts()

middle-aged    1912
adult          1162
old             124
teenager         34
very_old         31
child             4
Name: age, dtype: int64