# Data Exploration & Insights

In [1]:
# preliminary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# name data folder and files
data_folder = './data/'
data_file = 'OECD_Mental_Health_Stats.csv'

In [3]:
# import data to pandas dataframe
mental_health_data = pd.read_csv(data_folder + data_file)

## Data Cleaning & Preparation

In [4]:
# Clean and Prepare Data

# Drop redundant columns that depend on another column
# Also drop 'Measure', 'Flag Codes', and 'Flags' columns because they are not needed in our data exploration
mental_health_data = mental_health_data.drop(['ISC11A', 'COUNTRY', 'MEASURE', 'Reference year', 'SEX', 'PIAAC_CATEGORY', 'INDICATOR', 'AGE', 'Measure', 'Flag Codes', 'Flags'], axis=1)

# Drop rows where Value is NaN, and drop columns Measure, Flag Codes, and Flags (because they indicate that value is NaN)
mental_health_data = mental_health_data.dropna(subset = ['Value'])

# Rename some columns for easier understanding
mental_health_data = mental_health_data.rename(columns={'ISCED-A 2011': 'Education', 'Category': 'Employment', 'YEAR': 'Year'})

# Convert categorical columns to type category
cat_cols = ['Country', 'Education', 'Gender', 'Age', 'Employment', 'Indicator']
mental_health_data[cat_cols] = mental_health_data[cat_cols].astype('category')

# View current dataset
mental_health_data.head()

Unnamed: 0,Country,Education,Gender,Age,Employment,Indicator,Year,Value
0,Austria,Below upper secondary education,Total,25-44 years,Employed,Percentage of adults who report having depression,2014,6.5209
1,Austria,Below upper secondary education,Women,25-44 years,Employed,Percentage of adults who report having depression,2014,8.8432
2,Austria,Below upper secondary education,Men,25-44 years,Employed,Percentage of adults who report having depression,2014,2.7627
3,Austria,Below upper secondary education,Total,25-44 years,Total,Percentage of adults who report having depression,2014,12.8613
4,Austria,Below upper secondary education,Women,25-44 years,Total,Percentage of adults who report having depression,2014,12.2516


In [5]:
mental_health_data.shape

(8428, 8)

## Data Exploration

In [6]:
# Create 3 separate dataframes for the 3 indicators in the dataset
depressed = mental_health_data[mental_health_data.Indicator == 'Percentage of adults who report having depression']
health_numeracy = mental_health_data[mental_health_data.Indicator == 'Percentage of adults reporting that they are in good health, by numeracy proficiency level ']
health_literacy = mental_health_data[mental_health_data.Indicator == 'Percentage of adults reporting that they are in good health, by literacy proficiency level ']

# Creat 1 dataframe that combines health_numeracy and health_literacy
health_all = mental_health_data[(mental_health_data.Indicator == 'Percentage of adults reporting that they are in good health, by numeracy proficiency level ') | (mental_health_data.Indicator == 'Percentage of adults reporting that they are in good health, by literacy proficiency level ')]


In [8]:
depressed.head()

Unnamed: 0,Country,Education,Gender,Age,Employment,Indicator,Year,Value
0,Austria,Below upper secondary education,Total,25-44 years,Employed,Percentage of adults who report having depression,2014,6.5209
1,Austria,Below upper secondary education,Women,25-44 years,Employed,Percentage of adults who report having depression,2014,8.8432
2,Austria,Below upper secondary education,Men,25-44 years,Employed,Percentage of adults who report having depression,2014,2.7627
3,Austria,Below upper secondary education,Total,25-44 years,Total,Percentage of adults who report having depression,2014,12.8613
4,Austria,Below upper secondary education,Women,25-44 years,Total,Percentage of adults who report having depression,2014,12.2516


In [9]:
health_all.head()

Unnamed: 0,Country,Education,Gender,Age,Employment,Indicator,Year,Value
5832,Israel,All levels of education,Men,25-64 years,Level (0/1),Percentage of adults reporting that they are i...,2015,67.0
5833,England,All levels of education,Women,25-64 years,Level (4/5),Percentage of adults reporting that they are i...,2012,91.0
5834,Estonia,Upper secondary and post-secondary non-tertiar...,Women,25-64 years,Level (0/1),Percentage of adults reporting that they are i...,2012,41.0
5835,Chile,All levels of education,Total,25-64 years,Level (4/5),Percentage of adults reporting that they are i...,2015,7.0
5836,Japan,All levels of education,Total,25-64 years,Level (4/5),Percentage of adults reporting that they are i...,2012,77.0


### Depression Dataset - 2014

In [11]:
depressed.groupby(['Education'])['Value'].mean()

Education
All levels of education                                      6.527361
Below upper secondary education                              9.729693
Tertiary education                                           4.875315
Upper secondary and post-secondary non-tertiary education    6.977213
Name: Value, dtype: float64

Above shows a trend that lower levels of education is correlated with higher depression percentages. Let's explore this a bit further by also grouping by employment.

In [31]:
depressed.groupby(['Employment'])['Value'].mean()

Employment
Active         6.734838
Employed       5.816607
Level (0/1)         NaN
Level (2)           NaN
Level (3)           NaN
Level (4/5)         NaN
Total          8.445505
Name: Value, dtype: float64

In [12]:
depressed.groupby(['Education', 'Employment'])['Value'].mean()

Education                                                  Employment
All levels of education                                    Active         6.248466
                                                           Employed       5.427080
                                                           Total          7.906535
Below upper secondary education                            Active         9.222765
                                                           Employed       7.528450
                                                           Total         12.283211
Tertiary education                                         Active         4.786342
                                                           Employed       4.364011
                                                           Total          5.473121
Upper secondary and post-secondary non-tertiary education  Active         6.765906
                                                           Employed       6.046579
                 

In all levels of education, employed adults has lower depression levels than active job seeking adults. 

In [15]:
depressed.groupby(['Country'])['Value'].mean()

Country
Australia                                       NaN
Austria                                    7.370024
Belgium                                    5.760428
Canada                                          NaN
Chile                                           NaN
Czech Republic                             3.129252
Denmark                                    7.779905
England                                         NaN
Estonia                                    4.390859
Finland                                    9.027674
Flanders                                        NaN
France                                     5.699592
Germany                                   11.246651
Greece                                     3.015814
Hungary                                    3.982471
Iceland                                   12.651997
Ireland                                   12.697508
Israel                                          NaN
Italy                                      2.890819
Japa

**Only data for France, none for any other country we care about so let's use health_all dataset for country segmentation**

In [16]:
depressed.groupby(['Year'])['Value'].mean()

Year
2014    7.006444
Name: Value, dtype: float64

### Health Datasets - 2012 & 2015

In [17]:
health_numeracy.groupby(['Employment'])['Value'].mean()

Employment
Active               NaN
Employed             NaN
Level (0/1)    36.149727
Level (2)      40.282245
Level (3)      42.976986
Level (4/5)    46.157749
Total                NaN
Name: Value, dtype: float64

In [18]:
health_literacy.groupby(['Employment'])['Value'].mean()

Employment
Active               NaN
Employed             NaN
Level (0/1)    36.154132
Level (2)      39.847786
Level (3)      42.644595
Level (4/5)    46.274157
Total                NaN
Name: Value, dtype: float64

In [19]:
health_all.groupby(['Employment'])['Value'].mean()

Employment
Active               NaN
Employed             NaN
Level (0/1)    36.151920
Level (2)      40.064733
Level (3)      42.809660
Level (4/5)    46.215520
Total                NaN
Name: Value, dtype: float64

In [29]:
health_all.loc[health_all['Country'].isin(['Australia', 'France', 'United States'])].groupby(['Country','Employment'])['Value'].mean()

Country        Employment 
Australia      Level (0/1)    39.310417
               Level (2)      43.141667
               Level (3)      44.537500
               Level (4/5)    46.147619
France         Level (0/1)    36.112500
               Level (2)      40.791667
               Level (3)      43.558333
               Level (4/5)    46.602941
United States  Level (0/1)    37.022917
               Level (2)      42.627083
               Level (3)      46.002632
               Level (4/5)    48.391176
Name: Value, dtype: float64

From the above statistics, it is obvious that there is correlation between the numerical and literacy proficiency level and the health rate. Higher levels of both (or either one of) literacy and numerical proficiency is correlated with higher good-health reports. 

In [22]:
health_all[health_all.Year == 2012].groupby(['Country'])['Value'].mean()

Country
Australia                                 43.191935
Austria                                   41.990698
Belgium                                         NaN
Canada                                    44.474444
Chile                                           NaN
Czech Republic                            45.133333
Denmark                                   41.170330
England                                   43.361111
Estonia                                   30.802778
Finland                                   39.826136
Flanders                                  43.063793
France                                    41.385955
Germany                                   44.522093
Greece                                          NaN
Hungary                                         NaN
Iceland                                         NaN
Ireland                                   45.146629
Israel                                          NaN
Italy                                     43.879775
Japa

**Countries Average Reports of Good Health Condition in 2012**

Of the countries we are interested in, we have their statistics on % of population who reported themselves in good health condition:
* Australia: 43.1919
* China: N/A
* France: 41.385955
* Russia: N/A
* United States: 42.955952

Note we used 2012 instead of 2015 because 2015 did not have any data for the above countries