# Predicting Happiness 

## Models 

### 1. Imports and dataset:

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score,recall_score,precision_score,f1_score
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Reading the clean and dummied dataset in
colstep = pd.read_csv('./Datasets /STEPColombia.csv')

In [6]:
# Confirming this is the correct dataset 
colstep.head()

Unnamed: 0,in_school,owns_house,house_beds,house_kitchen,reported_social_status,got_pr_transf,part_in_training,life_satisfaction,offdays_ill,healthinsurance,...,labor_market_status_1.0,job_stable_1.0,highest_ISCED_PIAAC_1,highest_ISCED_PIAAC_2,highest_ISCED_PIAAC_3,highest_ISCED_PIAAC_5,highest_ISCED_PIAAC_6,highest_ISCED_PIAAC_7,highest_ISCED_PIAAC_8,dropout_1.0
0,0.0,2,3,1,3,0,0.0,9.0,0.0,1.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,2,1,1,3,0,1.0,9.0,0.0,1.0,...,1,1,0,0,0,1,0,0,0,0
2,0.0,2,2,1,3,0,0.0,5.0,0.0,1.0,...,0,0,1,0,0,0,0,0,0,1
3,0.0,2,2,1,3,0,0.0,7.0,0.0,1.0,...,0,0,0,0,1,0,0,0,0,1
4,0.0,1,1,1,3,0,0.0,7.0,0.0,1.0,...,1,1,0,0,1,0,0,0,0,0


In [7]:
colstep.shape

(2617, 114)

### 2. Identifying X and y 

In [8]:
y = colstep['life_satisfaction']

In [9]:
# Looking at the columns to identify those that have a dummy so that I can exclude 
# them from X
colstep.columns[:55]

Index(['in_school', 'owns_house', 'house_beds', 'house_kitchen',
       'reported_social_status', 'got_pr_transf', 'part_in_training',
       'life_satisfaction', 'offdays_ill', 'healthinsurance',
       'speak_other_languaje', 'lives_w_mother', 'lives_w_father', 'ses_15',
       'read_overall', 'write_overall', 'numeracy_overall',
       'computer_use_overall', 'think_learn_work', 'autonomy_at_work',
       'repetitiveness_at_work', 'physical_demand_work', 'extraversion_av',
       'conscientiousness_avg', 'openness_av', 'stability_av',
       'agreeableness_av', 'grit_av', 'decision_av', 'hostile_av', 'risk',
       'has_children', 'hh_size', 'gender', 'age', 'has_spouse',
       'chronic_disease', 'shocks_bef_15', 'BMI', 'mother_tongue',
       'labor_market_status', 'job_stable', 'hour_earnings',
       'total_hr_worked_week', 'highest_ISCED_PIAAC', 'years_educ',
       'wealth_index', 'overqualified', 'country', 'dropout', 'got_pu_transf',
       'supervise', 'in_school_1.0', 'own

In [10]:
# Setting the X values. Excluding life satisfaction and other features that have dummies. 
X = colstep.drop(columns=['life_satisfaction','in_school', 'owns_house', 'house_beds',
                          'house_kitchen','reported_social_status', 'got_pr_transf', 
                          'got_pu_transf', 'part_in_training','life_satisfaction', 
                          'offdays_ill', 'healthinsurance','speak_other_languaje', 
                          'lives_w_mother', 'lives_w_father','read_overall', 
                          'write_overall', 'numeracy_overall', 'supervise', 
                          'computer_use_overall', 'think_learn_work', 'autonomy_at_work',
                          'repetitiveness_at_work', 'physical_demand_work', 
                          'has_children', 'hh_size', 'gender', 'has_spouse',
                          'chronic_disease', 'shocks_bef_15', 'mother_tongue',
                          'labor_market_status', 'job_stable', 'country'])

### 3. Splitting and standardizing data 

In [11]:
# Splitting into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [12]:
# Making sure the shapes are identical 
X_train.shape

(1962, 81)

In [13]:
X_test.shape

(655, 81)

In [14]:
y_train.shape

(1962,)

In [15]:
y_test.shape

(655,)

In [16]:
# Given that some of these features are in different units, they must be standardized. Below I am instantiating the Standard Scaler 
ss = StandardScaler()

In [17]:
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [18]:
# Transforming X train 
X_train_scaled = ss.transform(X_train)

In [19]:
# Transforming the array into a list so I can convert it into a dataframe and concatenate it to the dataframe with the unscaled features 
X_train_scaled = X_train_scaled.tolist()

In [20]:
# Transforming the list into a dataframe 
X_train_scaled = pd.DataFrame(X_train_scaled)

In [21]:
# Looking at the resulting dataframe 
X_train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,-1.820271,1.477655,0.050599,-2.272730,-2.039060,-0.903208,0.590989,0.244716,-0.364798,-0.553877,...,0.725025,-0.764731,-0.559690,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
1,-0.197412,1.477655,-0.615677,-1.001608,-1.127026,-2.622018,-2.148899,1.080456,-0.364798,-0.553877,...,-1.379263,-0.764731,-0.559690,3.402488,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
2,0.343541,0.986189,0.716875,0.905075,0.697041,0.242666,0.590989,0.662586,0.449461,2.323544,...,0.725025,1.307649,-0.559690,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
3,-1.279318,0.003256,0.050599,-0.366047,0.241024,1.388539,0.043012,1.498327,2.077979,-0.553877,...,0.725025,-0.764731,-0.559690,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
4,0.343541,0.003256,-0.615677,-0.366047,0.697041,-0.903208,-0.504966,-1.844635,0.449461,-0.553877,...,0.725025,-0.764731,-0.559690,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,-0.738365,0.003256,-1.281952,-1.637169,0.697041,0.242666,-1.052944,-1.008895,-0.364798,-0.553877,...,0.725025,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
1958,-0.738365,0.003256,0.716875,0.905075,2.065091,-0.903208,-1.052944,-0.173154,-0.364798,-0.553877,...,0.725025,1.307649,-0.559690,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
1959,0.884494,-1.962608,1.383150,-3.543851,1.153058,-0.903208,-1.600921,-1.844635,0.449461,-0.553877,...,0.725025,1.307649,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
1960,-1.279318,0.986189,1.383150,1.540636,0.241024,1.388539,0.043012,1.498327,1.263720,2.323544,...,0.725025,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900


In [22]:
# Looking at the column names to create a dictionary
X_train.columns

Index(['ses_15', 'extraversion_av', 'conscientiousness_avg', 'openness_av',
       'stability_av', 'agreeableness_av', 'grit_av', 'decision_av',
       'hostile_av', 'risk', 'age', 'BMI', 'hour_earnings',
       'total_hr_worked_week', 'highest_ISCED_PIAAC', 'years_educ',
       'wealth_index', 'overqualified', 'dropout', 'in_school_1.0',
       'owns_house_2', 'owns_house_3', 'house_beds_2', 'house_beds_3',
       'house_kitchen_1', 'reported_social_status_1',
       'reported_social_status_2', 'reported_social_status_3',
       'reported_social_status_4', 'reported_social_status_5',
       'reported_social_status_6', 'got_pr_transf_1', 'got_pu_transf_1',
       'part_in_training_1.0', 'offdays_ill_1.0', 'healthinsurance_1.0',
       'speak_other_languaje_1.0', 'lives_w_mother_1.0', 'lives_w_father_1.0',
       'read_overall_1.0', 'read_overall_2.0', 'read_overall_3.0',
       'write_overall_1.0', 'write_overall_2.0', 'write_overall_3.0',
       'numeracy_overall_1.0', 'numeracy_overa

In [23]:
# Setting the dictionary with column names 
scalednames = {0:'ses_15', 1: 'extraversion_av', 2: 'conscientiousness_avg', 3: 'openness_av',
               4:'stability_av', 5: 'agreeableness_av', 6: 'grit_av', 7: 'decision_av',
               8: 'hostile_av', 9: 'risk', 10: 'age', 11: 'BMI', 12: 'hour_earnings',
               13: 'total_hr_worked_week', 14: 'highest_ISCED_PIAAC', 15: 'years_educ',
               16: 'wealth_index', 17: 'overqualified', 18: 'dropout', 19: 'in_school_1.0',
               20: 'owns_house_2', 21: 'owns_house_3', 22: 'house_beds_2', 23: 'house_beds_3',
               24: 'house_kitchen_1', 25: 'reported_social_status_1', 26: 'reported_social_status_2', 
               27: 'reported_social_status_3', 28: 'reported_social_status_4', 29: 'reported_social_status_5', 
               30: 'reported_social_status_6', 31: 'got_pr_transf_1', 32: 'got_pu_transf_1',33: 'part_in_training_1.0', 
               34: 'offdays_ill_1.0', 35: 'healthinsurance_1.0', 36: 'speak_other_languaje_1.0', 
               37: 'lives_w_mother_1.0', 38: 'lives_w_father_1.0', 39: 'read_overall_1.0', 40: 'read_overall_2.0', 
               41: 'read_overall_3.0', 42: 'write_overall_1.0', 43: 'write_overall_2.0', 44: 'write_overall_3.0',
               45: 'numeracy_overall_1.0', 46: 'numeracy_overall_2.0', 47: 'numeracy_overall_3.0', 48: 'supervise_1.0', 
               49: 'computer_use_overall_1.0', 50: 'computer_use_overall_2.0', 51: 'computer_use_overall_3.0', 
               52: 'think_learn_work_1.0', 53: 'think_learn_work_2.0', 54: 'think_learn_work_3.0', 55: 'autonomy_at_work_1.0',
               56: 'autonomy_at_work_2.0', 57:'autonomy_at_work_3.0', 58: 'repetitiveness_at_work_1.0', 59: 'repetitiveness_at_work_2.0',
               60: 'repetitiveness_at_work_3.0', 61: 'physical_demand_work_1.0', 62: 'physical_demand_work_2.0', 
               63: 'physical_demand_work_3.0', 64: 'has_children_1.0', 65: 'hh_size_2', 66: 'gender_1', 67: 'has_spouse_1',
               68: 'chronic_disease_1.0', 69: 'shocks_bef_15_1.0', 70: 'mother_tongue_1.0', 71: 'labor_market_status_1.0', 
               72: 'job_stable_1.0', 73: 'highest_ISCED_PIAAC_1', 74: 'highest_ISCED_PIAAC_2', 75: 'highest_ISCED_PIAAC_3', 
               76: 'highest_ISCED_PIAAC_5', 77: 'highest_ISCED_PIAAC_6',78: 'highest_ISCED_PIAAC_7', 79: 'highest_ISCED_PIAAC_8', 80: 'dropout_1.0'}

In [24]:
# Rename columns 
X_train_scaled.rename(columns = scalednames, inplace = True)

In [25]:
# Confirming 
X_train_scaled.head()

Unnamed: 0,ses_15,extraversion_av,conscientiousness_avg,openness_av,stability_av,agreeableness_av,grit_av,decision_av,hostile_av,risk,...,labor_market_status_1.0,job_stable_1.0,highest_ISCED_PIAAC_1,highest_ISCED_PIAAC_2,highest_ISCED_PIAAC_3,highest_ISCED_PIAAC_5,highest_ISCED_PIAAC_6,highest_ISCED_PIAAC_7,highest_ISCED_PIAAC_8,dropout_1.0
0,-1.820271,1.477655,0.050599,-2.27273,-2.03906,-0.903208,0.590989,0.244716,-0.364798,-0.553877,...,0.725025,-0.764731,-0.55969,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
1,-0.197412,1.477655,-0.615677,-1.001608,-1.127026,-2.622018,-2.148899,1.080456,-0.364798,-0.553877,...,-1.379263,-0.764731,-0.55969,3.402488,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,-0.6749
2,0.343541,0.986189,0.716875,0.905075,0.697041,0.242666,0.590989,0.662586,0.449461,2.323544,...,0.725025,1.307649,-0.55969,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.6749
3,-1.279318,0.003256,0.050599,-0.366047,0.241024,1.388539,0.043012,1.498327,2.077979,-0.553877,...,0.725025,-0.764731,-0.55969,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
4,0.343541,0.003256,-0.615677,-0.366047,0.697041,-0.903208,-0.504966,-1.844635,0.449461,-0.553877,...,0.725025,-0.764731,-0.55969,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.6749


In [26]:
# Confirming that the shape is intact
X_train.shape

(1962, 81)

In [27]:
# Transforming the X test data 
X_test.head()

Unnamed: 0,ses_15,extraversion_av,conscientiousness_avg,openness_av,stability_av,agreeableness_av,grit_av,decision_av,hostile_av,risk,...,labor_market_status_1.0,job_stable_1.0,highest_ISCED_PIAAC_1,highest_ISCED_PIAAC_2,highest_ISCED_PIAAC_3,highest_ISCED_PIAAC_5,highest_ISCED_PIAAC_6,highest_ISCED_PIAAC_7,highest_ISCED_PIAAC_8,dropout_1.0
2576,3.0,3.333333,3.666667,4.0,4.0,4.0,3.666667,3.25,2.0,4.0,...,0,0,1,0,0,0,0,0,0,0
1033,3.0,2.333333,3.0,4.0,2.0,2.333333,2.333333,2.75,2.5,1.0,...,1,1,0,0,1,0,0,0,0,0
2217,1.0,3.666667,4.0,2.333333,1.666667,3.0,4.0,4.0,2.5,4.0,...,1,0,1,0,0,0,0,0,0,1
1124,7.0,1.666667,3.666667,3.666667,2.0,3.0,3.333333,3.25,2.5,1.0,...,0,0,0,0,1,0,0,0,0,0
1945,3.0,2.333333,3.666667,3.0,3.666667,3.333333,2.333333,3.5,1.0,1.0,...,1,1,0,0,1,0,0,0,0,0


In [28]:
# Checking the shape of data 
X_test.shape

(655, 81)

In [29]:
# Transforming X test 
X_test_transf = ss.transform(X_test)

In [30]:
# Looking at the result
X_test_transf

array([[-0.73836509,  0.49472239,  0.71687457, ..., -0.07508751,
        -0.03194383, -0.67489971],
       [-0.73836509, -0.97967592, -0.61567662, ..., -0.07508751,
        -0.03194383, -0.67489971],
       [-1.82027122,  0.98618872,  1.38314993, ..., -0.07508751,
        -0.03194383,  1.48170164],
       ...,
       [-0.73836509,  0.49472239,  0.05059874, ..., -0.07508751,
        -0.03194383,  1.48170164],
       [ 0.34354104,  0.49472239, -1.94822781, ..., -0.07508751,
        -0.03194383,  1.48170164],
       [ 0.34354104, -0.48820959, -0.61567662, ..., -0.07508751,
        -0.03194383, -0.67489971]])

In [31]:
# Transforming the array into a list so I can convert it into a dataframe and concatenate it to the dataframe with the unscaled features 
X_test_transf = X_test_transf.tolist()

In [32]:
# Transforming the list into a dataframe 
X_test_transf = pd.DataFrame(X_test_transf)

In [33]:
# Looking at the resulting dataframe 
X_test_transf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,-0.738365,0.494722,0.716875,1.540636,2.065091,1.388539,1.138967,0.244716,0.449461,2.323544,...,-1.379263,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
1,-0.738365,-0.979676,-0.615677,1.540636,-0.671009,-1.476145,-1.052944,-0.591024,1.263720,-0.553877,...,0.725025,1.307649,-0.559690,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
2,-1.820271,0.986189,1.383150,-1.637169,-1.127026,-0.330271,1.686944,1.498327,1.263720,2.323544,...,0.725025,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
3,1.425447,-1.962608,0.716875,0.905075,-0.671009,-0.330271,0.590989,0.244716,1.263720,-0.553877,...,-1.379263,-0.764731,-0.559690,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
4,-0.738365,-0.979676,0.716875,-0.366047,1.609075,0.242666,-1.052944,0.662586,-1.179057,-0.553877,...,0.725025,1.307649,-0.559690,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,0.343541,0.494722,0.716875,0.905075,-2.039060,-0.330271,1.686944,1.498327,0.449461,-0.553877,...,-1.379263,-0.764731,-0.559690,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
651,0.343541,0.986189,-2.614503,-1.637169,1.153058,-0.903208,-1.052944,-1.008895,-0.364798,2.323544,...,-1.379263,-0.764731,-0.559690,3.402488,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,-0.674900
652,-0.738365,0.494722,0.050599,0.269514,-0.214993,-0.903208,-0.504966,1.080456,0.449461,-0.553877,...,0.725025,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
653,0.343541,0.494722,-1.948228,0.269514,-1.127026,-0.330271,-1.052944,-1.426765,0.449461,-0.553877,...,-1.379263,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702


In [34]:
# Checking the shape of standardized test data 
X_test_transf.shape

(655, 81)

In [35]:
# Renaming columns 
X_test_transf.rename(columns = scalednames, inplace = True)

In [36]:
X_test_transf.head()

Unnamed: 0,ses_15,extraversion_av,conscientiousness_avg,openness_av,stability_av,agreeableness_av,grit_av,decision_av,hostile_av,risk,...,labor_market_status_1.0,job_stable_1.0,highest_ISCED_PIAAC_1,highest_ISCED_PIAAC_2,highest_ISCED_PIAAC_3,highest_ISCED_PIAAC_5,highest_ISCED_PIAAC_6,highest_ISCED_PIAAC_7,highest_ISCED_PIAAC_8,dropout_1.0
0,-0.738365,0.494722,0.716875,1.540636,2.065091,1.388539,1.138967,0.244716,0.449461,2.323544,...,-1.379263,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,-0.6749
1,-0.738365,-0.979676,-0.615677,1.540636,-0.671009,-1.476145,-1.052944,-0.591024,1.26372,-0.553877,...,0.725025,1.307649,-0.55969,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.6749
2,-1.820271,0.986189,1.38315,-1.637169,-1.127026,-0.330271,1.686944,1.498327,1.26372,2.323544,...,0.725025,-0.764731,1.786703,-0.293903,-0.740635,-0.420672,-0.290821,-0.075088,-0.031944,1.481702
3,1.425447,-1.962608,0.716875,0.905075,-0.671009,-0.330271,0.590989,0.244716,1.26372,-0.553877,...,-1.379263,-0.764731,-0.55969,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.6749
4,-0.738365,-0.979676,0.716875,-0.366047,1.609075,0.242666,-1.052944,0.662586,-1.179057,-0.553877,...,0.725025,1.307649,-0.55969,-0.293903,1.350193,-0.420672,-0.290821,-0.075088,-0.031944,-0.6749


## 4. Multiclass Classification 

**4.1. Linear Regression**

In [37]:
# Instantiating a linear regression model just to test if it works at all (though this is not a classification model)
linear_regression = LinearRegression()

In [38]:
# Cross validating 
# Train
cross_val_score(linear_regression, X_train_scaled, y_train, cv = 5).mean()

0.08303937886295434

In [39]:
# Fitting the model 
linear_regression.fit(X_train_scaled, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [40]:
linear_regression.score(X_train_scaled, y_train)

0.16080154701634775

In [41]:
# Generating predictions
y_predlinreg = linear_regression.predict(X_test_transf)

In [42]:
# Evaluating the model. This is not a good model 
r2_score(y_test, y_predlinreg)

0.08063252696846746

**4.2. Multiclass Logistic Regression:**

In [43]:
# Instantiating the model 
logreg = LogisticRegressionCV(multi_class='multinomial', solver = 'saga', penalty='l1')

In [44]:
cross_val_score(logreg, X_train_scaled, y_train).mean()



0.2818825770355566

In [45]:
# Fitting the model 
logreg.fit(X_train_scaled, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='multinomial', n_jobs=None,
                     penalty='l1', random_state=None, refit=True, scoring=None,
                     solver='saga', tol=0.0001, verbose=0)

In [46]:
# Making predictions 
y_predlogreg = logreg.predict(X_test_transf)

In [47]:
# Evaluating the model on the train data 
logreg.score(X_train_scaled, y_train)

0.3195718654434251

In [48]:
# Evaluating the model on the test data -- Accuracy 
logreg.score(X_test_transf, y_test)

0.29465648854961835

In [49]:
# Evaluating the model on the test data -- precision 
precision_score(y_test,y_predlogreg, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.1690477089654347

In [50]:
# Evaluating the model on the test data -- recall 
recall_score(y_test,y_predlogreg, average = 'macro')

0.14027081497685817

**4.3. K-Nearest Neighbors**

In [51]:
# Instantiating model 
knn = KNeighborsClassifier(n_neighbors = 20, algorithm='auto')

In [52]:
# Crossvalidating 
cross_val_score(knn, X_train_scaled, y_train).mean()



0.2415843367693433

In [53]:
# Fitting the model 
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')

In [54]:
# Making predictions 
y_predknn = knn.predict(X_test_transf)

In [55]:
# Evaluating model 
knn.score(X_train_scaled, y_train)

0.3343527013251784

In [56]:
knn.score(X_test_transf, y_test)

0.2381679389312977

In [57]:
# Evaluating the model on the test data -- precision 
precision_score(y_test,y_predknn, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.13235572977704185

In [58]:
# Evaluating the model on the test data -- recall 
recall_score(y_test,y_predknn, average = 'macro')

0.12726236303168306

**4.4. Decision Tree:**

In [59]:
# Instantiating model 
dec_tree = DecisionTreeClassifier(random_state = 42, max_depth = 2, min_samples_split = 2, min_samples_leaf = 2)

In [60]:
# Crossvalidating 
cross_val_score(dec_tree, X_train_scaled, y_train).mean()



0.2665825333612269

In [61]:
# Fitting the model 
dec_tree.fit(X_train_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [62]:
# Evaluating the model 
dec_tree.score(X_train_scaled, y_train)

0.28338430173292556

In [63]:
dec_tree.score(X_test_transf, y_test)

0.2885496183206107

In [64]:
# Making predictions 
y_preddt = dec_tree.predict(X_test_transf)

In [65]:
# Evaluating the model on the test data -- precision 
precision_score(y_test,y_preddt, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.06414837828450394

In [66]:
# Evaluating the model on the test data -- recall 
recall_score(y_test,y_preddt, average = 'macro')

0.13424041873912387

In [67]:
# Gridsearching for hyperparameter tunning 
grid = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [1, 2 ,3, 10],
                                  'min_samples_split': [2, 3, 4, 5, 20, 30],
                                  'min_samples_leaf': [2, 20, 40]},
                    cv = 5,
                    verbose = 1)

In [68]:
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    2.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3, 10],
                         'min_samples_leaf': [2, 20, 40],
                         

In [69]:
grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [70]:
grid.best_score_

0.26452599388379205

**4.5. Random Forest:**

In [71]:
# Instantiating the model 
rf = RandomForestClassifier(max_depth = 5, min_samples_leaf=50, min_samples_split=5)

In [72]:
# Crossvalidating 
cross_val_score(rf, X_train_scaled, y_train).mean()



0.26549767854239065

In [73]:
# Fitting the model 
rf.fit(X_train_scaled, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=50, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [74]:
# Evaluating the model 
rf.score(X_train_scaled, y_train)

0.3241590214067278

In [75]:
rf.score(X_test_transf, y_test)

0.2595419847328244

In [76]:
# Making predictions 
y_predrf = rf.predict(X_test_transf)

In [77]:
# Evaluating the model on the test data -- precision 
precision_score(y_test,y_predrf, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.16868867774978327

In [78]:
# Evaluating the model on the test data -- recall 
recall_score(y_test,y_predrf, average = 'macro')

0.12093209051615054

In [79]:
# Gridsearching for hyperparameter tunning 
gridrf = GridSearchCV(estimator = RandomForestClassifier(),
                    param_grid = {'max_depth': [2,3,4,5, 50, 100, 150],
                                  'min_samples_split': [2,3,4, 5, 40, 50, 100],
                                  'min_samples_leaf': [40, 45, 50, 55, 100, 150, 200]},
                    cv = 5,
                    verbose = 1)

In [82]:
gridrf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 343 candidates, totalling 1715 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1715 out of 1715 | elapsed:   32.3s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [83]:
gridrf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=40, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [84]:
gridrf.best_score_

0.29408766564729866

## 5. Binary Classification 

In [85]:
# Looking at the values of y
colstep['life_satisfaction'].value_counts()

10.0    662
8.0     618
7.0     402
9.0     385
6.0     209
5.0     199
4.0      80
3.0      34
1.0      19
2.0       9
Name: life_satisfaction, dtype: int64

In [86]:
# Creating a new column to binarize 
colstep['binary_life_satisfaction'] = colstep['life_satisfaction']

In [87]:
# Binarizing so that 0 = 5 or less, and 1 = 6 or more 
unhappy = [1,2,3,4,5]
happy = [6,7,8,9,10]

for number in unhappy: 
    colstep['binary_life_satisfaction'].replace(number,0, inplace=True)
    
for number in happy:
    colstep['binary_life_satisfaction'].replace(number,1, inplace=True)

In [88]:
# Confirming new values of y
colstep['binary_life_satisfaction'].value_counts()

1.0    2276
0.0     341
Name: binary_life_satisfaction, dtype: int64

In [89]:
# Setting y for as a binary variable 
yb = colstep['binary_life_satisfaction']

In [90]:
# Setting the X values. Excluding life satisfaction and other features that have dummies. 
Xb = colstep.drop(columns=['life_satisfaction','binary_life_satisfaction', 'in_school', 'owns_house', 'house_beds',
                          'house_kitchen','reported_social_status', 'got_pr_transf', 
                          'got_pu_transf', 'part_in_training','life_satisfaction', 
                          'offdays_ill', 'healthinsurance','speak_other_languaje', 
                          'lives_w_mother', 'lives_w_father','read_overall', 
                          'write_overall', 'numeracy_overall', 'supervise', 
                          'computer_use_overall', 'think_learn_work', 'autonomy_at_work',
                          'repetitiveness_at_work', 'physical_demand_work', 
                          'has_children', 'hh_size', 'gender', 'has_spouse',
                          'chronic_disease', 'shocks_bef_15', 'mother_tongue',
                          'labor_market_status', 'job_stable', 'country'])

In [91]:
# Splitting into train and test 
Xb_train, Xb_test, yb_train, yb_test = train_test_split(Xb, yb, stratify=y, random_state = 42)

In [92]:
# Making sure the shapes are identical 
Xb_train.shape

(1962, 81)

In [93]:
Xb_test.shape

(655, 81)

In [94]:
yb_train.shape

(1962,)

In [95]:
yb_test.shape

(655,)

In [96]:
ss.fit(Xb_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [97]:
# Transforming X train 
Xb_train_scaled = ss.transform(Xb_train)

In [98]:
# Transforming the array into a list so I can convert it into a dataframe and concatenate it to the dataframe with the unscaled features 
Xb_train_scaled = Xb_train_scaled.tolist()

In [99]:
# Transforming the list into a dataframe 
Xb_train_scaled = pd.DataFrame(Xb_train_scaled)

In [100]:
# Looking at the resulting dataframe 
Xb_train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,-0.748998,0.487166,-0.603607,1.544888,0.653568,-2.077228,-2.126707,-1.026783,-1.187047,-0.563959,...,-1.402936,-0.787495,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
1,0.346443,-1.019644,-1.261905,-0.375131,0.197720,1.385898,0.047088,-0.600153,0.448477,-0.563959,...,0.712791,1.269849,-0.549476,-0.308003,1.347190,-0.433187,-0.283543,-0.075088,0.0,-0.664520
2,-0.201278,0.487166,0.054690,-0.375131,1.565264,0.808710,-0.496361,1.532998,-0.369285,-0.563959,...,-1.402936,-0.787495,-0.549476,-0.308003,-0.742286,2.308470,-0.283543,-0.075088,0.0,-0.664520
3,-0.748998,-0.015104,1.371286,0.264875,-1.625672,-0.345665,0.590536,0.253108,3.719524,2.292723,...,-1.402936,-0.787495,-0.549476,3.246718,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
4,1.441884,-0.517374,0.712988,-0.375131,0.653568,-0.922853,0.590536,-2.306673,-1.187047,-0.563959,...,0.712791,1.269849,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,-0.201278,0.989436,-0.603607,0.264875,0.653568,-0.345665,-0.496361,-1.026783,0.448477,2.292723,...,0.712791,1.269849,-0.549476,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
1958,0.346443,0.487166,-1.261905,0.264875,-0.713976,-1.500041,0.047088,-0.600153,0.448477,-0.563959,...,-1.402936,-0.787495,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,-0.664520
1959,1.441884,-0.015104,0.712988,0.264875,-1.169824,0.808710,1.133985,1.106368,-1.187047,-0.563959,...,-1.402936,-0.787495,-0.549476,3.246718,-0.742286,-0.433187,-0.283543,-0.075088,0.0,-0.664520
1960,0.894164,0.487166,0.054690,-0.375131,0.653568,-1.500041,-0.496361,-1.453413,-0.369285,-0.563959,...,-1.402936,-0.787495,-0.549476,-0.308003,1.347190,-0.433187,-0.283543,-0.075088,0.0,-0.664520


In [101]:
# Rename columns 
Xb_train_scaled.rename(columns = scalednames, inplace = True)

In [102]:
# Confirming 
Xb_train_scaled.head()

Unnamed: 0,ses_15,extraversion_av,conscientiousness_avg,openness_av,stability_av,agreeableness_av,grit_av,decision_av,hostile_av,risk,...,labor_market_status_1.0,job_stable_1.0,highest_ISCED_PIAAC_1,highest_ISCED_PIAAC_2,highest_ISCED_PIAAC_3,highest_ISCED_PIAAC_5,highest_ISCED_PIAAC_6,highest_ISCED_PIAAC_7,highest_ISCED_PIAAC_8,dropout_1.0
0,-0.748998,0.487166,-0.603607,1.544888,0.653568,-2.077228,-2.126707,-1.026783,-1.187047,-0.563959,...,-1.402936,-0.787495,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
1,0.346443,-1.019644,-1.261905,-0.375131,0.19772,1.385898,0.047088,-0.600153,0.448477,-0.563959,...,0.712791,1.269849,-0.549476,-0.308003,1.34719,-0.433187,-0.283543,-0.075088,0.0,-0.66452
2,-0.201278,0.487166,0.05469,-0.375131,1.565264,0.80871,-0.496361,1.532998,-0.369285,-0.563959,...,-1.402936,-0.787495,-0.549476,-0.308003,-0.742286,2.30847,-0.283543,-0.075088,0.0,-0.66452
3,-0.748998,-0.015104,1.371286,0.264875,-1.625672,-0.345665,0.590536,0.253108,3.719524,2.292723,...,-1.402936,-0.787495,-0.549476,3.246718,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
4,1.441884,-0.517374,0.712988,-0.375131,0.653568,-0.922853,0.590536,-2.306673,-1.187047,-0.563959,...,0.712791,1.269849,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845


In [103]:
# Confirming that the shape is intact
Xb_train.shape

(1962, 81)

In [104]:
# Transforming the X test data 
Xb_test.head()

Unnamed: 0,ses_15,extraversion_av,conscientiousness_avg,openness_av,stability_av,agreeableness_av,grit_av,decision_av,hostile_av,risk,...,labor_market_status_1.0,job_stable_1.0,highest_ISCED_PIAAC_1,highest_ISCED_PIAAC_2,highest_ISCED_PIAAC_3,highest_ISCED_PIAAC_5,highest_ISCED_PIAAC_6,highest_ISCED_PIAAC_7,highest_ISCED_PIAAC_8,dropout_1.0
43,4.0,3.0,3.333333,3.666667,1.666667,3.666667,3.666667,3.75,1.0,1.0,...,0,0,1,0,0,0,0,0,0,0
311,6.0,2.666667,3.0,3.333333,1.666667,3.333333,3.666667,3.75,1.0,1.0,...,1,1,0,0,0,1,0,0,0,1
294,8.0,3.666667,3.666667,4.0,3.333333,3.333333,2.666667,4.0,1.0,2.0,...,0,0,0,0,1,0,0,0,0,0
164,3.0,2.0,3.333333,1.666667,2.666667,2.333333,3.0,2.25,1.5,2.0,...,1,1,0,1,0,0,0,0,0,1
720,3.0,2.333333,3.0,3.333333,3.666667,3.0,3.333333,3.25,1.0,1.0,...,1,1,0,0,1,0,0,0,0,0


In [105]:
# Checking the shape of data 
Xb_test.shape

(655, 81)

In [106]:
# Transforming X test 
Xb_test_transf = ss.transform(Xb_test)

In [107]:
# Looking at the result
Xb_test_transf

array([[-0.20127757, -0.01510394,  0.0546902 , ..., -0.07508751,
         0.        , -0.66452018],
       [ 0.89416374, -0.5173737 , -0.60360723, ..., -0.07508751,
         0.        ,  1.5048452 ],
       [ 1.98960506,  0.98943595,  0.7129881 , ..., -0.07508751,
         0.        , -0.66452018],
       ...,
       [-0.20127757,  0.98943595, -0.60360723, ..., -0.07508751,
         0.        ,  1.5048452 ],
       [-0.20127757,  0.98943595,  0.7129881 , ..., -0.07508751,
         0.        , -0.66452018],
       [ 0.89416374,  1.49170572,  0.7129881 , ..., -0.07508751,
         0.        , -0.66452018]])

In [108]:
# Transforming the array into a list so I can convert it into a dataframe and concatenate it to the dataframe with the unscaled features 
Xb_test_transf = Xb_test_transf.tolist()

In [109]:
# Transforming the list into a dataframe 
Xb_test_transf = pd.DataFrame(Xb_test_transf)

In [110]:
# Looking at the resulting dataframe 
Xb_test_transf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,-0.201278,-0.015104,0.054690,0.904882,-1.169824,0.808710,1.133985,1.106368,-1.187047,-0.563959,...,-1.402936,-0.787495,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,-0.664520
1,0.894164,-0.517374,-0.603607,0.264875,-1.169824,0.231522,1.133985,1.106368,-1.187047,-0.563959,...,0.712791,1.269849,-0.549476,-0.308003,-0.742286,2.308470,-0.283543,-0.075088,0.0,1.504845
2,1.989605,0.989436,0.712988,1.544888,1.109416,0.231522,-0.496361,1.532998,-1.187047,0.388268,...,-1.402936,-0.787495,-0.549476,-0.308003,1.347190,-0.433187,-0.283543,-0.075088,0.0,-0.664520
3,-0.748998,-1.521914,0.054690,-2.935157,0.197720,-1.500041,0.047088,-1.453413,-0.369285,0.388268,...,0.712791,1.269849,-0.549476,3.246718,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
4,-0.748998,-1.019644,-0.603607,0.264875,1.565264,-0.345665,0.590536,0.253108,-1.187047,-0.563959,...,0.712791,1.269849,-0.549476,-0.308003,1.347190,-0.433187,-0.283543,-0.075088,0.0,-0.664520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,0.346443,0.989436,-0.603607,0.264875,1.109416,-0.345665,0.047088,-0.173522,-1.187047,-0.563959,...,0.712791,-0.787495,-0.549476,-0.308003,-0.742286,2.308470,-0.283543,-0.075088,0.0,-0.664520
651,0.894164,1.491706,-0.603607,1.544888,-0.258128,-0.922853,0.047088,1.532998,-1.187047,-0.563959,...,-1.402936,-0.787495,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,-0.664520
652,-0.201278,0.989436,-0.603607,-0.375131,0.197720,-0.345665,1.133985,0.253108,1.266239,-0.563959,...,0.712791,-0.787495,-0.549476,-0.308003,-0.742286,2.308470,-0.283543,-0.075088,0.0,1.504845
653,-0.201278,0.989436,0.712988,1.544888,0.197720,-0.345665,0.047088,-0.173522,1.266239,-0.563959,...,-1.402936,-0.787495,-0.549476,-0.308003,1.347190,-0.433187,-0.283543,-0.075088,0.0,-0.664520


In [111]:
# Checking the shape of standardized test data 
Xb_test_transf.shape

(655, 81)

In [112]:
# Renaming columns 
Xb_test_transf.rename(columns = scalednames, inplace = True)

In [113]:
Xb_test_transf.head()

Unnamed: 0,ses_15,extraversion_av,conscientiousness_avg,openness_av,stability_av,agreeableness_av,grit_av,decision_av,hostile_av,risk,...,labor_market_status_1.0,job_stable_1.0,highest_ISCED_PIAAC_1,highest_ISCED_PIAAC_2,highest_ISCED_PIAAC_3,highest_ISCED_PIAAC_5,highest_ISCED_PIAAC_6,highest_ISCED_PIAAC_7,highest_ISCED_PIAAC_8,dropout_1.0
0,-0.201278,-0.015104,0.05469,0.904882,-1.169824,0.80871,1.133985,1.106368,-1.187047,-0.563959,...,-1.402936,-0.787495,1.819914,-0.308003,-0.742286,-0.433187,-0.283543,-0.075088,0.0,-0.66452
1,0.894164,-0.517374,-0.603607,0.264875,-1.169824,0.231522,1.133985,1.106368,-1.187047,-0.563959,...,0.712791,1.269849,-0.549476,-0.308003,-0.742286,2.30847,-0.283543,-0.075088,0.0,1.504845
2,1.989605,0.989436,0.712988,1.544888,1.109416,0.231522,-0.496361,1.532998,-1.187047,0.388268,...,-1.402936,-0.787495,-0.549476,-0.308003,1.34719,-0.433187,-0.283543,-0.075088,0.0,-0.66452
3,-0.748998,-1.521914,0.05469,-2.935157,0.19772,-1.500041,0.047088,-1.453413,-0.369285,0.388268,...,0.712791,1.269849,-0.549476,3.246718,-0.742286,-0.433187,-0.283543,-0.075088,0.0,1.504845
4,-0.748998,-1.019644,-0.603607,0.264875,1.565264,-0.345665,0.590536,0.253108,-1.187047,-0.563959,...,0.712791,1.269849,-0.549476,-0.308003,1.34719,-0.433187,-0.283543,-0.075088,0.0,-0.66452


**5.1. Logistic Regression:**

In [114]:
# Instantiating the model 
logregb = LogisticRegressionCV()

In [115]:
cross_val_score(logregb, Xb_train_scaled, yb_train).mean()



0.869521472607865

In [116]:
# Fitting the model 
logregb.fit(Xb_train_scaled, yb_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [117]:
# Making predictions 
yb_predlogreg = logregb.predict(Xb_test_transf)

In [118]:
# Evaluating the model on the train data 
logregb.score(Xb_train_scaled, yb_train)

0.8695208970438328

In [119]:
# Evaluating the model on the test data -- Accuracy 
logregb.score(Xb_test_transf, yb_test)

0.8702290076335878

In [120]:
# Evaluating the model on the test data -- precision 
precision_score(yb_test,yb_predlogreg)

0.8702290076335878

In [121]:
# Evaluating the model on the test data -- recall 
recall_score(yb_test,yb_predlogreg)

1.0

In [123]:
# Getting the coefficients of each feature
logreg_coeff = logregb.coef_

In [124]:
# Transforming array into list 
coefflist = logreg_coeff.tolist()

In [127]:
coefflist_dict = {'ses_15': 0.011730366364670643,'extraversion_av': 0.0020437756896333295, 'conscientiousness_avg':-0.0026939583472740027,
                  'openness_av': 0.003135005978507152, 'stability_av': 0.00926570123112178, 'agreeableness_av':  -0.0008441411283670881, 
                  'grit_av': 5.255796897318039e-05, 'decision_av':-0.0016908247549418751, 'hostile_av':-0.009853920217525734,
                  'risk': 0.00220078079397234, 'age': -0.006502148477732048, 'BMI':-0.0037880583162444924, 
                  'hour_earnings':0.003500988890624197, 'total_hr_worked_week': 0.0013463511751898912, 
                  'highest_ISCED_PIAAC':  0.009293666527996527, 'years_educ': 0.009958670999604175, 'wealth_index': 0.011178524323997867,
                  'overqualified': 0.0026306533329802147, 'dropout': -0.007608582541146908, 'in_school_1.0': 0.006482566391723629,
                  'owns_house_2': -0.0009692653638657233,  'owns_house_3': -0.0019928521987479116, 'house_beds_2': 0.002527790355866229, 
                  'house_beds_3': 0.0005969075311255434, 'house_kitchen_1':  0.0029800729408532116, 'reported_social_status_1': -0.0015416985595649232,
                  'reported_social_status_2': -0.0016307561987770715, 'reported_social_status_3':0.0018885169038590555,
                  'reported_social_status_4': 0.0023250996583301944, 'reported_social_status_5':  0.0008305073789931207, 
                  'reported_social_status_6': -0.0007967995852348005, 'got_pr_transf_1':-0.0012289222185559688, 
                  'got_pu_transf_1': 0.0010535802559155927, 'part_in_training_1.0': 0.0033922225553302122, 
                  'offdays_ill_1.0': -0.0026176515006336174, 'healthinsurance_1.0': 0.004566881724752285, 
                  'speak_other_languaje_1.0': 0.0023753513885591956, 'lives_w_mother_1.0': 0.006729690746989042,
                  'lives_w_father_1.0': 0.004693828177922808, 'read_overall_1.0':  -0.002663882333519459, 
                  'read_overall_2.0': 0.0024440361874283794, 'read_overall_3.0':  0.00503218921996437, 
                  'write_overall_1.0': -0.0012933571084133985, 'write_overall_2.0': 0.004445571202491618,
                  'write_overall_3.0': 0.0039504216913238644, 'numeracy_overall_1.0':  -0.003769936277938333, 
                  'numeracy_overall_2.0': -0.0005438287043212347, 'numeracy_overall_3.0': 0.0067292157622586765, 
                  'supervise_1.0': 0.0032848965402674114, 'computer_use_overall_1.0': 0.0025505222933598585, 
                  'computer_use_overall_2.0': -0.0006047571703609363, 'computer_use_overall_3.0': 0.007987156717592354, 
                  'think_learn_work_1.0': -0.0007807794558005746, 'think_learn_work_2.0': 4.5833322737041844e-05, 
                  'think_learn_work_3.0': 0.0029273844551001736, 'autonomy_at_work_1.0': -0.0017365228820458024,
                  'autonomy_at_work_2.0': 0.0004467482695593863, 'autonomy_at_work_3.0': 0.00031940306970755603,
                  'repetitiveness_at_work_1.0': 0.000967950409829372, 'repetitiveness_at_work_2.0': 0.0025198481990571877,
                  'repetitiveness_at_work_3.0': 0.0013568669344034062, 'physical_demand_work_1.0': 0.0015440604814668908,
                  'physical_demand_work_2.0': -0.0005777081861490973, 'physical_demand_work_3.0': -0.0027559987016972072,
                  'has_children_1.0': -0.005499922621748637, 'hh_size_2': -0.0010936739604645793, 'gender_1': -0.0033205423235886707, 
                  'has_spouse_1':  0.0005152335440568022, 'chronic_disease_1.0': -0.005203472884668551, 
                  'shocks_bef_15_1.0':  -0.006556304312036174, 'mother_tongue_1.0': 0.0009955575094859235, 
                  'labor_market_status_1.0': 0.002518678541096377, 'job_stable_1.0': 0.004317813459780939, 
                  'highest_ISCED_PIAAC_1':  -0.003928882843300594, 'highest_ISCED_PIAAC_2': 0.0020530105581655837,
                  'highest_ISCED_PIAAC_3': 0.0010743510358527451,'highest_ISCED_PIAAC_5': 0.004967646367368348, 
                  'highest_ISCED_PIAAC_6': 0.0034892689862388486, 'highest_ISCED_PIAAC_7': 0.0016496416839190389, 
                  'highest_ISCED_PIAAC_8': 0.0, 'dropout_1.0': -0.00760858254114690}                

In [152]:
for item, value in coefflist_dict.items(): 
    coeff = np.exp(value)
    if coeff >= 1.005: 
        print(f'As {item} increases by 1, people are {coeff} times as likely to rate their life satisfaction under the happy category.')
    else: 
        print(0)
        

As ses_15 increases by 1, people are 1.0117994369222985 times as likely to rate their life satisfaction under the happy category.
0
0
0
As stability_av increases by 1, people are 1.009308760730173 times as likely to rate their life satisfaction under the happy category.
0
0
0
0
0
0
0
0
0
As highest_ISCED_PIAAC increases by 1, people are 1.0093369867439776 times as likely to rate their life satisfaction under the happy category.
As years_educ increases by 1, people are 1.0100084235830273 times as likely to rate their life satisfaction under the happy category.
As wealth_index increases by 1, people are 1.0112412374893984 times as likely to rate their life satisfaction under the happy category.
0
0
As in_school_1.0 increases by 1, people are 1.0065036237024487 times as likely to rate their life satisfaction under the happy category.
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
As lives_w_mother_1.0 increases by 1, people are 1.0067523859978735 times as likely to rate their life satisfaction under t

**5.2. K-Nearest Neighbors**

In [248]:
# Instantiating model 
knnb = KNeighborsClassifier()

In [249]:
# Crossvalidating 
cross_val_score(knnb, Xb_train_scaled, yb_train).mean()



0.8552510702660845

In [250]:
# Fitting the model 
knnb.fit(Xb_train_scaled, yb_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [251]:
# Making predictions 
yb_predknn = knnb.predict(Xb_test_transf)

In [252]:
# Evaluating model 
knnb.score(Xb_train_scaled, yb_train)

0.8827726809378186

In [253]:
knnb.score(Xb_test_transf, yb_test)

0.8641221374045801

In [254]:
# Evaluating the model on the test data -- precision 
precision_score(yb_test,yb_predknn)

0.878740157480315

In [257]:
# Evaluating the model on the test data -- recall 
recall_score(yb_test,yb_predknn)

0.9789473684210527

**5.3. Decision Tree:**

In [267]:
# Instantiating model 
dec_treeb = DecisionTreeClassifier(random_state = 42, max_depth=1, min_samples_leaf=2, min_samples_split=2,)


In [259]:
# Crossvalidating 
cross_val_score(dec_treeb, Xb_train_scaled, yb_train).mean()



0.7716635562971871

In [274]:
# Fitting the model 
dec_treeb.fit(Xb_train_scaled, yb_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [275]:
# Evaluating the model 
dec_treeb.score(Xb_train_scaled, yb_train)

0.8695208970438328

In [276]:
dec_treeb.score(Xb_test_transf, yb_test)

0.8702290076335878

In [277]:
# Making predictions 
yb_preddt = dec_treeb.predict(Xb_test_transf)

In [280]:
# Evaluating the model on the test data -- precision 
precision_score(yb_test,yb_preddt)

0.8702290076335878

In [281]:
# Evaluating the model on the test data -- recall 
recall_score(yb_test,yb_preddt)

1.0

In [268]:
# Gridsearching for hyperparameter tunning 
gridb = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [1, 2 ,3, 4,5,6,7,8,9],
                                  'min_samples_split': [2, 6,7,8,9,10,15],
                                  'min_samples_leaf': [2, 3,4,5,6,7,8,9,10]},
                    cv = 5,
                    verbose = 1)

In [269]:
gridb.fit(Xb_train_scaled, yb_train)

Fitting 5 folds for each of 567 candidates, totalling 2835 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2835 out of 2835 | elapsed:   24.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9,

In [270]:
gridb.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [271]:
gridb.best_score_

0.8695208970438328

**5.4. Random Forest:**

In [296]:
# Instantiating the model 
rfb = RandomForestClassifier(max_depth = 2, min_samples_leaf=40, min_samples_split=2)


In [297]:
# Crossvalidating 
cross_val_score(rfb, Xb_train_scaled, yb_train).mean()



0.869521472607865

In [298]:
# Fitting the model 
rfb.fit(Xb_train_scaled, yb_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=40, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [299]:
# Evaluating the model 
rfb.score(Xb_train_scaled, yb_train)

0.8695208970438328

In [300]:
rfb.score(Xb_test_transf, yb_test)

0.8702290076335878

In [301]:
# Making predictions 
yb_predrf = rfb.predict(Xb_test_transf)

In [302]:
# Evaluating the model on the test data -- precision 
precision_score(yb_test,yb_predrf)

0.8702290076335878

In [303]:
# Evaluating the model on the test data -- recall 
recall_score(yb_test,yb_predrf)

1.0

In [290]:
# Gridsearching for hyperparameter tunning 
gridrfb = GridSearchCV(estimator = RandomForestClassifier(),
                    param_grid = {'max_depth': [2,3,4,5, 50, 100, 150],
                                  'min_samples_split': [2,3,4, 5, 40, 50, 100],
                                  'min_samples_leaf': [40, 45, 50, 55, 100, 150, 200]},
                    cv = 5,
                    verbose = 1)

In [304]:
#gridrfb.fit(Xb_train_scaled, yb_train)

In [294]:
gridrfb.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=40, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [295]:
gridrfb.best_score_

0.8695208970438328