# Predicting Happiness 

## Regression analysis with 14 basic varaibles 

### 1. Imports:

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

### 2. Dataset 

In [2]:
# Reading data in 
col = pd.read_csv('STEPColombia.csv') 

In [3]:
# Taking a first look at the data
col.head()

Unnamed: 0,in_school,house_type,owns_house,house_beds,house_kitchen,reported_social_status,got_pr_transf,job_sector,part_in_training,life_satisfaction,...,highest_ISCED_PIAAC,years_educ,wealth_index,overqualified,country,worked_lastweek,dropout,got_pu_transf,present,supervise
0,2.0,3,6,5,1,3,2,999.0,2.0,9.0,...,3,11.0,-0.172015,0.0,Colombia,2.0,0.0,1,0.0,0.0
1,2.0,3,6,2,1,3,2,799.0,1.0,9.0,...,5,13.0,0.303367,1.0,Colombia,2.0,0.0,0,1.0,1.0
2,2.0,1,6,4,1,3,2,999.0,2.0,5.0,...,1,8.0,0.25764,0.0,Colombia,2.0,1.0,1,0.0,0.0
3,2.0,3,6,3,1,3,2,466.0,2.0,7.0,...,3,14.5,0.303686,0.0,Colombia,2.0,1.0,0,0.0,0.0
4,2.0,1,1,2,1,3,2,477.0,2.0,7.0,...,3,11.0,1.061927,1.0,Colombia,2.0,0.0,0,0.0,1.0


### 3.  Data subset

In [4]:
# Selecting the columns I want to use in this first module. In addition to Y (life satisfaction), I am selecting 2 variables 
# per category
col1 = col[['reported_social_status', 'part_in_training', 'life_satisfaction', 'extraversion_av', 'agreeableness_av', 'has_children', 
     'gender', 'age', 'has_spouse', 'chronic_disease', 'BMI', 'hour_earnings', 'total_hr_worked_week', 'years_educ', 
     'wealth_index']]

In [5]:
col1.head()

Unnamed: 0,reported_social_status,part_in_training,life_satisfaction,extraversion_av,agreeableness_av,has_children,gender,age,has_spouse,chronic_disease,BMI,hour_earnings,total_hr_worked_week,years_educ,wealth_index
0,3,2.0,9.0,3.666667,4.0,1.0,1,58,0,0.0,31.1,0.0,0,11.0,-0.172015
1,3,1.0,9.0,3.666667,3.666667,1.0,0,36,1,0.0,24.700001,45833.332031,24,13.0,0.303367
2,3,2.0,5.0,3.666667,3.0,1.0,1,61,0,1.0,29.299999,0.0,0,8.0,0.25764
3,3,2.0,7.0,3.666667,4.0,1.0,1,39,1,0.0,27.299999,0.0,0,14.5,0.303686
4,3,2.0,7.0,1.333333,2.0,1.0,0,23,1,0.0,24.5,4980.842773,30,11.0,1.061927


### 4. Variable adjustment 

In [6]:
# The variable 'part_in_training' is binary (1=yes,2=no). I think it is more convenient to set 'no' as zero. 
col1['part_in_training'].value_counts()

2.0    2212
1.0     405
Name: part_in_training, dtype: int64

In [7]:
# Setting 'no' as zero
col1['part_in_training'].replace(2,0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [8]:
# Reported social status is grouped in 7 categories. I need to create dummies so that this column is not taken as an integer in the model 
col1['reported_social_status'].value_counts()

2    1288
3     783
1     328
4     160
5      46
7       6
6       6
Name: reported_social_status, dtype: int64

In [9]:
# Creating dummies 
sesdummies = pd.get_dummies(col1['reported_social_status'], prefix = 'ReportedSES', drop_first=True)

In [10]:
# Concatenating the dummies to the data frame
col1 = pd.concat([col1, sesdummies], axis = 1)

In [11]:
# Creating a dummy for training column 
traindummy = pd.get_dummies(col1['part_in_training'], prefix= 'part_train', drop_first=True)

In [12]:
traindummy.head()

Unnamed: 0,part_train_1.0
0,0
1,1
2,0
3,0
4,0


In [13]:
# Concatenating the dummies to the data frame
col1 = pd.concat([col1, traindummy], axis = 1)

In [14]:
# Looking at the distribution of values for different columns 
col1['has_children'].value_counts()

1.0    1429
0.0    1188
Name: has_children, dtype: int64

In [15]:
# Creating a dummy for the children column 
childrendummy = pd.get_dummies(col1['has_children'], prefix= 'has_children', drop_first=True)
col1 = pd.concat([col1, childrendummy], axis = 1)

In [16]:
# Creating a dummy for the gender column 
genderdummy = pd.get_dummies(col1['gender'], prefix= 'gender', drop_first=True)
col1 = pd.concat([col1, genderdummy], axis = 1)

In [17]:
# Creating a dummy for the married column 
marrieddummy = pd.get_dummies(col1['has_spouse'], prefix= 'has_spouse', drop_first=True)
col1 = pd.concat([col1, marrieddummy], axis = 1)

In [18]:
# Creating a dummy for the chronic illness column 
illdummy =  pd.get_dummies(col1['chronic_disease'], prefix= 'chronic_disease', drop_first=True)
col1 = pd.concat([col1, illdummy], axis = 1)

In [19]:
# Confirming that all dummies have been concatenated
col1.head()

Unnamed: 0,reported_social_status,part_in_training,life_satisfaction,extraversion_av,agreeableness_av,has_children,gender,age,has_spouse,chronic_disease,...,ReportedSES_3,ReportedSES_4,ReportedSES_5,ReportedSES_6,ReportedSES_7,part_train_1.0,has_children_1.0,gender_1,has_spouse_1,chronic_disease_1.0
0,3,0.0,9.0,3.666667,4.0,1.0,1,58,0,0.0,...,1,0,0,0,0,0,1,1,0,0
1,3,1.0,9.0,3.666667,3.666667,1.0,0,36,1,0.0,...,1,0,0,0,0,1,1,0,1,0
2,3,0.0,5.0,3.666667,3.0,1.0,1,61,0,1.0,...,1,0,0,0,0,0,1,1,0,1
3,3,0.0,7.0,3.666667,4.0,1.0,1,39,1,0.0,...,1,0,0,0,0,0,1,1,1,0
4,3,0.0,7.0,1.333333,2.0,1.0,0,23,1,0.0,...,1,0,0,0,0,0,1,0,1,0


In [20]:
# Dropping the columns that have dummies 
col1.drop(columns = ['reported_social_status', 'part_in_training', 'has_children', 'gender', 'has_spouse', 
                     'chronic_disease' ], inplace=True)

In [21]:
# Looking at the resulting dataframe 
col1.head()

Unnamed: 0,life_satisfaction,extraversion_av,agreeableness_av,age,BMI,hour_earnings,total_hr_worked_week,years_educ,wealth_index,ReportedSES_2,ReportedSES_3,ReportedSES_4,ReportedSES_5,ReportedSES_6,ReportedSES_7,part_train_1.0,has_children_1.0,gender_1,has_spouse_1,chronic_disease_1.0
0,9.0,3.666667,4.0,58,31.1,0.0,0,11.0,-0.172015,0,1,0,0,0,0,0,1,1,0,0
1,9.0,3.666667,3.666667,36,24.700001,45833.332031,24,13.0,0.303367,0,1,0,0,0,0,1,1,0,1,0
2,5.0,3.666667,3.0,61,29.299999,0.0,0,8.0,0.25764,0,1,0,0,0,0,0,1,1,0,1
3,7.0,3.666667,4.0,39,27.299999,0.0,0,14.5,0.303686,0,1,0,0,0,0,0,1,1,1,0
4,7.0,1.333333,2.0,23,24.5,4980.842773,30,11.0,1.061927,0,1,0,0,0,0,0,1,0,1,0


### 5. Variable standardization

In [22]:
# Setting the dependent variable  
y = col1['life_satisfaction']

In [23]:
y.shape

(2617,)

In [24]:
# Setting the independent variables 
X = col1.drop(columns=['life_satisfaction'])

In [25]:
X.shape

(2617, 19)

In [26]:
# Splitting into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [27]:
# Making sure the shapes are identical 
X_train.shape

(1962, 19)

In [28]:
X_test.shape

(655, 19)

In [29]:
y_train.shape

(1962,)

In [30]:
y_test.shape

(655,)

In [31]:
# Given that some of these features are in different units, they must be standardized. Below I am instantiating the Standard Scaler 
ss = StandardScaler()

In [32]:
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [33]:
# Transforming X train 
X_train_scaled = ss.transform(X_train)

In [34]:
# Transforming the array into a list so I can convert it into a dataframe and concatenate it to the dataframe with the unscaled features 
X_train_scaled = X_train_scaled.tolist()

In [35]:
# Transforming the list into a dataframe 
X_train_scaled = pd.DataFrame(X_train_scaled)

In [36]:
# Looking at the resulting dataframe 
X_train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.477655,-0.903208,0.917563,0.023225,-0.116611,0.203679,-1.524353,-1.143643,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,0.824497,-0.918687,-0.482894
1,1.477655,-2.622018,-1.400678,-1.848076,-0.336530,-1.030239,-0.236334,-0.809716,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,-1.212861,-0.918687,2.070848
2,0.986189,0.242666,-0.531337,0.047528,-0.055662,1.437596,0.536478,-0.152077,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,-1.212861,-0.918687,-0.482894
3,0.003256,1.388539,0.410448,-0.317011,-0.057966,-0.618933,-1.781957,-0.547738,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,0.824497,1.088511,-0.482894
4,0.003256,-0.903208,-0.603782,-0.535735,-0.043304,-0.618933,0.278874,-0.131244,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,-1.212861,-0.918687,-0.482894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,0.003256,0.242666,1.859348,-1.167603,0.339106,-0.824586,-1.266749,0.613900,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,-1.212861,-0.918687,2.070848
1958,0.003256,-0.903208,-1.110898,2.016039,-0.179445,1.231944,0.407676,-0.980744,-0.970863,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,0.824497,1.088511,-0.482894
1959,-1.962608,-0.903208,-0.314002,2.356275,-0.159584,0.614985,-1.009145,-2.597106,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,-1.212861,1.088511,-0.482894
1960,0.986189,1.388539,0.627783,3.984550,-0.153264,1.437596,-1.266749,-0.019336,-0.970863,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,-1.212861,1.088511,-0.482894


In [37]:
# Looking at the column names to create a dictionary
X_train.columns

Index(['extraversion_av', 'agreeableness_av', 'age', 'BMI', 'hour_earnings',
       'total_hr_worked_week', 'years_educ', 'wealth_index', 'ReportedSES_2',
       'ReportedSES_3', 'ReportedSES_4', 'ReportedSES_5', 'ReportedSES_6',
       'ReportedSES_7', 'part_train_1.0', 'has_children_1.0', 'gender_1',
       'has_spouse_1', 'chronic_disease_1.0'],
      dtype='object')

In [38]:
# Changing the column names 
scalednames = {0:'extraversion_av_sc', 1:'agreeableness_av_sc', 2:'age_sc', 3:'BMI_sc', 4:'hour_earnings_sc', 
               5:'total_hr_worked_week_sc', 6:'years_educ_sc', 7:'wealth_index_sc', 8:'ReportedSES_2_sc', 9:'ReportedSES_3_sc', 10:'ReportedSES_4_sc',
               11:'ReportedSES_5', 12:'ReportedSES_6', 13:'ReportedSES_7_sc', 14:'part_train_1.0_sc', 15: 'has_children_1.0', 16: 'gender_1', 17: 'has_spouse_1', 
               18: 'chronic_disease_1.0'}

In [39]:
# Rename columns 
X_train_scaled.rename(columns = scalednames, inplace = True)

In [40]:
# Confirming 
X_train_scaled.head()

Unnamed: 0,extraversion_av_sc,agreeableness_av_sc,age_sc,BMI_sc,hour_earnings_sc,total_hr_worked_week_sc,years_educ_sc,wealth_index_sc,ReportedSES_2_sc,ReportedSES_3_sc,ReportedSES_4_sc,ReportedSES_5,ReportedSES_6,ReportedSES_7_sc,part_train_1.0_sc,has_children_1.0,gender_1,has_spouse_1,chronic_disease_1.0
0,1.477655,-0.903208,0.917563,0.023225,-0.116611,0.203679,-1.524353,-1.143643,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,0.824497,-0.918687,-0.482894
1,1.477655,-2.622018,-1.400678,-1.848076,-0.33653,-1.030239,-0.236334,-0.809716,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,-1.212861,-0.918687,2.070848
2,0.986189,0.242666,-0.531337,0.047528,-0.055662,1.437596,0.536478,-0.152077,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,-1.212861,-0.918687,-0.482894
3,0.003256,1.388539,0.410448,-0.317011,-0.057966,-0.618933,-1.781957,-0.547738,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,0.824497,1.088511,-0.482894
4,0.003256,-0.903208,-0.603782,-0.535735,-0.043304,-0.618933,0.278874,-0.131244,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,-1.212861,-0.918687,-0.482894


In [41]:
# Confirming that the shape is intact
X_train.shape

(1962, 19)

In [42]:
# Transforming the X test data 
X_test.head()

Unnamed: 0,extraversion_av,agreeableness_av,age,BMI,hour_earnings,total_hr_worked_week,years_educ,wealth_index,ReportedSES_2,ReportedSES_3,ReportedSES_4,ReportedSES_5,ReportedSES_6,ReportedSES_7,part_train_1.0,has_children_1.0,gender_1,has_spouse_1,chronic_disease_1.0
2576,3.333333,4.0,64,24.0,0.0,0,5.0,-0.41013,0,0,0,0,0,0,0,0,1,0,1
1033,2.333333,2.333333,48,24.4,2944.188477,72,11.0,0.126483,0,1,0,0,0,0,0,0,1,1,0
2217,3.666667,3.0,49,28.700001,2688.172119,60,8.0,1.02861,1,0,0,0,0,0,1,1,0,1,0
1124,1.666667,3.0,25,23.5,0.0,0,12.0,1.088199,0,0,1,0,0,0,0,0,1,0,0
1945,2.333333,3.333333,38,23.299999,9578.543945,48,11.0,1.050043,1,0,0,0,0,0,1,1,0,1,0


In [43]:
# Checking the shape of data 
X_test.shape

(655, 19)

In [44]:
# Transforming X test 
X_test_transf = ss.transform(X_test)

In [45]:
# Looking at the result
X_test_transf

array([[ 0.49472239,  1.38853945,  2.00423846, ...,  0.82449653,
        -0.91868654,  2.07084758],
       [-0.97967592, -1.47614459,  0.84511809, ...,  0.82449653,
         1.08851056, -0.48289406],
       [ 0.98618872, -0.33027089,  0.91756311, ..., -1.21286139,
         1.08851056, -0.48289406],
       ...,
       [ 0.49472239, -0.90320754,  0.41044795, ..., -1.21286139,
         1.08851056, -0.48289406],
       [ 0.49472239, -0.33027089,  0.62778302, ...,  0.82449653,
         1.08851056,  2.07084758],
       [-0.48820959, -1.47614459, -1.11089753, ..., -1.21286139,
        -0.91868654, -0.48289406]])

In [46]:
# Transforming the array into a list so I can convert it into a dataframe and concatenate it to the dataframe with the unscaled features 
X_test_transf = X_test_transf.tolist()

In [47]:
# Transforming the list into a dataframe 
X_test_transf = pd.DataFrame(X_test_transf)

In [48]:
# Looking at the resulting dataframe 
X_test_transf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.494722,1.388539,2.004238,-0.098288,-0.336530,-1.030239,-1.266749,-0.302372,-0.970863,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,0.824497,-0.918687,2.070848
1,-0.979676,-1.476145,0.845118,-0.001078,-0.077536,1.437596,0.278874,0.253140,-0.970863,1.524934,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,0.824497,1.088511,-0.482894
2,0.986189,-0.330271,0.917563,1.043935,-0.100058,1.026291,-0.493938,1.187040,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,2.282293,0.899052,-1.212861,1.088511,-0.482894
3,-1.962608,-0.330271,-0.821117,-0.219801,-0.336530,-1.030239,0.536478,1.248729,-0.970863,-0.655766,4.007601,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,0.824497,-0.918687,-0.482894
4,-0.979676,0.242666,0.120668,-0.268407,0.506073,0.614985,0.278874,1.209229,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,2.282293,0.899052,-1.212861,1.088511,-0.482894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,0.494722,-0.330271,-0.966007,-1.532142,-0.336530,-1.030239,0.278874,1.064355,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,0.824497,-0.918687,-0.482894
651,0.986189,-0.903208,-1.545568,-0.973182,-0.336530,-1.030239,-0.236334,0.829040,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,-1.112282,-1.212861,-0.918687,-0.482894
652,0.494722,-0.903208,0.410448,0.387764,0.030002,-0.618933,-1.266749,-1.377150,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,-1.212861,1.088511,-0.482894
653,0.494722,-0.330271,0.627783,0.071830,-0.336530,-1.030239,-0.493938,-0.539199,1.030012,-0.655766,-0.249526,-0.13477,-0.050546,-0.045198,-0.438156,0.899052,0.824497,1.088511,2.070848


In [49]:
# Checking the shape of standardized test data 
X_test_transf.shape

(655, 19)

In [50]:
# Renaming columns 
X_test_transf.rename(columns = scalednames, inplace = True)

### 5. Logistic regression set up

In [64]:
# Instantiating the model 
logreg = LogisticRegressionCV(multi_class='multinomial', solver = 'newton-cg' )

In [65]:
# Fitting the model 
logreg.fit(X_train_scaled, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='multinomial', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='newton-cg', tol=0.0001, verbose=0)

In [66]:
# Making predictions 
predicted_happiness = logreg.predict(X_test_transf)

### 6. Model evaluation

In [67]:
# Evaluating the model on the train data 
logreg.score(X_train_scaled, y_train)

0.29561671763506625

In [68]:
# Evaluating the model on the test data 
logreg.score(X_test_transf, y_test)

0.27938931297709924