In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [46]:
# Load the cleaned dataset (path is relative to the notebook's working
# directory) and preview the first rows.
data_path = './Data/data_cleaned.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,country,region,year,happiness_score,economy_(gdp_per_capita),family,health_(life_expectancy),freedom,trust_(government_corruption),generosity,dystopia_residual,food_supply_(kcal/capita/day),"crude_birth_rate_(births_per_1,000_population)",deaths_-_unsafe_water_source_per_100k_people,deaths_-_conflict_and_terrorism
0,0,Switzerland,Western Europe,2015,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,3460.0,10.364,0.030155,0.0
1,1,Iceland,Western Europe,2015,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,3548.0,12.933,0.0213,0.0
2,2,Denmark,Western Europe,2015,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,3315.0,10.411,0.071516,0.035288
3,3,Norway,Western Europe,2015,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,3435.0,11.397,0.065375,0.0
4,4,Canada,North America,2015,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,3438.0,10.71,0.068045,0.0


In [47]:
# Remove the 'Unnamed: 0' column; in the preview above it simply mirrors the
# row index (presumably an index saved into the CSV - confirm upstream).
df = df.drop('Unnamed: 0', axis=1)

In [48]:
# Count how many rows fall in each region (sorted by descending frequency).
region_counts = df['region'].value_counts()
region_counts

Sub-Saharan Africa                 117
Central and Eastern Europe          87
Latin America and Caribbean         68
Western Europe                      63
Middle East and Northern Africa     58
Southeastern Asia                   26
Southern Asia                       21
Eastern Asia                        18
North America                        6
Australia and New Zealand            6
Name: region, dtype: int64

In [49]:
# Region names listed in the order of their row counts (most frequent first);
# each one is assigned the integer code 1..10 by its position in this list.
# Despite its name, `country_to_numeric` maps *regions* (not countries) to codes.
_regions_by_frequency = [
    'Sub-Saharan Africa',
    'Central and Eastern Europe',
    'Latin America and Caribbean',
    'Western Europe',
    'Middle East and Northern Africa',
    'Southeastern Asia',
    'Southern Asia',
    'Eastern Asia',
    'North America',
    'Australia and New Zealand',
]
country_to_numeric = {
    region_name: code
    for code, region_name in enumerate(_regions_by_frequency, start=1)
}

# Swap each region label for its integer code; values that are not keys of the
# mapping are left unchanged by `replace`.
region_codes = df['region'].replace(country_to_numeric)
df['region'] = region_codes

In [50]:
# Inspect column dtypes to confirm that `region` is numeric after the encoding
# step and to see which columns are still non-numeric (`country`).
df.dtypes

country                                            object
region                                              int64
year                                                int64
happiness_score                                   float64
economy_(gdp_per_capita)                          float64
family                                            float64
health_(life_expectancy)                          float64
freedom                                           float64
trust_(government_corruption)                     float64
generosity                                        float64
dystopia_residual                                 float64
food_supply_(kcal/capita/day)                     float64
crude_birth_rate_(births_per_1,000_population)    float64
deaths_-_unsafe_water_source_per_100k_people      float64
deaths_-_conflict_and_terrorism                   float64
dtype: object

In [51]:
# Target: the integer region code.
y = df['region']
# Features: every remaining column except the country name and the target.
X = df.drop(['country', 'region'], axis=1)

In [52]:
# Build the polynomial feature expander (no constant/bias column) and fit it
# to X in one step. `fit` returns the estimator itself, so `poly` is the
# fitted transformer and displaying it shows the same repr as before.
poly = PolynomialFeatures(include_bias=False).fit(X)
poly

PolynomialFeatures(include_bias=False)

In [53]:
# Create X_poly: expand X with the fitted PolynomialFeatures transformer.
# The result is a NumPy array (column names are not carried over).
X_poly = poly.transform(X)
# Bare name as the last expression so the notebook displays the array.
X_poly

array([[2.01500000e+03, 7.58700000e+00, 1.39651000e+00, ...,
        9.09347787e-04, 0.00000000e+00, 0.00000000e+00],
       [2.01500000e+03, 7.56100000e+00, 1.30232000e+00, ...,
        4.53702482e-04, 0.00000000e+00, 0.00000000e+00],
       [2.01500000e+03, 7.52700000e+00, 1.32548000e+00, ...,
        5.11451980e-03, 2.52364419e-03, 1.24523518e-03],
       ...,
       [2.01700000e+03, 3.34899998e+00, 5.11135876e-01, ...,
        1.67796819e+03, 3.26349282e+00, 6.34719147e-03],
       [2.01700000e+03, 3.46199989e+00, 7.77153134e-01, ...,
        5.13788301e-02, 5.17295652e+01, 5.20826946e+04],
       [2.01700000e+03, 2.69300008e+00, 0.00000000e+00, ...,
        2.25926389e+04, 6.10988891e+03, 1.65234095e+03]])

In [54]:
# List the names of the generated polynomial features.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is its replacement. Prefer the new method and fall
# back to the old one only on installs that predate it.
if hasattr(poly, 'get_feature_names_out'):
    # The new API returns an ndarray; convert to a list so the display matches
    # the plain-list output of the legacy method.
    feature_names = list(poly.get_feature_names_out(X.columns))
else:
    feature_names = poly.get_feature_names(X.columns)
feature_names

['year',
 'happiness_score',
 'economy_(gdp_per_capita)',
 'family',
 'health_(life_expectancy)',
 'freedom',
 'trust_(government_corruption)',
 'generosity',
 'dystopia_residual',
 'food_supply_(kcal/capita/day)',
 'crude_birth_rate_(births_per_1,000_population)',
 'deaths_-_unsafe_water_source_per_100k_people',
 'deaths_-_conflict_and_terrorism',
 'year^2',
 'year happiness_score',
 'year economy_(gdp_per_capita)',
 'year family',
 'year health_(life_expectancy)',
 'year freedom',
 'year trust_(government_corruption)',
 'year generosity',
 'year dystopia_residual',
 'year food_supply_(kcal/capita/day)',
 'year crude_birth_rate_(births_per_1,000_population)',
 'year deaths_-_unsafe_water_source_per_100k_people',
 'year deaths_-_conflict_and_terrorism',
 'happiness_score^2',
 'happiness_score economy_(gdp_per_capita)',
 'happiness_score family',
 'happiness_score health_(life_expectancy)',
 'happiness_score freedom',
 'happiness_score trust_(government_corruption)',
 'happiness_score g

In [55]:
# View X_poly in a DataFrame, labelling each column with its polynomial term.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new method when available and fall back to
# the legacy one on older installs.
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = poly.get_feature_names_out(X.columns)
else:
    poly_columns = poly.get_feature_names(X.columns)
pd.DataFrame(X_poly, columns=poly_columns)

Unnamed: 0,year,happiness_score,economy_(gdp_per_capita),family,health_(life_expectancy),freedom,trust_(government_corruption),generosity,dystopia_residual,food_supply_(kcal/capita/day),...,food_supply_(kcal/capita/day)^2,"food_supply_(kcal/capita/day) crude_birth_rate_(births_per_1,000_population)",food_supply_(kcal/capita/day) deaths_-_unsafe_water_source_per_100k_people,food_supply_(kcal/capita/day) deaths_-_conflict_and_terrorism,"crude_birth_rate_(births_per_1,000_population)^2","crude_birth_rate_(births_per_1,000_population) deaths_-_unsafe_water_source_per_100k_people","crude_birth_rate_(births_per_1,000_population) deaths_-_conflict_and_terrorism",deaths_-_unsafe_water_source_per_100k_people^2,deaths_-_unsafe_water_source_per_100k_people deaths_-_conflict_and_terrorism,deaths_-_conflict_and_terrorism^2
0,2015.0,7.587,1.396510,1.349510,0.941430,0.665570,0.419780,0.296780,2.517380,3460.000000,...,1.197160e+07,35859.440000,104.337663,0.000000,107.412496,0.312531,0.000000,0.000909,0.000000,0.000000
1,2015.0,7.561,1.302320,1.402230,0.947840,0.628770,0.141450,0.436300,2.702010,3548.000000,...,1.258830e+07,45886.284000,75.573440,0.000000,167.262489,0.275477,0.000000,0.000454,0.000000,0.000000
2,2015.0,7.527,1.325480,1.360580,0.874640,0.649380,0.483570,0.341390,2.492040,3315.000000,...,1.098922e+07,34512.465000,237.075112,116.979355,108.388921,0.744552,0.367382,0.005115,0.002524,0.001245
3,2015.0,7.522,1.459000,1.330950,0.885210,0.669730,0.365030,0.346990,2.465310,3435.000000,...,1.179922e+07,39148.695000,224.562651,0.000000,129.891609,0.745077,0.000000,0.004274,0.000000,0.000000
4,2015.0,7.427,1.326290,1.322610,0.905630,0.632970,0.329570,0.458110,2.451760,3438.000000,...,1.181984e+07,36820.980000,233.939662,0.000000,114.704100,0.728765,0.000000,0.004630,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,2017.0,3.471,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061,2215.000000,...,4.906225e+06,71017.330000,89938.284527,70.574146,1027.971844,1301.851593,1.021557,1648.700380,1.293727,0.001015
466,2016.0,3.069,0.747190,0.148660,0.629940,0.069120,0.172330,0.483970,0.817890,3136.093023,...,9.835079e+06,76912.681395,721.336946,967201.307764,601.475625,5.641028,7563.746323,0.052905,70.937712,95116.503578
467,2017.0,3.349,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130,2396.000000,...,5.740816e+06,88831.700000,98147.372104,190.887554,1374.555625,1518.703598,2.953738,1677.968193,3.263493,0.006347
468,2017.0,3.462,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574,3136.093023,...,9.835079e+06,75626.883256,710.855032,715707.649401,581.533225,5.466123,5503.436868,0.051379,51.729565,52082.694602


In [56]:
# Hold out a test set with a fixed seed for reproducibility. Stratify on the
# target `y` so every region code keeps its proportion in both splits.
# Python names are case-sensitive: the target is the lowercase `y`; an
# uppercase `Y` is never defined in this notebook and raises NameError on a
# fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=42, stratify=y)

In [57]:
# Ordinary least-squares linear regression with default settings.
# NOTE(review): the target `y` is an integer-coded region label, i.e. a
# category rather than a quantity; a classifier may be more appropriate than
# a regressor here - confirm the modelling intent.
lr = LinearRegression()

In [59]:
# Fit the model on the training split only; the test split stays unseen.
lr.fit(X_train, y_train)

LinearRegression()

In [60]:
# Score the fitted model on the training and test splits, then take the mean
# cross-validated score on the training split (default CV and scoring).
# A held-out or cross-validated score far below the training score suggests
# the model does not generalise.
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
cv_score = cross_val_score(lr, X_train, y_train).mean()
train_score, test_score, cv_score

(0.6952908754968175, -11.984025553206264, -5.168356178143373)