## Imports

In [114]:
# Data
import numpy as np
import pandas as pd

# Visualisations
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## Data Frames

In [115]:
df_2015 = pd.read_csv('data/2015.csv')
df_2016 = pd.read_csv('data/2016.csv')
df_2017 = pd.read_csv('data/2017.csv')

In [155]:
df_2015.head(3)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204


In [156]:
df_2016.head(3)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137


In [157]:
df_2017.head(3)

Unnamed: 0,Country,Region,Happiness.Rank,Happiness.Score,Whisker.low,Whisker.high,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,Western Europe,1,7.537,7.479556,7.594445,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,Western Europe,2,7.522,7.462272,7.581728,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,Western Europe,3,7.504,7.38597,7.62203,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715


### Adding `Region` back in to `df_2017`

In [118]:
def region_dict(pd_data_frame):
    
    """ This function captures the region of a country from previous data frames """
    
    output = dict()
    
    for _, row in pd_data_frame.iterrows():
        if row["Country"] not in output:
            output[row["Country"]] = row["Region"]
    
    return output

In [119]:
dict_1 = region_dict(df_2015)
dict_2 = region_dict(df_2016)

In [120]:
def join_dicts(dict_1, dict_2):
    
    """ This fucntion will join 2 dictionaries and remove duplicates """
    
    for key, value in dict_2.items():
        if key not in dict_1:
            dict_1[key] = value
    
    return dict_1

dict_3 = join_dicts(dict_1, dict_2)

In [132]:
def add_region(pd_data_frame):
    
    """ This function will add the region back into the dataframe """
    # set regions series     
    regions = []
    for country in pd_data_frame["Country"]:
        try: 
            regions.append(dict_3[country])
        except:
            regions.append("N/A")
            
    # set region column to regions series    
    pd_data_frame['Region'] = regions
    return pd_data_frame

df_2017 = add_region(df_2017)

df_2017.head(2)

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual,Region
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,Western Europe
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,Western Europe


### I don't like the way `df_2017` is formatted!

In [145]:
df_2017 = df_2017[[
    "Country","Region","Happiness.Rank","Happiness.Score","Whisker.low","Whisker.high","Economy..GDP.per.Capita.",
    "Family","Health..Life.Expectancy.","Freedom","Generosity","Trust..Government.Corruption.","Dystopia.Residual"
                  ]]

In [146]:
df_2017.head()

Unnamed: 0,Country,Region,Happiness.Rank,Happiness.Score,Whisker.low,Whisker.high,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,Western Europe,1,7.537,7.479556,7.594445,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,Western Europe,2,7.522,7.462272,7.581728,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,Western Europe,3,7.504,7.38597,7.62203,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,Western Europe,4,7.494,7.426227,7.561772,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,Western Europe,5,7.469,7.410458,7.527542,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [147]:
df_2017.columns = df_2015.columns

ValueError: Length mismatch: Expected axis has 13 elements, new values have 12 elements

In [144]:
df_2017.head()

Unnamed: 0,Country,Region,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,Western Europe,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,Western Europe,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,Western Europe,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,Western Europe,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,Western Europe,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


I will leave these dataframes unmerged for now

## The Problem

What can I estimate with this data?
- This could be a regression model; features like `Happiness Rank` of different years can be used to predict values in the future with a regression model.
- This could also be a classification model; one could use several features to guess a country.
- Data can be clustered; one could see if region affects feature values.

## Data Normalisation

In [83]:
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(housing_features)
print(X_minmax)

NameError: name 'housing_features' is not defined

## Data Scaling

## Splitting Data

## Training Data