In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import chart_studio.plotly as py 
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [53]:
df = pd.read_csv('happiness.csv')

In [54]:
df.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [55]:
df.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [57]:
df.isnull().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [58]:
#checking out the worldwide distribution of happiness
data6 = dict(type = 'choropleth', 
           locations = df['Country'],
           locationmode = 'country names',
           text = df['Country'],
              z = df['Happiness Rank'], 
          colorscale = 'Viridis', reversescale = False)
layout = dict(title = 'Happiness Rank Across the World', 
             geo = dict(showframe = False, 
                       projection = {'type': 'mercator'}))
choromap6 = go.Figure(data = [data6], layout=layout)
iplot(choromap6)
fig.show("notebook")

NameError: name 'fig' is not defined

In [None]:
sns.lmplot(x='Happiness Rank', y='Economy (GDP per Capita)', data=df)

In [None]:
sns.lmplot(x='Happiness Rank', y='Family', data=df)

In [None]:
sns.lmplot(x='Happiness Rank', y='Health (Life Expectancy)', data=df)

In [None]:
sns.lmplot(x='Happiness Rank', y='Freedom', data=df)

In [None]:
sns.lmplot(x='Happiness Rank', y='Trust (Government Corruption)', data=df)

In [None]:
sns.lmplot(x='Happiness Rank', y='Generosity', data=df)

## we can infer from the above regression plots that happiness score/rank has a strong linear relationship with the attributes.

In [None]:
above_mean = ['above mean HS' if i>=5.375734 else 'below mean HS' for i in df['Happiness Score']]
sd = pd.DataFrame({'Happiness Score':above_mean})
sd.head()

In [None]:
sns.countplot(sd['Happiness Score'])

So, we can see that the distribution of countries is balanced between above and below mean

In [None]:
df.skew()

In [None]:
sns.lmplot(x='Happiness Rank', y='Dystopia Residual', data=df);

In [None]:
#feature engineering

In [None]:
df['Region'].nunique()

In [None]:
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
df['Region'] = l.fit_transform(df['Region'])

In [None]:
df = df.drop(['Country'], axis=1)

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), cmap='YlGnBu', annot=True)

In [None]:
#since happiness rank has negative correlation with many attributes, so we will drop it
df = df.drop(['Happiness Rank'], axis=1)

In [None]:
df.shape

In [None]:
#checking and removing outliers and skewness
df.skew()

In [None]:
from scipy.stats import zscore
z=np.abs(zscore(df))
df = df[(z<3).all(axis=1)]

In [None]:
df.skew()

In [None]:
df.shape

In [None]:
a = df.copy()

In [None]:
a.skew()

In [None]:
for col in a.columns:
    if a.skew().loc[col]<-0.55:
            a[col] = np.square(a[col])

In [None]:
a.skew()

In [None]:
for col in a.columns:
    if a.skew().loc[col]>0.55:
        while a.skew().loc[col]>0.55:
            a[col] = np.log1p(a[col]) 

In [None]:
a.skew()

In [None]:
a.head()

## machine learning 

In [None]:
X_old = a.drop(['Happiness Score'], axis=1)
y = a['Happiness Score']

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X_old)
X = pd.DataFrame(X, columns=X_old.columns)

In [None]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print(r2_score(y_test, pred))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=81)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
r2_score(y_test, pred)

In [None]:
cross_val_score(lr, X_train,y_train, cv=5, scoring='r2').mean()

In [None]:
cross_val_score(lr, X_test,y_test, cv=5, scoring='r2').mean()

We have got a very good prediction as can be seen from the r2 score and the model is not over/under
fitting as well. So, moving onto interpreting the coefficients.

In [None]:
print(f'estimated intercept is {lr.intercept_}')

In [None]:
print(f'the coefficients are {lr.coef_}')

In [None]:
df_coef = pd.DataFrame(lr.coef_,index=X.columns, columns=['Coef'])

In [None]:
df_coef

In [None]:
#plotting actual vs predicted values to see whether predictions are correct or not
sns.scatterplot(x=df['Happiness Score'], y=lr.predict(X))
plt.title('Predicted happiness score vs actual happiness score')

In [None]:
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
#saving model
from sklearn.externals import joblib
joblib.dump(lr, 'happiness.pkl')