In [1]:
# importing required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')

# sklearn preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



# for ignoring any unwanted warning
import warnings
warnings.filterwarnings('ignore')

# for ignoring any unwanted warning
import warnings
warnings.filterwarnings('ignore')

# set style for the plots
sns.set_theme(style="darkgrid")

In [2]:
# import data
df = pd.read_csv('Happiness.csv')
df.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [3]:
# Now let's drop the "country" & "Region" column as we don't need this

df.drop(['Country', 'Region'], axis = 1, inplace = True)

In [4]:
# checking Null values
df.isnull().sum()

Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

Here we can see that that is no missing values.

In previous we can see that there are some outliers. Let's find them and take care of them

In [5]:
# For detecting outlier we'll use  "Inter Quartiel Range" (IQR)

def outlier_detection(datacolumn):
    sorted(datacolumn)
    q1, q3= np.percentile(datacolumn , [25,75])
    IQR = q3-q1
    lower_range = q1 -(1.5 * IQR)
    upper_range = q3 + (1.5 * IQR)
    return lower_range,upper_range

In [6]:
# find upperbound & lowerbound "Dystopia Residual"
lowerbound,upperbound = outlier_detection(df['Dystopia Residual'])
print(lowerbound, upperbound)

0.7049024999999998 3.5169225


In [7]:
# Finding where the outliers are

df[(df['Dystopia Residual']<lowerbound) | (df['Dystopia Residual']>upperbound)]

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
13,14,7.187,0.04176,1.02054,0.91451,0.81444,0.48181,0.21312,0.14074,3.60214
71,72,5.474,0.05051,1.38604,1.05818,1.01328,0.59608,0.37124,0.39478,0.65429
131,132,4.271,0.03751,0.83524,1.01905,0.70806,0.53726,0.09179,0.40828,0.67108
153,154,3.465,0.03464,0.22208,0.7737,0.42864,0.59201,0.55191,0.22628,0.67042
155,156,3.006,0.05015,0.6632,0.47489,0.72193,0.15684,0.18906,0.47179,0.32858


In [8]:
# These are the values which has outlier
# find which country are they are from our main dataset

df.iloc[[13,71,131,153,155]]

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
13,14,7.187,0.04176,1.02054,0.91451,0.81444,0.48181,0.21312,0.14074,3.60214
71,72,5.474,0.05051,1.38604,1.05818,1.01328,0.59608,0.37124,0.39478,0.65429
131,132,4.271,0.03751,0.83524,1.01905,0.70806,0.53726,0.09179,0.40828,0.67108
153,154,3.465,0.03464,0.22208,0.7737,0.42864,0.59201,0.55191,0.22628,0.67042
155,156,3.006,0.05015,0.6632,0.47489,0.72193,0.15684,0.18906,0.47179,0.32858


In [9]:
# 'Mexico, Hong Kong, Sri Lanka, Rwanda, Syria' has outliers
# For this case I'll remove these rows from the dataset

df.drop([df.index[13], df.index[71], df.index[131], df.index[153], df.index[155]], inplace= True)

##### Now let's check for "Trust (Government Corruption)"

In [10]:
# find upperbound & lowerbound "Trust (Government Corruption)l"
lowerbound,upperbound = outlier_detection(df['Trust (Government Corruption)'])
print(lowerbound, upperbound)

-0.11218999999999998 0.34668999999999994


In [11]:
# Finding where the outliers are

df[(df['Trust (Government Corruption)']<lowerbound) | (df['Trust (Government Corruption)']>upperbound)]

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
2,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
5,6,7.406,0.0314,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2.61955
7,8,7.364,0.03157,1.33171,1.28907,0.91087,0.6598,0.43844,0.36262,2.37119
8,9,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425
9,10,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646
16,17,6.946,0.03499,1.56391,1.21963,0.91894,0.61583,0.37798,0.28034,1.96961
19,20,6.901,0.03729,1.42727,1.12575,0.80925,0.64157,0.38583,0.26428,2.24743
23,24,6.798,0.0378,1.52186,1.02,1.02525,0.54252,0.4921,0.31105,1.88501


In [12]:
# Since there are so many values which should not be removed.
# Let's replace those with the median value so that the distribution shouldn't be changed

median = df['Trust (Government Corruption)'].median()

# Now let's use np.where to replace those
df['Trust (Government Corruption)'] = np.where(df['Trust (Government Corruption)']> upperbound, median, df['Trust (Government Corruption)'])
df['Trust (Government Corruption)'] = np.where(df['Trust (Government Corruption)']< lowerbound, median, df['Trust (Government Corruption)'])

We've replaced the outliers from the dataset. With this I've finised feature engineering task. 
In next I'll perform machine learning. Before that let's save the dataset into a new one

In [14]:
# let's seperate the target and input features

df.to_csv('final_df.csv', index = False)