# EDA of the datasets from 2015 to 2019

Import the libraries that we use

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sys
from datetime import datetime

: 

Read the csv files

In [None]:
# Register the data directory on the module search path.
# NOTE(review): sys.path only influences `import` resolution, and the reads
# below already spell out the 'data/' prefix - confirm this line is needed.
sys.path.append('data/')

# Load one DataFrame per yearly CSV file, in chronological order.
data_2015, data_2016, data_2017, data_2018, data_2019 = map(
    pd.read_csv,
    (
        'data/2015.csv',
        'data/2016.csv',
        'data/2017.csv',
        'data/2018.csv',
        'data/2019.csv',
    ),
)

: 

## General information

In [None]:
# Print the 2015 frame's column summary (names, non-null counts, dtypes),
# then show its first rows as the cell output.
data_2015.info()
data_2015.head()

: 

In [None]:
# Summary statistics for the numeric columns of the 2015 data.
data_2015.describe()

: 

In [None]:
# Print the 2016 frame's column summary (names, non-null counts, dtypes),
# then show its first rows as the cell output.
data_2016.info()
data_2016.head()

: 

In [None]:
# Summary statistics for the numeric columns of the 2016 data.
data_2016.describe()

: 

In [None]:
# Print the 2017 frame's column summary (names, non-null counts, dtypes),
# then show its first rows as the cell output.
data_2017.info()
data_2017.head()

: 

In [None]:
# Summary statistics for the numeric columns of the 2017 data.
data_2017.describe()

: 

In [None]:
# Print the 2018 frame's column summary (names, non-null counts, dtypes),
# then show its first rows as the cell output.
data_2018.info()
data_2018.head()

: 

In [None]:
# Summary statistics for the numeric columns of the 2018 data.
data_2018.describe()

: 

In [None]:
# Print the 2019 frame's column summary (names, non-null counts, dtypes),
# then show its first rows as the cell output.
data_2019.info()
data_2019.head()

: 

In [None]:
# Summary statistics for the numeric columns of the 2019 data.
data_2019.describe()

: 

Let's search for missing values

In [None]:
# Number of null entries in each column of the 2015 data.
data_2015.isnull().sum()

: 

In [None]:
# Number of null entries in each column of the 2016 data.
data_2016.isnull().sum()

: 

In [None]:
# Number of null entries in each column of the 2017 data.
data_2017.isnull().sum()

: 

In [None]:
# Number of null entries in each column of the 2018 data.
data_2018.isnull().sum()

: 

Here we see that the 2018 data has a null entry in the `Perceptions of corruption` column, and since we can't leave this entry null, we simply remove that row.

In [None]:
# Remove only the row(s) whose 'Perceptions of corruption' value is missing.
# Scoping the drop with `subset` keeps it limited to the column being cleaned,
# so a null in some unrelated column cannot silently discard extra rows.
data_2018 = data_2018.dropna(subset=['Perceptions of corruption'])

: 

In [None]:
# Number of null entries in each column of the 2019 data.
data_2019.isnull().sum()

: 

Now, check for duplicate entries

In [None]:
# Print the number of fully duplicated rows for each year, 2015 through 2019.
for yearly_frame in (data_2015, data_2016, data_2017, data_2018, data_2019):
    duplicate_rows = yearly_frame.duplicated().sum()
    print(duplicate_rows)

: 

With this information we know that the column names are not consistent across the years (the 2017, 2018 and 2019 files in particular name several columns differently), so let's standardize them.

In [None]:
# 2015: copy each source column to its shared name, tag the rows with the
# survey year, then drop the source columns.
renamed_2015 = {'Family': 'Social support'}

for original_name, shared_name in renamed_2015.items():
    data_2015[shared_name] = data_2015[original_name]
data_2015['Year'] = 2015

data_2015 = data_2015.drop(columns=list(renamed_2015))

data_2015.info()

: 

In [None]:
# 2016: copy each source column to its shared name, tag the rows with the
# survey year, then drop the source columns and the confidence-interval bounds.
renamed_2016 = {'Family': 'Social support'}

for original_name, shared_name in renamed_2016.items():
    data_2016[shared_name] = data_2016[original_name]
data_2016['Year'] = 2016

data_2016 = data_2016.drop(
    columns=[*renamed_2016, 'Lower Confidence Interval', 'Upper Confidence Interval']
)

data_2016.info()

: 

In [None]:
# 2017: the file uses dotted column names; copy each one to the shared
# spelling, tag the rows with the survey year, then drop the dotted originals
# together with the whisker bounds.
renamed_2017 = {
    'Family': 'Social support',
    'Trust..Government.Corruption.': 'Trust (Government Corruption)',
    'Dystopia.Residual': 'Dystopia Residual',
    'Health..Life.Expectancy.': 'Health (Life Expectancy)',
    'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
    'Happiness.Score': 'Happiness Score',
    'Happiness.Rank': 'Happiness Rank',
}

for original_name, shared_name in renamed_2017.items():
    data_2017[shared_name] = data_2017[original_name]
data_2017['Year'] = 2017

data_2017 = data_2017.drop(columns=[*renamed_2017, 'Whisker.high', 'Whisker.low'])

data_2017.info()

: 

In [None]:
# 2018: copy each source column to its shared name, tag the rows with the
# survey year, then drop the source columns.
renamed_2018 = {
    'Overall rank': 'Happiness Rank',
    'Score': 'Happiness Score',
    'GDP per capita': 'Economy (GDP per Capita)',
    'Healthy life expectancy': 'Health (Life Expectancy)',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Trust (Government Corruption)',
}

for original_name, shared_name in renamed_2018.items():
    data_2018[shared_name] = data_2018[original_name]
data_2018['Year'] = 2018

data_2018 = data_2018.drop(columns=list(renamed_2018))

data_2018.info()

: 

In [None]:
# 2019: copy each source column to its shared name, tag the rows with the
# survey year, then drop the source columns.
renamed_2019 = {
    'Overall rank': 'Happiness Rank',
    'Score': 'Happiness Score',
    'GDP per capita': 'Economy (GDP per Capita)',
    'Healthy life expectancy': 'Health (Life Expectancy)',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Trust (Government Corruption)',
}

for original_name, shared_name in renamed_2019.items():
    data_2019[shared_name] = data_2019[original_name]
data_2019['Year'] = 2019

data_2019 = data_2019.drop(columns=list(renamed_2019))

data_2019.info()

: 

Previously, we saw that the most recent years have a single location column, named `Country` (2017) or `Country or region` (2018, 2019), whereas 2015 and 2016 have both `Country` and `Region`. We will keep a single column called `Country or region` in every year.

In [None]:
# Copy 'Country' into the shared 'Country or region' column for the three
# years that do not have it yet.
for yearly_frame in (data_2015, data_2016, data_2017):
    yearly_frame['Country or region'] = yearly_frame['Country']

# 2015 and 2016 lose both 'Region' and the original 'Country' column;
# for 2017 only 'Country' is dropped.
data_2015 = data_2015.drop(columns=['Region', 'Country'])
data_2016 = data_2016.drop(columns=['Region', 'Country'])
data_2017 = data_2017.drop(columns=['Country'])

: 

In [None]:
# How many rows each 'Country or region' value has in the 2015 data.
data_2015['Country or region'].value_counts()

: 

In [None]:
# Histogram (with a KDE curve) of the 2015 'Happiness Score' values.
plt.figure(figsize=(10, 6))
score_axes = sns.histplot(data_2015['Happiness Score'], kde=True, color='blue')
# Title and axis labels are applied through the Axes that histplot returns.
score_axes.set(
    title='Distribution of Happiness Scores for 2015',
    xlabel='Happiness Score',
    ylabel='Frequency',
)
plt.show()

: 

### Now combine the data

In [None]:
# Stack the five yearly frames into a single frame, oldest year first.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# year's own row labels are carried over, so labels repeat across years and
# label-based lookups on the combined frame become ambiguous.
all_data = pd.concat(
    [
        data_2015,
        data_2016,
        data_2017,
        data_2018,
        data_2019,
    ],
    ignore_index=True,
)

: 

In [None]:
# Show the first rows of the combined frame.
all_data.head()

: 

In [None]:
# One box per survey year showing the spread of 'Happiness Score'.
plt.figure(figsize=(12, 8))
year_axes = sns.boxplot(data=all_data, x='Year', y='Happiness Score')
# Title and axis labels are applied through the Axes that boxplot returns.
year_axes.set(
    title='Happiness Score Distribution by Year',
    xlabel='Year',
    ylabel='Happiness Score',
)
plt.show()

: 

In [None]:
# Write the combined frame to disk, leaving the row index out of the file.
combined_csv_path = 'data/combined_data.csv'
all_data.to_csv(path_or_buf=combined_csv_path, index=False)

: 