In [1]:
#Import all relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from scipy.stats import zscore
 
## This statement allows the visuals to render within your Jupyter Notebook.
%matplotlib inline

## Loading the data
We can now load the dataset into pandas using the read_csv() function. This converts the CSV file into a Pandas dataframe.

In [2]:
#Read in the csv file and convert to a Pandas dataframe
y2015= pd.read_csv("2015.csv") 
y2016= pd.read_csv("2016.csv") 
y2017= pd.read_csv("2017.csv")
y2018= pd.read_csv("2018.csv")
y2019= pd.read_csv("2019.csv")



### Viewing the dataframe
We can get a quick sense of the size of our dataset by using the shape method. This returns a tuple with the number of rows and columns in the dataset.

In [3]:
y2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [4]:
y2016.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [5]:
y2017.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [6]:
y2018.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [7]:
y2019.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [8]:
y2015.shape , y2016.shape, y2017.shape, y2018.shape, y2019.shape

((158, 12), (157, 13), (155, 12), (156, 9), (156, 9))

In [9]:
# the six factors contributing to happiness is present in the 5 data sets, some have different naming
# rows in all data sets are around the same amount
# year are not present in all data sets, so if we want concat we must add column for the year as we need this to solve the use case

In [10]:
y2015["Year"]= 2015

In [11]:
y2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015


In [12]:
y2016["Year"]= 2016

In [13]:
y2016.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939,2016
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463,2016
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137,2016
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465,2016
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596,2016


In [14]:
y2017["Year"]= 2017

In [15]:
y2017.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual,Year
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,2017
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,2017
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715,2017
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716,2017
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182,2017


In [16]:
y2018["Year"]= 2018

In [17]:
y2018.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Year
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393,2018
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34,2018
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408,2018
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138,2018
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357,2018


In [18]:
y2019["Year"]= 2019

In [19]:
y2019.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Year
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,2019
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,2019
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,2019
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,2019
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,2019


In [20]:
# as you can see same columns have different names
y2015.columns.sort, y2016.columns.sort, y2017.columns.sort,y2018.columns.sort ,y2019.columns.sort

(<bound method Index.sort of Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
        'Standard Error', 'Economy (GDP per Capita)', 'Family',
        'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
        'Generosity', 'Dystopia Residual', 'Year'],
       dtype='object')>,
 <bound method Index.sort of Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
        'Lower Confidence Interval', 'Upper Confidence Interval',
        'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
        'Freedom', 'Trust (Government Corruption)', 'Generosity',
        'Dystopia Residual', 'Year'],
       dtype='object')>,
 <bound method Index.sort of Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
        'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
        'Health..Life.Expectancy.', 'Freedom', 'Generosity',
        'Trust..Government.Corruption.', 'Dystopia.Residual', 'Year'],
       dtype='object')>,
 <bo

In [21]:
# unify colummmns names as a part of consistency
y2015.rename(columns={ "Happiness Rank":"Happiness_rank","Happiness Score": "Happiness_score","Economy (GDP per Capita)": "Economy_GDP","Health (Life Expectancy)": "health_life_expectency","Trust (Government Corruption)":"Trust_in_government","Dystopia Residual": "Dystopia_Residual", "Family":"Family_SocialSupport" }, inplace=True)
y2016.rename(columns={ "Happiness Rank":"Happiness_rank", "Happiness Score": "Happiness_score", "Economy (GDP per Capita)": "Economy_GDP", "Health (Life Expectancy)": "health_life_expectency", "Trust (Government Corruption)":"Trust_in_government", "Dystopia Residual": "Dystopia_Residual" , "Family":"Family_SocialSupport"}, inplace=True)
y2017.rename(columns={ "Happiness.Rank":"Happiness_rank", "Happiness.Score": "Happiness_score", "Economy..GDP.per.Capita.": "Economy_GDP", "Health..Life.Expectancy.": "health_life_expectency", "Trust..Government.Corruption.":"Trust_in_government", "Dystopia.Residual": "Dystopia_Residual" ,"Family":"Family_SocialSupport"}, inplace=True)
y2018.rename(columns={ "Country or region":"Country" , "Overall rank":"Happiness_rank", "Score" : "Happiness_score", "GDP per capita": "Economy_GDP", "Freedom to make life choices":"Freedom", "Healthy life expectancy": "health_life_expectency", "Perceptions of corruption":"Trust_in_government","Social support":"Family_SocialSupport"  }, inplace=True)
y2019.rename(columns={ "Country or region":"Country" , "Overall rank":"Happiness_rank", "Score" : "Happiness_score", "GDP per capita": "Economy_GDP", "Freedom to make life choices":"Freedom", "Healthy life expectancy": "health_life_expectency", "Perceptions of corruption":"Trust_in_government","Social support" :"Family_SocialSupport"  }, inplace=True)

In [22]:
#check columns names
y2015.columns.sort, y2016.columns.sort, y2017.columns.sort,y2018.columns.sort ,y2019.columns.sort

(<bound method Index.sort of Index(['Country', 'Region', 'Happiness_rank', 'Happiness_score',
        'Standard Error', 'Economy_GDP', 'Family_SocialSupport',
        'health_life_expectency', 'Freedom', 'Trust_in_government',
        'Generosity', 'Dystopia_Residual', 'Year'],
       dtype='object')>,
 <bound method Index.sort of Index(['Country', 'Region', 'Happiness_rank', 'Happiness_score',
        'Lower Confidence Interval', 'Upper Confidence Interval', 'Economy_GDP',
        'Family_SocialSupport', 'health_life_expectency', 'Freedom',
        'Trust_in_government', 'Generosity', 'Dystopia_Residual', 'Year'],
       dtype='object')>,
 <bound method Index.sort of Index(['Country', 'Happiness_rank', 'Happiness_score', 'Whisker.high',
        'Whisker.low', 'Economy_GDP', 'Family_SocialSupport',
        'health_life_expectency', 'Freedom', 'Generosity',
        'Trust_in_government', 'Dystopia_Residual', 'Year'],
       dtype='object')>,
 <bound method Index.sort of Index(['Happines

In [23]:
y2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Happiness_rank          156 non-null    int64  
 1   Country                 156 non-null    object 
 2   Happiness_score         156 non-null    float64
 3   Economy_GDP             156 non-null    float64
 4   Family_SocialSupport    156 non-null    float64
 5   health_life_expectency  156 non-null    float64
 6   Freedom                 156 non-null    float64
 7   Generosity              156 non-null    float64
 8   Trust_in_government     155 non-null    float64
 9   Year                    156 non-null    int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 12.3+ KB


In [24]:
#Now we will concat data sets in one data frame and ignore the index
df_Happiness= pd.concat([y2015,y2016,y2017,y2018,y2019], ignore_index=True)
df_Happiness.sample(50)

Unnamed: 0,Country,Region,Happiness_rank,Happiness_score,Standard Error,Economy_GDP,Family_SocialSupport,health_life_expectency,Freedom,Trust_in_government,Generosity,Dystopia_Residual,Year,Lower Confidence Interval,Upper Confidence Interval,Whisker.high,Whisker.low
564,Vietnam,,95,5.103,,0.715,1.365,0.702,0.618,0.079,0.177,,2018,,,,
482,Costa Rica,,13,7.072,,1.01,1.459,0.817,0.632,0.101,0.143,,2018,,,,
117,Sudan,Sub-Saharan Africa,118,4.55,0.0674,0.52107,1.01404,0.36878,0.10081,0.1466,0.19062,2.20857,2015,,,,
114,Zimbabwe,Sub-Saharan Africa,115,4.61,0.0429,0.271,1.03276,0.33475,0.25861,0.08079,0.18987,2.44191,2015,,,,
68,Kosovo,Central and Eastern Europe,69,5.589,0.05018,0.80148,0.81198,0.63132,0.24749,0.04741,0.2831,2.76579,2015,,,,
218,Belarus,Central and Eastern Europe,61,5.802,,1.13062,1.04993,0.63104,0.29091,0.17457,0.13942,2.38582,2016,5.723,5.881,,
559,Jordan,,90,5.161,,0.822,1.265,0.645,0.468,0.134,0.13,,2018,,,,
622,Tanzania,,153,3.303,,0.455,0.991,0.381,0.481,0.097,0.27,,2018,,,,
487,United States,,18,6.886,,1.398,1.471,0.819,0.547,0.133,0.291,,2018,,,,
88,Latvia,Central and Eastern Europe,89,5.098,0.0464,1.11312,1.09562,0.72437,0.29671,0.06332,0.18226,1.62215,2015,,,,


## 1. Data Profiling:
Data profiling is a comprehensive process of examining the data available in an existing dataset and collecting statistics and information about that data. 

In [25]:
df_Happiness.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    782 non-null    object 
 1   Region                     315 non-null    object 
 2   Happiness_rank             782 non-null    int64  
 3   Happiness_score            782 non-null    float64
 4   Standard Error             158 non-null    float64
 5   Economy_GDP                782 non-null    float64
 6   Family_SocialSupport       782 non-null    float64
 7   health_life_expectency     782 non-null    float64
 8   Freedom                    782 non-null    float64
 9   Trust_in_government        781 non-null    float64
 10  Generosity                 782 non-null    float64
 11  Dystopia_Residual          470 non-null    float64
 12  Year                       782 non-null    int64  
 13  Lower Confidence Interval  157 non-null    float64

In [26]:
#as the six factors is important for the analysis I want to make sure which year the null value in Trust_in_government 
slack= df_Happiness[df_Happiness["Trust_in_government"].isnull()]
group_by_year= slack.groupby('Year')
group_by_year.size()

Year
2018    1
dtype: int64

In [27]:
df_Happiness.shape

(782, 17)

In [28]:
df_Happiness.describe()

Unnamed: 0,Happiness_rank,Happiness_score,Standard Error,Economy_GDP,Family_SocialSupport,health_life_expectency,Freedom,Trust_in_government,Generosity,Dystopia_Residual,Year,Lower Confidence Interval,Upper Confidence Interval,Whisker.high,Whisker.low
count,782.0,782.0,158.0,782.0,782.0,782.0,782.0,781.0,782.0,470.0,782.0,157.0,157.0,155.0,155.0
mean,78.69821,5.379018,0.047885,0.916047,1.078392,0.612416,0.411091,0.125436,0.218576,2.092717,2016.993606,5.282395,5.481975,5.452326,5.255713
std,45.182384,1.127456,0.017146,0.40734,0.329548,0.248309,0.15288,0.105816,0.122321,0.565772,1.417364,1.148043,1.136493,1.118542,1.14503
min,1.0,2.693,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858,2015.0,2.732,3.078,2.864884,2.521116
25%,40.0,4.50975,0.037268,0.6065,0.869363,0.440183,0.309768,0.054,0.13,1.737975,2016.0,4.327,4.465,4.608172,4.374955
50%,79.0,5.322,0.04394,0.982205,1.124735,0.64731,0.431,0.091,0.201982,2.09464,2017.0,5.237,5.419,5.370032,5.193152
75%,118.0,6.1895,0.0523,1.236187,1.32725,0.808,0.531,0.15603,0.278832,2.455575,2018.0,6.154,6.434,6.1946,6.006527
max,158.0,7.769,0.13693,2.096,1.644,1.141,0.724,0.55191,0.838075,3.83772,2019.0,7.46,7.669,7.62203,7.479556


The process of profiling differs slightly for categorical and numerical variables due to their inherent differences.

**The two main types of data are:**
- Quantitative (numerical) data
- Qualitative (categorical) data

### Data Quality Checks
Data quality checks involve the process of ensuring that the data is accurate, complete, consistent, relevant, and reliable. 


**Here are typical steps involved in checking data quality:**

#### 1. Reliability:
Evaluate the data's source and collection process to determine its trustworthiness.

In [29]:
# data from the Gallup World Poll
#The Gallup World Poll covers more than 160 countries, representing over 98% of the world's population. 
#It uses scientific sampling methods to ensure that the survey data is representative of the national population. So its trustworthy to analyze

#### 2. Timeliness: 
Ensure the data is up-to-date and reflective of the current situation or the period of interest for the analysis.

In [30]:
#its acceptable for analysis

#### 3. Consistency: 

Confirm that the data is consistent within the dataset and across multiple data sources. For example, the same data point should not have different values in different places.


In [31]:
#as I concat different data soureces in one data frame I modified columns names to be consistent 
df_Happiness.info()
#You can see every column have specific type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    782 non-null    object 
 1   Region                     315 non-null    object 
 2   Happiness_rank             782 non-null    int64  
 3   Happiness_score            782 non-null    float64
 4   Standard Error             158 non-null    float64
 5   Economy_GDP                782 non-null    float64
 6   Family_SocialSupport       782 non-null    float64
 7   health_life_expectency     782 non-null    float64
 8   Freedom                    782 non-null    float64
 9   Trust_in_government        781 non-null    float64
 10  Generosity                 782 non-null    float64
 11  Dystopia_Residual          470 non-null    float64
 12  Year                       782 non-null    int64  
 13  Lower Confidence Interval  157 non-null    float64

#### 4. Relevance: 
Assess whether the data is appropriate and applicable for the intended analysis. Data that is not relevant can skew results and lead to incorrect conclusions.

**Key considerations for relevance include:**

> 1. Sample Appropriateness: Confirm that your data sample aligns with your analysis objectives. For instance, utilizing data from the Northern region will not yield accurate insights for the Western region of the Kingdom.
>
> 2. Variable Selection: Any column will not be relevant for our analysis, we can get rid of these using the drop() method. We will set the “axis” argument to 1 since we’re dealing with columns, and set the “inplace” argument to True to make the change permanent.


In [32]:
df_Happiness.head(10)

Unnamed: 0,Country,Region,Happiness_rank,Happiness_score,Standard Error,Economy_GDP,Family_SocialSupport,health_life_expectency,Freedom,Trust_in_government,Generosity,Dystopia_Residual,Year,Lower Confidence Interval,Upper Confidence Interval,Whisker.high,Whisker.low
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015,,,,
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015,,,,
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015,,,,
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015,,,,
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015,,,,
5,Finland,Western Europe,6,7.406,0.0314,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2.61955,2015,,,,
6,Netherlands,Western Europe,7,7.378,0.02799,1.32944,1.28017,0.89284,0.61576,0.31814,0.4761,2.4657,2015,,,,
7,Sweden,Western Europe,8,7.364,0.03157,1.33171,1.28907,0.91087,0.6598,0.43844,0.36262,2.37119,2015,,,,
8,New Zealand,Australia and New Zealand,9,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425,2015,,,,
9,Australia,Australia and New Zealand,10,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646,2015,,,,


In [33]:
df_Happiness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    782 non-null    object 
 1   Region                     315 non-null    object 
 2   Happiness_rank             782 non-null    int64  
 3   Happiness_score            782 non-null    float64
 4   Standard Error             158 non-null    float64
 5   Economy_GDP                782 non-null    float64
 6   Family_SocialSupport       782 non-null    float64
 7   health_life_expectency     782 non-null    float64
 8   Freedom                    782 non-null    float64
 9   Trust_in_government        781 non-null    float64
 10  Generosity                 782 non-null    float64
 11  Dystopia_Residual          470 non-null    float64
 12  Year                       782 non-null    int64  
 13  Lower Confidence Interval  157 non-null    float64

In [34]:

slack= df_Happiness[df_Happiness["Standard Error"].isnull()]
group_by_year= slack.groupby('Year')
group_by_year.size()
#after running the above code missing values are in all years except 2015 
# as the usecase is not focusing on the result of this column and its not one of the sex factor . will drop this column in data cleaning

Year
2016    157
2017    155
2018    156
2019    156
dtype: int64

In [35]:
df_Happiness.columns

Index(['Country', 'Region', 'Happiness_rank', 'Happiness_score',
       'Standard Error', 'Economy_GDP', 'Family_SocialSupport',
       'health_life_expectency', 'Freedom', 'Trust_in_government',
       'Generosity', 'Dystopia_Residual', 'Year', 'Lower Confidence Interval',
       'Upper Confidence Interval', 'Whisker.high', 'Whisker.low'],
      dtype='object')

#### 5. Uniqueness: 
Check for and remove duplicate records to prevent skewed analysis results.


In [36]:
df_Happiness.duplicated().sum()


0

In [37]:
# go to delete duplicates columns

#### 6. Completeness: 
Ensure that no critical data is missing. This might mean checking for null values or required fields that are empty.

We will start by checking the dataset for missing or null values. For this, we can use the isna() method which returns a dataframe of boolean values indicating if a field is null or not. To group all missing values by column, we can include the sum() method.

In [38]:
#Display number missing values per column
df_Happiness.isnull().sum().sort_values(ascending = False)

#this result after i drop columns: Dystopia_Residual, Lower Confidence Interval, Upper Confidence Interval, Whisker.high, Whisker.low

Whisker.low                  627
Whisker.high                 627
Upper Confidence Interval    625
Lower Confidence Interval    625
Standard Error               624
Region                       467
Dystopia_Residual            312
Trust_in_government            1
Generosity                     0
Year                           0
Country                        0
health_life_expectency         0
Family_SocialSupport           0
Economy_GDP                    0
Happiness_score                0
Happiness_rank                 0
Freedom                        0
dtype: int64

In [39]:

slack= df_Happiness[df_Happiness["Whisker.high"].isnull()]
group_by_year= slack.groupby('Year')
group_by_year.size()
# this code shows that Whisker.high column have  null in all years except in 2015 because this column exist just in 2015 data set.
#we will drop columns with nulls after checking that it will not effect our analysis, and these columns are not one of the six factors 
#go to data cleaning


Year
2015    158
2016    157
2018    156
2019    156
dtype: int64

In [40]:
df_Happiness.isnull().sum().sort_values(ascending = False)
#region have 467 missing value 

Whisker.low                  627
Whisker.high                 627
Upper Confidence Interval    625
Lower Confidence Interval    625
Standard Error               624
Region                       467
Dystopia_Residual            312
Trust_in_government            1
Generosity                     0
Year                           0
Country                        0
health_life_expectency         0
Family_SocialSupport           0
Economy_GDP                    0
Happiness_score                0
Happiness_rank                 0
Freedom                        0
dtype: int64

In [41]:
#Region just exists in 2015 and 2016 the rest of datasets doesn't have this column 
slack2= df_Happiness[df_Happiness["Region"].isnull()]
group_by_year= slack2.groupby('Year')
group_by_year.size()
#I will drop the region column and I will rely on Country. go to data cleaning

Year
2017    155
2018    156
2019    156
dtype: int64

In [42]:
df_Happiness.isnull().sum()

Country                        0
Region                       467
Happiness_rank                 0
Happiness_score                0
Standard Error               624
Economy_GDP                    0
Family_SocialSupport           0
health_life_expectency         0
Freedom                        0
Trust_in_government            1
Generosity                     0
Dystopia_Residual            312
Year                           0
Lower Confidence Interval    625
Upper Confidence Interval    625
Whisker.high                 627
Whisker.low                  627
dtype: int64

In [43]:
df_Happiness[df_Happiness['Trust_in_government'].isnull()]
#The UAE does not declare their trust in government score, replace it nwith 0 in data cleaning

Unnamed: 0,Country,Region,Happiness_rank,Happiness_score,Standard Error,Economy_GDP,Family_SocialSupport,health_life_expectency,Freedom,Trust_in_government,Generosity,Dystopia_Residual,Year,Lower Confidence Interval,Upper Confidence Interval,Whisker.high,Whisker.low
489,United Arab Emirates,,20,6.774,,2.096,0.776,0.67,0.284,,0.186,,2018,,,,


In [63]:
df_Happiness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Country                 782 non-null    object 
 1   Happiness_rank          782 non-null    int64  
 2   Happiness_score         782 non-null    float64
 3   Economy_GDP             782 non-null    float64
 4   Family_SocialSupport    782 non-null    float64
 5   health_life_expectency  782 non-null    float64
 6   Freedom                 782 non-null    float64
 7   Trust_in_government     782 non-null    float64
 8   Generosity              782 non-null    float64
 9   Year                    782 non-null    int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 61.2+ KB


#### 7. Check Accuracy:

Verify that the data is correct and precise. This could involve comparing data samples with known sources or using validation rules.

**The process includes:**
1. Validating the appropriateness of data types for the dataset.
2. Identifying outliers  using established validation  rule

In [62]:
# check columns types 
df_Happiness.dtypes
#data types are correct

Country                    object
Happiness_rank              int64
Happiness_score           float64
Economy_GDP               float64
Family_SocialSupport      float64
health_life_expectency    float64
Freedom                   float64
Trust_in_government       float64
Generosity                float64
Year                        int64
dtype: object

In [61]:
df_Happiness.head(10)

Unnamed: 0,Country,Happiness_rank,Happiness_score,Economy_GDP,Family_SocialSupport,health_life_expectency,Freedom,Trust_in_government,Generosity,Year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015
5,Finland,6,7.406,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2015
6,Netherlands,7,7.378,1.32944,1.28017,0.89284,0.61576,0.31814,0.4761,2015
7,Sweden,8,7.364,1.33171,1.28907,0.91087,0.6598,0.43844,0.36262,2015
8,New Zealand,9,7.286,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2015
9,Australia,10,7.284,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2015


In [47]:
# go to clean them 

In [48]:
# check outliers 

**What is an Outlier?** 
Outlier is an row/observation that appears far away and diverges from an overall pattern in a sample.

**What are the types of Outliers?**
1. Univariate: These outliers can be found when we look at distribution of a single variable
2. Multivariate: are outliers in an n-dimensional space. In order to find them, you have to look at distributions in multi-dimensions. example (hight=100, weight=100) for a person

**What causes Outliers?**
Whenever we come across outliers, the ideal way to tackle them is to find out the reason of having these outliers. The method to deal with them would then depend on the reason of their occurrence.

Let’s understand various types of outliers:

1. Data Entry Errors:- Human errors such as errors caused during data collection, recording, or entry can cause outliers in data.
2. Measurement Error: It is the most common source of outliers. This is caused when the measurement instrument used turns out to be faulty.
3. Data Processing Error: Whenever we perform data mining, we extract data from multiple sources. It is possible that some manipulation or extraction errors may lead to outliers in the dataset.
4. Sampling error: For instance, we have to measure the height of athletes. By mistake, we include a few basketball players in the sample. This inclusion is likely to cause outliers in the dataset.
5. Natural Outlier: When an outlier is not artificial (due to error), it is a natural outlier. For instance: In my last assignment with one of the renowned insurance company, I noticed that the performance of top 50 financial advisors was far higher than rest of the population. Surprisingly, it was not due to any error. Hence, whenever we perform any data mining activity with advisors, we used to treat this segment separately.


**What is the impact of Outliers on a dataset?**


![image.png](https://www.analyticsvidhya.com/wp-content/uploads/2015/02/Outlier_31.png)



**How to detect Outliers?**

1. Most commonly used method to detect outliers is visualization (Univariate Graphical Analysis).

We use 3 common visualization methods:
>- Box-plot: A box plot is a method for graphically depicting groups of numerical data through their quartiles. The box extends from the Q1 to Q3 quartile values of the data, with a line at the median (Q2). The whiskers extend from the edges of the box to show the range of the data. Outlier points are those past the end of the whiskers. Box plots show robust measures of location and spread as well as providing information about symmetry and outliers.
>
>  
>![image.png](https://miro.medium.com/v2/resize:fit:698/format:webp/1*VK5iHA2AB28HSZwWwUbNYg.png)
>
>
>- Histogram
>- Scatter Plot: A scatter plot is a mathematical diagram using Cartesian coordinates to display values for two variables for a set of data. The data are displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis. The points that are far from the population can be termed as an outlier.
>
>  
>![image.png](https://miro.medium.com/v2/resize:fit:4800/format:webp/1*Ov6aH-8yIwNoUxtMFwgx4g.png)
>
>

2. Using statistical method (Univariate Non-Graphical analysis):
>- Any value, which is beyond the range of -1.5 x IQR to 1.5 x IQR
 
![image.png](https://www.whatissixsigma.net/wp-content/uploads/2015/07/Box-Plot-Diagram-to-identify-Outliers-figure-1.png)

>- Use capping methods. Any value which out of range of 5th and 95th percentile can be considered as outlier
>- Data points, three or more standard deviation away from mean are considered outlier: The Z-score is the signed number of standard deviations by which the value of an observation or data point is above the mean value of what is being observed or measured. While calculating the Z-score we re-scale and center the data and look for data points that are too far from zero. These data points which are way too far from zero will be treated as the outliers. In most of the cases, a threshold of 3 or -3 is used i.e if the Z-score value is greater than or less than 3 or -3 respectively, that data point will be identified as outliers.
> - Outlier detection is merely a special case of the examination of data for influential data points and it also depends on the business understanding


In [49]:
# go to univariate graphical analysis
# go to lesson : data visualisation 1 - chart type section
# then go to univariate graphical analysis
# detect outliers using graphs varbaly

In [50]:
# go to lesson: statistics 1 then statistics 3
# then go to univariate Non graphical analysis
# detect outliers using numerical statistics 

In [51]:
# go to delete ouliers

## 2. Data Cleaning: 

Preliminary findings from data profiling can lead to cleaning the data by:
- Handling missing values
- Correcting errors.
- Dealing with outliers.

-------------------



### Handling missing values:

**Why my data has missing values?**
They may occur at two stages:
1. Data Extraction: It is possible that there are problems with extraction process. Errors at data extraction stage are typically easy to find and can be corrected easily as well.
2. Data collection: These errors occur at time of data collection and are harder to correct.

**Why do we need to handle the missing data?**
To avoid:
- Bias the conclusions.
- Leading the business to make wrong decisions.

**Which are the methods to treat missing values ?**
1. Deletion: we delete rows where any of the variable is missing. Simplicity is one of the major advantage of this method, but this method reduces the power of model because it reduces the sample size.

2. Imputation: is a method to fill in the missing values with estimated ones. This imputation is one of the most frequently used methods.

    2.1. Mean/ Mode/ Median Imputation: It consists of replacing the missing data for a given attribute by the mean or median (quantitative attribute) or mode (qualitative attribute) of all known values of that variable.
    > It can be of two types:
    > - Generalized Imputation: In this case, we calculate the mean or median for all non missing values of that variable then replace missing value with mean or median.
    > - Similar case Imputation: In this case, we calculate average for each group individually of non missing values then replace the missing value based on the group.

    2.2. Constant Value
   
    2.3. Forward Filling
   
    2.4. Backward Filling

6. Prediction Model:  Prediction model is one of the sophisticated method for handling missing data. Here, we create a predictive model to estimate values that will substitute the missing data.  In this case, we divide our data set into two sets: One set with no missing values for the variable and another one with missing values. First data set become training data set of the model while second data set with missing values is test data set and variable with missing values is treated as target variable. Next, we create a model to predict target variable based on other attributes of the training data set and populate missing values of test data set.

> There are 2 drawbacks for this approach:
> - The model estimated values are usually more well-behaved than the true values
> - If there are no relationships with attributes in the data set and the attribute with missing values, then the model will not be precise for estimating missing values.

9. KNN Imputation: In this method of imputation, the missing values of an attribute are imputed using the given number of attributes that are most similar to the attribute whose values are missing. The similarity of two attributes is determined using a distance function. It is also known to have certain advantage & disadvantages.

   > **Advantages:**
   > - k-nearest neighbour can predict both qualitative & quantitative attributes
   > - Creation of predictive model for each attribute with missing data is not required
   > - Attributes with multiple missing values can be easily treated
   > - Correlation structure of the data is taken into consideration

   > **Disadvantage:**
   > - KNN algorithm is very time-consuming in analyzing large database. It searches through all the dataset looking for the most similar instances.
   > - Choice of k-value is very critical. Higher value of k would include attributes which are significantly different from what we need whereas lower value of k implies missing out of significant attributes.

--------------------


In [52]:
df_Happiness.drop(["Standard Error"], axis=1,inplace=True)

In [53]:
# go back to 6th dimention --> Completeness

In [54]:
df_Happiness.drop(["Dystopia_Residual","Lower Confidence Interval", "Upper Confidence Interval", "Whisker.high", "Whisker.low"], axis=1, inplace=True)

In [55]:
df_Happiness.head()

Unnamed: 0,Country,Region,Happiness_rank,Happiness_score,Economy_GDP,Family_SocialSupport,health_life_expectency,Freedom,Trust_in_government,Generosity,Year
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2015
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015
3,Norway,Western Europe,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2015
4,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015


In [56]:
#I will drop the region column and I will rely on Country 
df_Happiness.drop(columns="Region", inplace= True)

In [57]:
df_Happiness.isnull().sum()

Country                   0
Happiness_rank            0
Happiness_score           0
Economy_GDP               0
Family_SocialSupport      0
health_life_expectency    0
Freedom                   0
Trust_in_government       1
Generosity                0
Year                      0
dtype: int64

In [58]:
df_Happiness['Trust_in_government']= df_Happiness['Trust_in_government'].fillna(0)

In [59]:
df_Happiness[df_Happiness['Trust_in_government'].isnull()]

Unnamed: 0,Country,Happiness_rank,Happiness_score,Economy_GDP,Family_SocialSupport,health_life_expectency,Freedom,Trust_in_government,Generosity,Year


### Correcting errors

-------------------

In [60]:
# go back to 7th dimension Accuracy 

### Dealing with outliers:

**How to remove Outliers?**
Most of the ways to deal with outliers are similar to the methods of missing values like deleting rows, transforming them, binning them, treat them as a separate group, imputing values and other statistical methods. Here, we will discuss the common techniques used to deal with outliers:

1. Deleting rows: We delete outlier values if it is due to data entry error, data processing error or outlier rows are very small in numbers. We can also use trimming at both ends to remove outliers.

2. Imputing: Like imputation of missing values, we can also impute outliers. We can use mean, median, mode imputation methods. Before imputing values, we should analyse if it is natural outlier or artificial. If it is artificial, we can go with imputing values. We can also use statistical model to predict values of outlier rows and after that we can impute it with predicted values.

3. Treat separately: If there are significant number of outliers, we should treat them separately in the statistical model. One of the approach is to treat both groups as two different groups and build individual model for both groups and then combine the output.


## 3. Univariate Analysis: 

This involves examining single variables to understand their characteristics (distribution, central tendency, dispersion, and shape).

We calculate **numerical values** about the data that tells us about the distribution of the data. We also **draw graphs** showing visually how the data is distributed. **To answer the following questions about Features/characteristics of Data:**
- Where is the center of the data? (location)
- How much does the data vary? (scale)
- What is the shape of the data? (shape)

**The benefits of this analysis:**
Statistics summary gives a high-level idea to identify whether the data has any outliers, data entry error, distribution of data such as the data is normally distributed or left/right skewed

**In this step, we will explore variables one by one using following approaches:**

### 1. Univariate Graphical Analysis:
Method to perform uni-variate analysis will depend on whether the variable type is categorical or numerical.

#### I. Categorical Variables:

we’ll use frequency table to understand distribution of each category
- Bar Chart (Ordinal) - Orderd
- Pie Chart (Nominal) - non Orderd

#### II. Numerical Variables:

we need to understand the central tendency and spread of the variable (Descriptive Analysis) using:
   - Box plot
   - Histogram

### 2. Univariate Non-Graphical analysis: 

- Where is the center of the data? (location) --> **Measures of central tendency**
- How much does the data vary? (scale) --> **Measure of variability**
- What is the shape of the data? (shape) --> **Measures of variation combined with an average (measure of center) gives a good picture of the distribution of the data.**

## 4. Bivariate/Multivariate Analysis:

Here, you look at the relationships between two or more variables. This can involve looking for correlations, patterns, and trends that suggest a relationship or an association.

We can perform bi-variate analysis for any combination of categorical and numerical variables. The combination can be:
| bi-variate variables   | Plot type |
| ------------- | ------------- |
| Categorical & Categorical| Stacked Bar Chart |
| Categorical & numerical  | scatter plot, histogram, box plot|
| numerical  & numerical  | Scatter plot, line chart| 


Multivariate Analysis:
- Heat map
- Bar Chart
- Scatter Chart
- Line Chart

**Categorical & Categorical --> (Stacked Column Chart)**

**Categorical & numerical --> (scatter plot, histogram, box plot)**

**numerical & numerical --> (Scatter plot, line chart)**

We could also use a correlation matrix to get more specific information about the relationship between these two variables.