# This script contains the following points:


### 01. Import libraries
### 02. Import data
### 03. Basic data cleaning and consistency checks

# 01. Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Import data

In [13]:
# Set path

# Root folder of the project. The location can be overridden with the
# WHR_PROJECT_PATH environment variable so the notebook also runs on machines
# where the default folder below does not exist; without the variable the
# original location is used unchanged.
path = os.environ.get(
    'WHR_PROJECT_PATH',
    r'C:\Users\nang6\OneDrive\Bureau\Data Analytics\Data Immersion\Achievement 6',
)

In [14]:
# Import data set

# Build the location of the raw file first, then load it into a DataFrame.
# index_col=False tells pandas not to use the first column as the index.
raw_data_file = os.path.join(path, 'Open Data Source', 'Raw Data', 'WHR 2008-2022.csv')
df = pd.read_csv(raw_data_file, index_col=False)

# 03. Basic data cleaning and consistency checks

In [15]:
# Preview the first five rows of the data set
df.head(n=5)

Unnamed: 0,Country name,Region,Year,Life ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,Southern Asia,2008,3.724,7.35,0.451,50.5,0.718,0.168,0.882,0.414,0.258
1,Afghanistan,Southern Asia,2009,4.402,7.509,0.552,50.8,0.679,0.191,0.85,0.481,0.237
2,Afghanistan,Southern Asia,2010,4.758,7.614,0.539,51.1,0.6,0.121,0.707,0.517,0.275
3,Afghanistan,Southern Asia,2011,3.832,7.581,0.521,51.4,0.496,0.164,0.731,0.48,0.267
4,Afghanistan,Southern Asia,2012,3.783,7.661,0.521,51.7,0.531,0.238,0.776,0.614,0.268


In [16]:
# Preview the last five rows of the data set
df.tail(n=5)

Unnamed: 0,Country name,Region,Year,Life ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
2194,Zimbabwe,Sub-Saharan Africa,2018,3.616,7.783,0.775,52.625,0.763,-0.051,0.844,0.658,0.212
2195,Zimbabwe,Sub-Saharan Africa,2019,2.694,7.698,0.759,53.1,0.632,-0.047,0.831,0.658,0.235
2196,Zimbabwe,Sub-Saharan Africa,2020,3.16,7.596,0.717,53.575,0.643,0.006,0.789,0.661,0.346
2197,Zimbabwe,Sub-Saharan Africa,2021,3.155,7.657,0.685,54.05,0.668,-0.076,0.757,0.61,0.242
2198,Zimbabwe,Sub-Saharan Africa,2022,3.296,7.67,0.666,54.525,0.652,-0.07,0.753,0.641,0.191


### Renaming Columns:

In [17]:
# Check the dimensions of the data set: (number of rows, number of columns)

df.shape

(2199, 12)

In [18]:
# List the column names as loaded from the raw file
df.columns

Index(['Country name', 'Region', 'Year', 'Life ladder', 'Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Positive affect', 'Negative affect'],
      dtype='object')

In [19]:
# Rename columns for shorter and more concise format

# Mapping of original column label -> shorter label.
# Note: 'GDP per Capita' still holds the log-scaled values of the source
# column 'Log GDP per capita'; only the label changes.
column_renames = {
    'Country name': 'Country',
    'Life ladder': 'Happiness Score',
    'Log GDP per capita': 'GDP per Capita',
    'Social support': 'Social Support',
    'Healthy life expectancy at birth': 'Life Expectancy',
    'Freedom to make life choices': 'Freedom',
    'Generosity': 'Charity',
    'Perceptions of corruption': 'Corruption Perception',
    'Positive affect': 'Positive Emotions',
    'Negative affect': 'Negative Emotions',
}

# Rebind the result instead of mutating with inplace=True. Labels missing from
# the frame are ignored by rename, so re-running this cell is harmless.
df = df.rename(columns=column_renames)

In [20]:
# Check column names, non-null counts, and data types

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                2199 non-null   object 
 1   Region                 2199 non-null   object 
 2   Year                   2199 non-null   int64  
 3   Happiness Score        2199 non-null   float64
 4   GDP per Capita         2179 non-null   float64
 5   Social Support         2186 non-null   float64
 6   Life Expectancy        2145 non-null   float64
 7   Freedom                2166 non-null   float64
 8   Charity                2126 non-null   float64
 9   Corruption Perception  2083 non-null   float64
 10  Positive Emotions      2175 non-null   float64
 11  Negative Emotions      2183 non-null   float64
dtypes: float64(9), int64(1), object(2)
memory usage: 206.3+ KB


### Observation:

The Non-Null Count column already shows that several variables contain missing values.

In [21]:
# Check for missing values

# Count the missing entries in each column
missing_per_column = df.isna().sum()
missing_per_column

Country                    0
Region                     0
Year                       0
Happiness Score            0
GDP per Capita            20
Social Support            13
Life Expectancy           54
Freedom                   33
Charity                   73
Corruption Perception    116
Positive Emotions         24
Negative Emotions         16
dtype: int64

### Observation:

I will leave these missing values in place for now. Depending on the analysis, I will decide later whether to impute them (e.g., with the mean) or to drop the affected rows.

In [29]:
# Check for duplicates

# Flag every row that repeats an earlier row in all columns, then keep only
# the flagged rows for inspection
is_duplicate = df.duplicated()
df_dups = df.loc[is_duplicate]

In [25]:
# Zero rows in df_dups means the data set contains no fully duplicated rows
df_dups.shape # No duplicates!

(0, 12)

In [28]:
# Check for mixed-type data

# A column is reported when any of its values has a Python type different from
# the type of the column's first value. Series.map is used per column instead
# of DataFrame.applymap, which is deprecated since pandas 2.1.
for col in df.columns:
    value_types = df[col].map(type)
    if (value_types != value_types.iloc[0]).any():
        print(col)

# Nothing printed means there are no mixed-type columns

In [30]:
# Descriptive statistics of the numeric columns (count, mean, std, min, quartiles, max)

df.describe()

Unnamed: 0,Year,Happiness Score,GDP per Capita,Social Support,Life Expectancy,Freedom,Charity,Corruption Perception,Positive Emotions,Negative Emotions
count,2199.0,2199.0,2179.0,2186.0,2145.0,2166.0,2126.0,2083.0,2175.0,2183.0
mean,2014.161437,5.479227,9.38976,0.810681,63.294582,0.747847,9.1e-05,0.745208,0.652148,0.271493
std,4.718736,1.125527,1.153402,0.120953,6.901104,0.140137,0.161079,0.185835,0.105913,0.086872
min,2005.0,1.281,5.527,0.228,6.72,0.258,-0.338,0.035,0.179,0.083
25%,2010.0,4.647,8.5,0.747,59.12,0.65625,-0.112,0.688,0.572,0.208
50%,2014.0,5.432,9.499,0.836,65.05,0.77,-0.023,0.8,0.663,0.261
75%,2018.0,6.3095,10.3735,0.905,68.5,0.859,0.092,0.869,0.738,0.323
max,2022.0,8.019,11.664,0.987,74.475,0.985,0.703,0.983,0.884,0.705


### Observation:

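No obviously invalid values stand out, but two points from the summary above should be kept in mind for the analysis: the earliest Year is 2005 although the file name refers to 2008-2022, and the minimum Life Expectancy (6.72) lies far below the 25th percentile (59.12), so it should be double-checked as a possible outlier.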

In [31]:
# Export

# Write the checked data set to the 'Cleaned Data' folder.
# NOTE(review): to_csv keeps its default index=True here, so the row index is
# written as an unnamed first column - confirm downstream readers expect it.
checked_file = os.path.join(path, 'Open Data Source', 'Cleaned Data', 'WHR (2008-2022)_checked.csv')
df.to_csv(checked_file)