## Table of Contents
<ul>
<li>01. Importing libraries</li>
<li>02. Importing data</li>
<li>03. Exploring data</li>
<li>04. Exporting data</li>
</ul>

## 01. Importing libraries

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 02. Importing data

In [5]:
# Base folder that contains the '01 Data' directory; replace the placeholder
# with the location of the project on your machine.
path = r'INSERT-YOUR-PATH'

In [6]:
# Load the original WHR file into df_whr.
# index_col=False tells pandas not to use the first column as the index.
whr_source_file = os.path.join(path, '01 Data', 'Original Data', 'WHR-orig.csv')
df_whr = pd.read_csv(whr_source_file, index_col=False)

In [7]:
# Display the loaded WHR frame (the rendered output shows the first and last rows).
df_whr

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.350,0.451,50.500,0.718,0.164,0.882,0.414,0.258
1,Afghanistan,2009,4.402,7.509,0.552,50.800,0.679,0.187,0.850,0.481,0.237
2,Afghanistan,2010,4.758,7.614,0.539,51.100,0.600,0.118,0.707,0.517,0.275
3,Afghanistan,2011,3.832,7.581,0.521,51.400,0.496,0.160,0.731,0.480,0.267
4,Afghanistan,2012,3.783,7.661,0.521,51.700,0.531,0.234,0.776,0.614,0.268
...,...,...,...,...,...,...,...,...,...,...,...
2358,Zimbabwe,2019,2.694,7.698,0.759,53.100,0.632,-0.051,0.831,0.658,0.235
2359,Zimbabwe,2020,3.160,7.596,0.717,53.575,0.643,0.003,0.789,0.661,0.346
2360,Zimbabwe,2021,3.155,7.657,0.685,54.050,0.668,-0.079,0.757,0.610,0.242
2361,Zimbabwe,2022,3.296,7.670,0.666,54.525,0.652,-0.073,0.753,0.641,0.191


In [8]:
# Load the original EIU file into df_eiu.
# index_col=False tells pandas not to use the first column as the index.
eiu_source_file = os.path.join(path, '01 Data', 'Original Data', 'EIU-orig.csv')
df_eiu = pd.read_csv(eiu_source_file, index_col=False)

In [9]:
# Display the loaded EIU frame (the rendered output shows the first and last rows).
df_eiu

Unnamed: 0,Country,Year,Democracy
0,Afghanistan,2006,3.06
1,Afghanistan,2008,3.02
2,Afghanistan,2010,2.48
3,Afghanistan,2011,2.48
4,Afghanistan,2012,2.48
...,...,...,...
2500,Zimbabwe,2018,3.16
2501,Zimbabwe,2019,3.16
2502,Zimbabwe,2020,3.16
2503,Zimbabwe,2021,2.92


In [10]:
# Load the original WDI (inflation) file into df_wdi.
# index_col=False tells pandas not to use the first column as the index.
wdi_source_file = os.path.join(path, '01 Data', 'Original Data', 'WDI-orig.csv')
df_wdi = pd.read_csv(wdi_source_file, index_col=False)

In [11]:
# Display the loaded WDI frame (the rendered output shows the first and last rows).
df_wdi

Unnamed: 0,Country,Year,Inflation
0,Afghanistan,2008,26.418664
1,Afghanistan,2009,-6.811161
2,Afghanistan,2010,2.178538
3,Afghanistan,2011,11.804186
4,Afghanistan,2012,6.441213
...,...,...,...
3586,Zimbabwe,2018,10.618866
3587,Zimbabwe,2019,255.304991
3588,Zimbabwe,2020,557.201817
3589,Zimbabwe,2021,98.546105


## 03. Exploring data

### 03.01. Exploring WHR data and some quick fixes

In [14]:
# (rows, columns) of the WHR frame.
df_whr.shape

(2363, 11)

In [15]:
# Per-column non-null counts and dtypes for WHR.
df_whr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2363 entries, 0 to 2362
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2363 non-null   object 
 1   year                              2363 non-null   int64  
 2   Life Ladder                       2363 non-null   float64
 3   Log GDP per capita                2335 non-null   float64
 4   Social support                    2350 non-null   float64
 5   Healthy life expectancy at birth  2300 non-null   float64
 6   Freedom to make life choices      2327 non-null   float64
 7   Generosity                        2282 non-null   float64
 8   Perceptions of corruption         2238 non-null   float64
 9   Positive affect                   2339 non-null   float64
 10  Negative affect                   2347 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 203.2+ KB


In [16]:
# Dtype of each WHR column.
df_whr.dtypes

Country name                         object
year                                  int64
Life Ladder                         float64
Log GDP per capita                  float64
Social support                      float64
Healthy life expectancy at birth    float64
Freedom to make life choices        float64
Generosity                          float64
Perceptions of corruption           float64
Positive affect                     float64
Negative affect                     float64
dtype: object

In [17]:
# Mixed-type check: for every column, compare the Python type of each cell
# with the type of the cell in the first row and print the column name if
# any row differs. No output means no column was flagged.
for col in df_whr.columns:
    cell_types = df_whr[[col]].map(type)
    first_row_types = cell_types.iloc[0]
    mismatched_rows = (cell_types != first_row_types).any(axis=1)
    if mismatched_rows.any():
        print(col)

In [18]:
# Tally missing values per WHR column and print the tally under a header line.
nan_count_all = df_whr.isna().sum()
print("Number of NaNs in each column:", nan_count_all, sep="\n")

Number of NaNs in each column:
Country name                          0
year                                  0
Life Ladder                           0
Log GDP per capita                   28
Social support                       13
Healthy life expectancy at birth     63
Freedom to make life choices         36
Generosity                           81
Perceptions of corruption           125
Positive affect                      24
Negative affect                      16
dtype: int64


In [19]:
# Give the WHR columns short names in a single rename call.
# 'Generosity' is not in the mapping and therefore keeps its name.
df_whr.rename(
    columns={
        'Country name': 'Country',
        'year': 'Year',
        'Life Ladder': 'Happiness',
        'Log GDP per capita': 'GDP_log',
        'Healthy life expectancy at birth': 'Life_expectancy',
        'Social support': 'Social_support',
        'Freedom to make life choices': 'Freedom',
        'Perceptions of corruption': 'Corruption',
        'Positive affect': 'Pos_affect',
        'Negative affect': 'Neg_affect',
    },
    inplace=True,
)

In [20]:
# Re-display WHR to confirm the renamed column headers.
df_whr

Unnamed: 0,Country,Year,Happiness,GDP_log,Social_support,Life_expectancy,Freedom,Generosity,Corruption,Pos_affect,Neg_affect
0,Afghanistan,2008,3.724,7.350,0.451,50.500,0.718,0.164,0.882,0.414,0.258
1,Afghanistan,2009,4.402,7.509,0.552,50.800,0.679,0.187,0.850,0.481,0.237
2,Afghanistan,2010,4.758,7.614,0.539,51.100,0.600,0.118,0.707,0.517,0.275
3,Afghanistan,2011,3.832,7.581,0.521,51.400,0.496,0.160,0.731,0.480,0.267
4,Afghanistan,2012,3.783,7.661,0.521,51.700,0.531,0.234,0.776,0.614,0.268
...,...,...,...,...,...,...,...,...,...,...,...
2358,Zimbabwe,2019,2.694,7.698,0.759,53.100,0.632,-0.051,0.831,0.658,0.235
2359,Zimbabwe,2020,3.160,7.596,0.717,53.575,0.643,0.003,0.789,0.661,0.346
2360,Zimbabwe,2021,3.155,7.657,0.685,54.050,0.668,-0.079,0.757,0.610,0.242
2361,Zimbabwe,2022,3.296,7.670,0.666,54.525,0.652,-0.073,0.753,0.641,0.191


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for WHR's numeric columns.
df_whr.describe()

Unnamed: 0,Year,Happiness,GDP_log,Social_support,Life_expectancy,Freedom,Generosity,Corruption,Pos_affect,Neg_affect
count,2363.0,2363.0,2335.0,2350.0,2300.0,2327.0,2282.0,2238.0,2339.0,2347.0
mean,2014.76386,5.483566,9.399671,0.809369,63.401828,0.750282,9.8e-05,0.743971,0.651882,0.273151
std,5.059436,1.125522,1.152069,0.121212,6.842644,0.139357,0.161388,0.184865,0.10624,0.087131
min,2005.0,1.281,5.527,0.228,6.72,0.228,-0.34,0.035,0.179,0.083
25%,2011.0,4.647,8.5065,0.744,59.195,0.661,-0.112,0.687,0.572,0.209
50%,2015.0,5.449,9.503,0.8345,65.1,0.771,-0.022,0.7985,0.663,0.262
75%,2019.0,6.3235,10.3925,0.904,68.5525,0.862,0.09375,0.86775,0.737,0.326
max,2023.0,8.019,11.676,0.987,74.6,0.985,0.7,0.983,0.884,0.705


### 03.02. Exploring WDI data and some quick fixes

In [23]:
# (rows, columns) of the WDI frame.
df_wdi.shape

(3591, 3)

In [24]:
# Per-column non-null counts and dtypes for WDI.
df_wdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3591 entries, 0 to 3590
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    3591 non-null   object 
 1   Year       3591 non-null   int64  
 2   Inflation  3591 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 84.3+ KB


In [25]:
# Dtype of each WDI column.
df_wdi.dtypes

Country       object
Year           int64
Inflation    float64
dtype: object

In [26]:
# Mixed-type check: for every column, compare the Python type of each cell
# with the type of the cell in the first row and print the column name if
# any row differs. No output means no column was flagged.
for col in df_wdi.columns:
    cell_types = df_wdi[[col]].map(type)
    first_row_types = cell_types.iloc[0]
    mismatched_rows = (cell_types != first_row_types).any(axis=1)
    if mismatched_rows.any():
        print(col)

In [27]:
# Tally missing values per WDI column and print the tally under a header line.
nan_count_all = df_wdi.isna().sum()
print("Number of NaNs in each column:", nan_count_all, sep="\n")

Number of NaNs in each column:
Country      0
Year         0
Inflation    0
dtype: int64


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for WDI's numeric columns.
df_wdi.describe()

Unnamed: 0,Year,Inflation
count,3591.0,3591.0
mean,2015.331106,5.949283
std,4.561047,17.171123
min,2008.0,-16.859691
25%,2011.0,1.642196
50%,2015.0,3.466926
75%,2019.0,6.378147
max,2023.0,557.201817


### 03.03. Exploring EIU data and some quick fixes

In [30]:
# (rows, columns) of the EIU frame.
df_eiu.shape

(2505, 3)

In [31]:
# Per-column non-null counts and dtypes for EIU.
df_eiu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2505 entries, 0 to 2504
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    2505 non-null   object 
 1   Year       2505 non-null   int64  
 2   Democracy  2505 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 58.8+ KB


In [32]:
# Dtype of each EIU column.
df_eiu.dtypes

Country       object
Year           int64
Democracy    float64
dtype: object

In [33]:
# Mixed-type check: for every column, compare the Python type of each cell
# with the type of the cell in the first row and print the column name if
# any row differs. No output means no column was flagged.
for col in df_eiu.columns:
    cell_types = df_eiu[[col]].map(type)
    first_row_types = cell_types.iloc[0]
    mismatched_rows = (cell_types != first_row_types).any(axis=1)
    if mismatched_rows.any():
        print(col)

In [34]:
# Tally missing values per EIU column and print the tally under a header line.
nan_count_all = df_eiu.isna().sum()
print("Number of NaNs in each column:", nan_count_all, sep="\n")

Number of NaNs in each column:
Country      0
Year         0
Democracy    0
dtype: int64


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for EIU's numeric columns.
df_eiu.describe()

Unnamed: 0,Year,Democracy
count,2505.0,2505.0
mean,2014.8,5.46883
std,4.651377,2.2248
min,2006.0,0.32
25%,2011.0,3.46
50%,2015.0,5.77
75%,2019.0,7.24
max,2022.0,9.93


## 04. Exporting data

In [37]:
# Save the prepared WHR frame. index=False keeps the RangeIndex out of the
# file; with the default (index=True) it is written as an unnamed first column
# that comes back as a spurious 'Unnamed: 0' column when the CSV is re-read.
df_whr.to_csv(os.path.join(path, '01 Data', 'Prepared Data', 'WHR-2024-12-02.csv'), index=False)

In [38]:
# Save the prepared WDI frame. index=False keeps the RangeIndex out of the
# file; with the default (index=True) it is written as an unnamed first column
# that comes back as a spurious 'Unnamed: 0' column when the CSV is re-read.
df_wdi.to_csv(os.path.join(path, '01 Data', 'Prepared Data', 'WDI-2024-12-02.csv'), index=False)

In [39]:
# Save the prepared EIU frame. index=False keeps the RangeIndex out of the
# file; with the default (index=True) it is written as an unnamed first column
# that comes back as a spurious 'Unnamed: 0' column when the CSV is re-read.
df_eiu.to_csv(os.path.join(path, '01 Data', 'Prepared Data', 'EIU-2024-12-02.csv'), index=False)