# Extracting data about level of happiness in different countries

# Link

World Happiness Report 2023:
https://worldhappiness.report/data/

## Imports


In [4]:
import pandas as pd
import numpy as np

# 1) Extracting data from Excel file

In [6]:
# Read the first sheet of the happiness workbook into the working DataFrame `df`.
df = pd.read_excel(
    io="All_data_files/data_happiness_2023.xls",
    sheet_name=0,
)

# 2) Initial exploration of data (for cleaning and transformation)

I'm exploring the data with the intent of making sure it fits a certain structure and can thereby easily be merged with other data.

In [9]:
# Preview the first five rows to check column names and value formats.
df.head(n=5)

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
count,2199.0,2199.0,2179.0,2186.0,2145.0,2166.0,2126.0,2083.0,2175.0,2183.0
mean,2014.161437,5.479226,9.389766,0.810679,63.294583,0.747858,9.6e-05,0.745195,0.652143,0.271501
std,4.718736,1.125529,1.153387,0.120952,6.901104,0.14015,0.161083,0.185837,0.105922,0.086875
min,2005.0,1.281271,5.526723,0.228217,6.72,0.257534,-0.337527,0.035198,0.178886,0.082737
25%,2010.0,4.64675,8.499764,0.746609,59.119999,0.656528,-0.112116,0.688139,0.571684,0.20766
50%,2014.0,5.432437,9.498955,0.835535,65.050003,0.769821,-0.022671,0.799654,0.663063,0.260671
75%,2018.0,6.30946,10.373216,0.904792,68.5,0.859382,0.09207,0.868827,0.737936,0.322894
max,2022.0,8.018934,11.663788,0.987343,74.474998,0.985178,0.702708,0.983276,0.883586,0.70459


In [11]:
# Column dtypes and non-null counts; a count below the total number of
# entries means that column contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


Checking which years are included, to see whether any year from 2005 to 2022 is missing

In [13]:
# Distinct values of the "year" column, in ascending order.
all_years_df = np.sort(df["year"].unique())
print(all_years_df)

[2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
 2019 2020 2021 2022]


### Missing values

In [15]:
# Count the missing values in each column.
df.isna().sum(axis="index")

Country name                          0
year                                  0
Life Ladder                           0
Log GDP per capita                   20
Social support                       13
Healthy life expectancy at birth     54
Freedom to make life choices         33
Generosity                           73
Perceptions of corruption           116
Positive affect                      24
Negative affect                      16
dtype: int64

### Duplicates

In [17]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

0

In [18]:
# Not run: the check above found 0 duplicated rows, so there is nothing to drop.
#df.drop_duplicates(inplace=True)

# 3) Transforming and cleaning data

### 3.1) Removing null/na/missing values

In [21]:
# Drop every row that has a missing value in any column, then confirm that
# no column has missing values left.
# `df` is rebound to the result rather than mutated with inplace=True, so the
# step stays chainable and the frame shown by earlier cells is not modified
# behind their output.
df = df.dropna()
df.isna().sum()

Country name                        0
year                                0
Life Ladder                         0
Log GDP per capita                  0
Social support                      0
Healthy life expectancy at birth    0
Freedom to make life choices        0
Generosity                          0
Perceptions of corruption           0
Positive affect                     0
Negative affect                     0
dtype: int64

### 3.2) Renaming columns 

In [23]:
# Give the two key columns short, capitalised names.
# NOTE(review): presumably these names match the other datasets this frame
# will be merged with -- confirm against those notebooks.
# `df` is rebound to the result instead of using inplace=True.
df = df.rename(columns={"Country name": "Country", "year": "Year"})

In [24]:
# Preview the first five rows to confirm the renamed columns.
df.head(n=5)

Unnamed: 0,Country,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


In [25]:
# Re-check the entry count, column names and dtypes after dropping rows with
# missing values and renaming columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1958 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country                           1958 non-null   object 
 1   Year                              1958 non-null   int64  
 2   Life Ladder                       1958 non-null   float64
 3   Log GDP per capita                1958 non-null   float64
 4   Social support                    1958 non-null   float64
 5   Healthy life expectancy at birth  1958 non-null   float64
 6   Freedom to make life choices      1958 non-null   float64
 7   Generosity                        1958 non-null   float64
 8   Perceptions of corruption         1958 non-null   float64
 9   Positive affect                   1958 non-null   float64
 10  Negative affect                   1958 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 183.6+ KB


# 4) Storing data frame in file 

In [27]:
# Save the cleaned frame under the key 'happiness_df' in dataframes.h5;
# the context manager closes the store when the block exits.
with pd.HDFStore('dataframes.h5') as store:
    store.put('happiness_df', df)

SyntaxError: invalid syntax (791285630.py, line 1)