# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import Data

In [2]:
data = pd.read_excel('DataForTable2.1.xls', dtype=str) # was getting errors so imported as string

data.sample(5)

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
1002,Italy,2019,6.445416927337647,10.66286849975586,0.8384024500846863,71.9000015258789,0.7094788551330566,-0.0881169885396957,0.8655280470848083,0.5688936114311218,0.3279600739479065
373,Chad,2007,4.141326904296875,7.367519378662109,0.4789508581161499,47.459999084472656,0.294611781835556,-0.0170876756310462,0.8736096024513245,0.5975165367126465,0.2452083677053451
253,Botswana,2016,3.498936653137207,9.573143005371094,0.7683027982711792,53.07500076293945,0.8516948819160461,-0.2459289282560348,0.7291718125343323,0.6570554971694946,0.2518365979194641
1024,Jamaica,2019,6.309238910675049,9.234482765197754,0.8778144717216492,66.5999984741211,0.8906708359718323,-0.1458125114440918,0.8853300213813782,0.7217233777046204,0.1952841430902481
459,Congo (Brazzaville),2020,5.079139232635498,8.127007484436035,0.5965151190757751,56.525001525878906,0.7607275247573853,-0.0242626536637544,0.7277960181236267,0.5720909237861633,0.4346755743026733


## Notes on Data

[FAQ](https://worldhappiness.report/faq/)
- Life Ladder = Quality of life, scale of 1 - 10. 10 is best
- Countries are compared to Dystopia, a fictional country that serves as the least happy country and no country can be less happy that Dystopia

See [Appendix 1 from 2024](https://happiness-report.s3.amazonaws.com/2024/Ch2+Appendix.pdf) for more information about the variables if you need.
- Log GDP per capita - straightforward
- Social Support - Counting on someone when times are tough; national average of binary response
- Healthy life expectancy - straightforward
- Freedom to make life choices - national avg of binary response
- Generosity - residual of regressing national average of response to the GWP question “Have you donated money to a charity in the past month?” on GDP per capita.
- Perceptions of Corruption - average of the perception at the individual level
- Positive effect -  Positive affect is defined as the average of three positive affect measures in GWP: laugh, enjoyment and doing interesting things in the Gallup World Poll
- Negative affect - is defined as the average of three negative affect measures in GWP. They are worry, sadness and anger,

# Data Feeling and Cleaning

In [13]:
#check shape of data
data.shape

(2363, 11)

In [14]:
#check column names, nulls, and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2363 entries, 0 to 2362
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Country name                      2363 non-null   object
 1   year                              2363 non-null   object
 2   Life Ladder                       2363 non-null   object
 3   Log GDP per capita                2335 non-null   object
 4   Social support                    2350 non-null   object
 5   Healthy life expectancy at birth  2300 non-null   object
 6   Freedom to make life choices      2327 non-null   object
 7   Generosity                        2282 non-null   object
 8   Perceptions of corruption         2238 non-null   object
 9   Positive affect                   2339 non-null   object
 10  Negative affect                   2347 non-null   object
dtypes: object(11)
memory usage: 203.2+ KB


In [15]:
#number of unique values per features
data.nunique()

Country name                         165
year                                  19
Life Ladder                         2363
Log GDP per capita                  2334
Social support                      2350
Healthy life expectancy at birth    1126
Freedom to make life choices        2326
Generosity                          2282
Perceptions of corruption           2238
Positive affect                     2339
Negative affect                     2347
dtype: int64

In [16]:
#check for duplicates
data.duplicated().sum()

0

In [17]:
#number of nulls per feature
data.isna().sum()

Country name                          0
year                                  0
Life Ladder                           0
Log GDP per capita                   28
Social support                       13
Healthy life expectancy at birth     63
Freedom to make life choices         36
Generosity                           81
Perceptions of corruption           125
Positive affect                      24
Negative affect                      16
dtype: int64

In [20]:
#need to change datatype
for feature in data.columns:
    if feature != 'Country name':
        if feature == 'year':
            data[feature] = data[feature].astype(int)
        else:
            data[feature] = data[feature].astype(float)

#check change
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2363 entries, 0 to 2362
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2363 non-null   object 
 1   year                              2363 non-null   int64  
 2   Life Ladder                       2363 non-null   float64
 3   Log GDP per capita                2335 non-null   float64
 4   Social support                    2350 non-null   float64
 5   Healthy life expectancy at birth  2300 non-null   float64
 6   Freedom to make life choices      2327 non-null   float64
 7   Generosity                        2282 non-null   float64
 8   Perceptions of corruption         2238 non-null   float64
 9   Positive affect                   2339 non-null   float64
 10  Negative affect                   2347 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 203.2+ KB


# Exploratory Data Analysis

In [37]:
nulls = data.isna().sum()
total_nulls = nulls.values.sum()
original_size = data.shape[0]

loss = (((original_size - total_nulls) / original_size) - 1) * 100
print(f"Removing the nulls results in a datset that is {loss} percent smaller than the original")

Removing the nulls results in a datset that is -16.33516716038934 percent smaller than the original


For the nulls, I could remove them, but I think I'm going to try and take the mean or average per feature on a country by country basis. Removing the nulls isn't all that bad considering we only lose about 16% of the data, but wouldn't hurt to compare.

In [41]:
#dealing with nulls in Log GDP per capita
#function to get average per country
def get_average(dataframe, feature):
    #groupby country and get mean
    my_mean = dataframe.groupby('Country name')[feature].mean()
    return my_mean

#get median
def get_median(dataframe, feature):
    my_median = dataframe.groupby('Country name')[feature].median()
    return my_median

#fill in nas
def fill_nulls(dataframe, feature, value=str):
    my_nulls = dataframe[dataframe[feature].isna()] ###fuuuuuuuuuuuuuck
    if value == 'mean':
        my_mean = get_average(dataframe, feature)
        dataframe[feature] = dataframe[feature].fillna(my_mean)
    elif value == 'median':
        my_median = get_median(dataframe, feature)
        dataframe[feature] = dataframe[feature].fillna(my_median)
    else:
        print("Please specify mean or median for the value parameter.")
    return dataframe



In [45]:
get_average(data, ['Log GDP per capita'])

Unnamed: 0_level_0,Log GDP per capita
Country name,Unnamed: 1_level_1
Afghanistan,7.585624
Albania,9.416875
Algeria,9.338236
Angola,8.985547
Argentina,10.028808
...,...
Venezuela,8.588995
Vietnam,8.977185
Yemen,7.925330
Zambia,8.058288


In [44]:
test = fill_nulls(data.copy(), 'Log GDP per capita', value='mean')
test[test['Log GDP per capita'].isna()]

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
13,Afghanistan,2022,1.281271,,0.228217,54.875,0.368377,,0.733198,0.205868,0.575512
14,Afghanistan,2023,1.445909,,0.368478,55.200001,0.228301,,0.738471,0.260513,0.460167
507,Cuba,2006,5.417869,,0.969595,68.0,0.281458,,,0.596187,0.276602
523,Cyprus,2023,6.070594,,0.802831,73.199997,0.72981,,0.839832,0.681513,0.296864
1100,Kosovo,2007,5.103906,,0.847812,,0.381364,,0.894462,0.613723,0.236699
1218,Libya,2023,5.970289,,0.748157,66.099998,0.762223,,0.643733,0.584836,0.372078
1326,Malta,2023,6.294855,,0.911656,71.699997,0.850815,,0.780031,0.643816,0.361336
1851,Singapore,2023,6.653942,,0.916326,74.0,0.861233,,0.152543,0.667157,0.190486
1886,Somaliland region,2009,4.9914,,0.879567,,0.746304,,0.513372,0.707874,0.112012
1887,Somaliland region,2010,4.657363,,0.829005,,0.820182,,0.471094,0.631947,0.083426
