In [4]:
!pip install geopandas
!pip install country_converter
import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = 'colab'

from itables import show

# This stops a few warning messages from showing
pd.options.mode.chained_assignment = None 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Machine Learning Packages
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression 
from sklearn import metrics

import country_converter as coco

Collecting country_converter
  Downloading country_converter-1.3.2-py3-none-any.whl.metadata (25 kB)
Downloading country_converter-1.3.2-py3-none-any.whl (47 kB)
Installing collected packages: country_converter
Successfully installed country_converter-1.3.2


## Intro and Background ##

**Through this project, we aim to find out what exactly makes a country happy. To answer this, we must take into account factors such as the economy, the environment, and citizens' health. Our end goal is to find any correlation between these factors and a society's happiness score, and to determine whether any factor outweighs the others.**

In [None]:
# Load the 2024 table, attach an ISO3 code for every 'Country name',
# and drop the 'Regional indicator' column.
cc = coco.CountryConverter()
df_2024 = pd.read_csv('./Data/2024.csv')
df_2024['ISO_A3'] = [cc.convert(country, to='ISO3') for country in df_2024['Country name']]
df_2024 = df_2024.drop(columns='Regional indicator')
df_2024.head()

In [None]:
# Read the country polygons and inner-join them to df_2024 on the shared ISO_A3 code.
geojson_url = "https://datahub.io/core/geo-countries/r/countries.geojson"
gdf = gpd.read_file(geojson_url).rename(columns={'ISO3166-1-Alpha-3': 'ISO_A3'})
merge_df = (
    gdf
    .merge(df_2024, on='ISO_A3')
    .drop(columns='ISO3166-1-Alpha-2')
)
merge_df.head()

In [None]:
# Load the obesity table and give the two columns we keep readable names.
df_obesity = pd.read_csv('./Data/obesity_2022.csv').rename(
    columns={'GEO_NAME_SHORT': 'name', 'RATE_PER_100_N': 'Rate per 100 people'}
)

# Rows to keep: DIM_TIME 2020, DIM_SEX 'TOTAL', and everything except the 'World' row.
# NOTE(review): the file is named obesity_2022 but the filter selects 2020 - confirm the intended year.
main_mask = (
    df_obesity['DIM_TIME'].eq(2020)
    & df_obesity['DIM_SEX'].eq('TOTAL')
    & df_obesity['name'].ne('World')
)

# Columns to keep (a column list, despite the name).
final_mask = ['name', 'Rate per 100 people']

df_obesity = df_obesity.loc[main_mask, final_mask].reset_index(drop=True)
df_obesity.head()

In [None]:
# Attach the obesity rate to the happiness/geometry table by matching on the
# 'name' column that both frames are merged on, then rebuild a GeoDataFrame
# so the result can be drawn on a map.
#
# This cell must not touch obesity_merge_df before the merge creates it:
# renaming it first raises NameError on a fresh kernel, and the merge would
# overwrite that result anyway. 'Country name' is deliberately left unrenamed
# because the obesity choropleth uses it as hover_name.
obesity_merge_df = df_obesity.merge(merge_df, on='name')
obesity_merge_df = gpd.GeoDataFrame(obesity_merge_df, geometry='geometry')
obesity_merge_df.head()

## Exploratory Data Analysis ##

In [None]:
# (rows, columns) of the 2024 happiness table.
df_2024.shape

In [None]:
# Rank countries from highest to lowest 'Ladder score' and renumber the rows.
df_2024 = (
    df_2024
    .sort_values(by='Ladder score', ascending=False)
    .reset_index(drop=True)
)
df_2024

## Proposed Question ##

In [None]:
## World map shaded by each country's 'Ladder score' (world happiness scores)
fig = px.choropleth(
    merge_df,
    geojson=merge_df.geometry,      # polygons are matched to rows through the index
    locations=merge_df.index,
    color="Ladder score",
    hover_name="Country name",
    color_continuous_scale="RdYlBu",
    title="World Happiness Scores",
)

# Trim the side/bottom margins and hide the map frame and coastlines.
fig.update_layout(
    margin=dict(r=0, l=0, b=0),
    geo={"showframe": False, "showcoastlines": False},
)

# Static presentation: no scroll-zoom and no mode bar.
fig.show(config={'scrollZoom': False, 'displayModeBar': False})

From this plot, I can infer that countries in Africa and the Middle East tend to have lower happiness scores. This could be due to lack of resources, political oppression, and ramifications of warfare.

In [None]:
## Plots a geographical map, colored by obesity rate per country
fig_2 = px.choropleth(
    obesity_merge_df,
    geojson=obesity_merge_df.geometry.__geo_interface__,
    locations=obesity_merge_df.index,
    color="Rate per 100 people",        
    hover_name="Country name",      
    color_continuous_scale="Inferno",
    title="Obesity Rate for Various Countries (How many people out of 100 for that country are obese?)"
)

fig_2.update_layout(
    margin={"r":0,"l":0,"b":0},
    geo=dict(showframe=False, showcoastlines=False)
)

fig_2.show(config={
    'scrollZoom': False,          
    'displayModeBar': False       
})

Countries that have a low obesity rate (potentially due to starvation, but this is just an inference) also tend to be the countries in the lower percentiles of happiness scores. Is there a correlation?

In [None]:
# Columns kept for modelling (a column list, despite the name `mask`).
mask = ['Healthy life expectancy', 'Rate per 100 people', 'Ladder score']

# Work on an independent copy restricted to those three columns.
df_model = obesity_merge_df[mask].copy()
df_model.dtypes

In [None]:
# Predictor columns; their order here is the column order of X.
features = ['Healthy life expectancy', 'Rate per 100 people']

# Feature matrix with one column per entry in `features`, and the target series.
X = df_model[features].to_numpy().reshape(-1, len(features))
y = df_model['Ladder score']

In [None]:
# Fit an ordinary least-squares model of y on X; fit() returns the estimator itself.
LM = LinearRegression().fit(X, y)
LM

In [None]:
# Fitted slope for each column of X, in X's column order.
LM.coef_

In [None]:
# Fitted intercept term of the linear model.
LM.intercept_

In [None]:
# R^2 of the model, evaluated on the same X and y it was fitted on.
LM.score(X,y)

In [None]:
# Axis roles used for plotting: x = obesity rate, y = life expectancy, z = happiness.
x_values = 'Rate per 100 people'
y_values = 'Healthy life expectancy'
z_values = 'Ladder score'

prediction_df = df_model.copy()

# LM was fitted on X built from `features`, so predict() must receive the
# columns in that same order. Passing [x_values, y_values] would swap the two
# predictors and apply each coefficient to the wrong column.
prediction_df['Predicted_Score'] = LM.predict(df_model[features].values)

# Residual: actual score minus predicted score.
prediction_df['Error'] = prediction_df[z_values] - prediction_df['Predicted_Score']

prediction_df.head()

In [None]:
# Axis roles for the 3D plot: x = obesity rate, y = life expectancy, z = happiness.
x_values = 'Rate per 100 people'
y_values = 'Healthy life expectancy'
z_values = 'Ladder score'

# LM.coef_ follows the column order of `features` used when fitting, which is
# not the x/y plotting order. Look each slope up by column name so the plane
# uses the right coefficient on each axis.
coef_by_feature = dict(zip(features, LM.coef_))

# Regular grid spanning the observed range of both predictors.
grid_points = 50
x_range = np.linspace(df_model[x_values].min(), df_model[x_values].max(), grid_points)
y_range = np.linspace(df_model[y_values].min(), df_model[y_values].max(), grid_points)
x_grid, y_grid = np.meshgrid(x_range, y_range)

# Height of the fitted regression plane at every grid point.
z_grid = (LM.intercept_
          + coef_by_feature[x_values] * x_grid
          + coef_by_feature[y_values] * y_grid)

# Predictions for the observed rows, computed with the columns in fit order so
# the markers lie exactly on the plane drawn above.
predicted_score = LM.predict(df_model[features].values)

# Actual observations.
fig_3 = px.scatter_3d(df_model,
                      x=x_values,
                      y=y_values,
                      z=z_values,
                      opacity=0.7,
                      title="Happiness: Actual vs Predicted")

# Fitted plane.
fig_3.add_trace(go.Surface(
    x=x_range,
    y=y_range,
    z=z_grid,
    name='Prediction Plane',
    colorscale='Reds',
    showscale=False,
    opacity=0.4
))

# Model prediction for each observation (z is the PREDICTED score here).
fig_3.add_trace(go.Scatter3d(
    x=df_model[x_values],
    y=df_model[y_values],
    z=predicted_score,
    mode='markers',
    marker=dict(size=3, color='red', symbol='x'),
    name='Model Prediction'
))

# Residual segments from each actual point to its prediction. All segments go
# into ONE trace, separated by None gaps, instead of one trace per row, which
# keeps the figure light.
residual_x, residual_y, residual_z = [], [], []
for x_val, y_val, actual, predicted in zip(df_model[x_values],
                                           df_model[y_values],
                                           df_model[z_values],
                                           predicted_score):
    residual_x += [x_val, x_val, None]
    residual_y += [y_val, y_val, None]
    residual_z += [actual, predicted, None]

fig_3.add_trace(go.Scatter3d(
    x=residual_x,
    y=residual_y,
    z=residual_z,
    mode='lines',
    line=dict(color='gray', width=2),
    showlegend=False
))

fig_3.show()

In [None]:
# Count how many residuals (actual minus predicted score) are above -25 versus not.
# NOTE(review): the reason for the -25 cut-off is not shown here - confirm it is still meaningful.
(prediction_df['Error'] > -25).value_counts()

Given the flaws found in my linear regression model, I hypothesize that the inaccuracy of its predictions can be attributed to the fact that happiness scores depend on far more than just health; adding more features may give me better predictions.

### END OF HEALTH SECTION (ELIAS) ###

In [None]:
# Locations of the source files, all inside '../Final_project/Main Data/'.
file_location = '../Final_project/Main Data/2019.csv'
file_name = '../Final_project/Main Data/iceland_benefits.xlsx'
file_name2 = '../Final_project/Main Data/iceland_income_support.xlsx'
file_name3 = '../Final_project/Main Data/GDP%.xlsx'
file_name4 = '../Final_project/Main Data/social spending.csv'

# CSV sources.
DF = pd.read_csv(file_location)
DF_SS = pd.read_csv(file_name4)

# Excel sources.
DF_ben = pd.read_excel(file_name)
DF_inc = pd.read_excel(file_name2)
DF_GDP = pd.read_excel(file_name3)

In [None]:
# The ten rows with the highest 'Social support' values.
DF.sort_values('Social support', ascending=False).head(10)

In [None]:
# Social-spending rows for 2019, ordered from the smallest to the largest share of GDP.
mask = DF_SS['Year'].eq(2019)
DF_SS.loc[mask].sort_values('Public social expenditure as a share of GDP')

A strong welfare program is usually a good indicator of a healthy country: one with an economy stable enough to afford a supportive federal program to help its population. The DF_SS dataset, according to the website it was pulled from, covers, among others, health, old age, incapacity-related benefits, family, active labor market programmes, unemployment, and housing. A few notable countries at the top of the Social support category in the happiness dataset, which we want to look at in the following datasets, are Finland, Denmark, and Norway.

In [None]:
# Keep the identifying columns plus the 2019 value, only for rows where 2019 is present.
col = ['Country Name', 'Indicator Name', 2019]
m = DF_GDP[2019].notna()
DF_GDP = DF_GDP.loc[m, col]

# The 11 rows with the smallest 2019 values (end of a descending sort).
DF_GDP.sort_values(by=2019, ascending=False).tail(11)

In [None]:
# The ten rows with the lowest 'Social support' values (end of a descending sort).
DF.sort_values('Social support', ascending=False).tail(10)

We made a few assumptions going into this project, namely that Social Support and GDP per Capita were big contributors to happiness in a country. Our reasoning is that the more financial support and access to welfare a population has, the lower its chance of falling into poverty. Each value in the %GDP dataset is the share of a country's GDP spent on domestic general government health expenditure, otherwise known as healthcare, so lower values mean proportionally less public health spending. As we can see in the DF_GDP dataset, some of the lowest values coincide with some of the lowest Social Support scores from the world happiness dataset, mainly Chad, Haiti, Afghanistan, and Benin.

In [None]:
# Build both row filters inside this cell rather than reusing the notebook-level
# `mask` and `m` variables. `mask` is also assigned a list of column names
# elsewhere in the notebook, and `m` was computed on DF_GDP before it was
# filtered, so relying on them makes the result depend on cell execution order.
ss_2019 = DF_SS[DF_SS['Year'] == 2019]
gdp_2019 = DF_GDP[DF_GDP[2019].notnull()]

# Left joins keep every row of the happiness table, even without a match.
DF_temp = pd.merge(DF, ss_2019,
                   left_on='Country or region',
                   right_on='Entity',
                   how="left")

DF_new = pd.merge(DF_temp, gdp_2019,
                  left_on='Country or region',
                  right_on='Country Name',
                  how="left")

In [None]:
# Order rows by the 2019 health-expenditure column, then keep 'Social support' >= 1.
DF_new1 = DF_new.sort_values(by=2019)
mask1 = DF_new1['Social support'].ge(1)

# Social support vs. social expenditure, coloured by the 2019 column, with an OLS trendline.
fig = px.scatter(
    DF_new1.loc[mask1],
    x='Social support',
    y='Public social expenditure as a share of GDP',
    color=2019,
    hover_data='Country or region',
    trendline='ols',
)

# Human-readable axis and colour-bar titles.
axis_titles = dict(
    xaxis_title='Country Score "Social Support"',
    yaxis_title='Social Expenditure as %GDP',
    coloraxis_colorbar_title_text='General Health Expenditure %GDP',
)
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# NOTE(review): this cell repeats the preceding scatter-plot cell line for line -
# confirm whether a different chart was intended here.
# Order rows by the 2019 health-expenditure column, then keep 'Social support' >= 1.
DF_new1 = DF_new.sort_values(by=2019)
mask1 = DF_new1['Social support'].ge(1)

# Social support vs. social expenditure, coloured by the 2019 column, with an OLS trendline.
fig = px.scatter(
    DF_new1.loc[mask1],
    x='Social support',
    y='Public social expenditure as a share of GDP',
    color=2019,
    hover_data='Country or region',
    trendline='ols',
)

# Human-readable axis and colour-bar titles.
axis_titles = dict(
    xaxis_title='Country Score "Social Support"',
    yaxis_title='Social Expenditure as %GDP',
    coloraxis_colorbar_title_text='General Health Expenditure %GDP',
)
fig.update_layout(**axis_titles)
fig.show()

Now to explain some outliers: these numbers are percentages of GDP, meaning smaller economies that spend a relatively equal amount as bigger countries will have higher scores. For instance, Greece has a much smaller population than a lot of other countries, but it spends a large portion of its GDP on welfare and social support. High scores here don't always equate to happiness, as Greece shows, but as we can see there is a general positive relationship between a government's social expenditure, health expenditure, and how much a population believes they receive support from their government.