In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from yellowbrick.regressor import ResidualsPlot
import statsmodels.formula.api as sm

# Load and Transformation

In [2]:
# OECD export used for the Better Life data (source: https://homl.info/4)
better_life = pd.read_csv(filepath_or_buffer="better_life_index_filtered.csv")

In [3]:
# IMF export, semicolon-delimited (source: https://homl.info/5)
gdp = pd.read_csv("gdp.csv", delimiter=";")

In [4]:
# Parse the text column "USD_Billions" into numbers:
#   1. decimal comma -> decimal point
#   2. the literal placeholder 'no data' -> "0"
#   3. convert to a numeric dtype
# Vectorised string methods are used instead of element-wise re.sub so that
# empty cells (NaN) pass through as NaN rather than raising a TypeError.
# NOTE(review): 'no data' still becomes 0, exactly as before; a zero is
# indistinguishable from a real value downstream - consider NaN instead.
gdp["USD_Billions"] = pd.to_numeric(
  gdp["USD_Billions"]
  .str.replace(",", ".", regex=False)
  .replace("no data", "0")
)

In [5]:
# IMF export, semicolon-delimited (source: https://homl.info/5)
gdp_per_capita = pd.read_csv("gdp_per_capita.csv", delimiter=";")

In [6]:
# Parse the text column "USD_Thousand" into numbers:
#   1. decimal comma -> decimal point
#   2. the literal placeholder 'no data' -> "0"
#   3. convert to a numeric dtype
# Vectorised string methods are used instead of element-wise re.sub so that
# empty cells (NaN) pass through as NaN rather than raising a TypeError.
# NOTE(review): 'no data' still becomes 0, exactly as before; a zero is
# indistinguishable from a real value downstream - consider NaN instead.
gdp_per_capita["USD_Thousand"] = pd.to_numeric(
  gdp_per_capita["USD_Thousand"]
  .str.replace(",", ".", regex=False)
  .replace("no data", "0")
)

# Rescale to thousands, keeping three decimals.
# The element-wise round() is kept on purpose: a vectorised Series.round can
# differ in the last digit for half-way values, which would change results.
gdp_per_capita["USD_Thousand"] = gdp_per_capita["USD_Thousand"].apply(
  lambda usd: round(usd / 1000, 3)
)

In [7]:
# Remove the columns that none of the cells below reference.
# Note that the "Inequality" label column is dropped while the "INEQUALITY"
# code column is kept: it is used later to filter on "TOT".
better_life = better_life.drop(
  [
    "STRUCTURE",
    "STRUCTURE_NAME",
    "ACTION",
    "STRUCTURE_ID",
    "MEASURE",
    "Measure",
    "LOCATION",
    "Inequality",
    "Observation Value",
    "Observation Status",
    "Base reference period",
    "BASE_PER",
    "Multiplier",
    "UNIT_MEASURE",
    "OBS_STATUS",
    "UNIT_MULT",
    "INDICATOR",
  ],
  axis="columns",
)

# Life Satisfaction

In [8]:
# Keep only the 'Life satisfaction' rows for the total population
# (INEQUALITY == 'TOT'), drop the columns that were only needed for the
# filter, and give the two remaining columns analysis-friendly names.
life_satisfaction = (
  better_life
  .loc[
    lambda frame: frame["Indicator"].eq("Life satisfaction")
    & frame["INEQUALITY"].eq("TOT")
  ]
  .drop(columns=["Indicator", "Unit of Measures", "INEQUALITY"])
  .set_axis(["Country", "Satisfaction_Score"], axis="columns")
)

In [9]:
# Join total GDP with GDP per capita on "Country" (inner join: only countries
# present in both files survive), then rename the columns by position.
df = pd.merge(gdp, gdp_per_capita, on="Country", how="inner").set_axis(
  ["Country", "GDP_USD_Billion", "GDP_PerCapita_USD_Thousand"],
  axis="columns",
)

In [10]:
# Attach the satisfaction score; the inner join keeps only countries that
# appear in both the GDP data and the life-satisfaction data.
df = pd.merge(df, life_satisfaction, on="Country", how="inner")

In [11]:
# Highest satisfaction score first
df = df.sort_values("Satisfaction_Score", ascending=False)

In [12]:
# Plain NumPy arrays for the correlation and regression steps:
# x1 = GDP per capita, x2 = total GDP, y = satisfaction score (the target).
x1 = df["GDP_PerCapita_USD_Thousand"].to_numpy()
x2 = df["GDP_USD_Billion"].to_numpy()
y = df["Satisfaction_Score"].to_numpy()

In [13]:
# Pearson correlation of each predictor with the satisfaction score, rounded
# to three decimals. np.corrcoef returns a 2x2 matrix; the off-diagonal
# entry [1, 0] is the correlation between the two inputs.
# Author's observation: GDP per capita correlates much more strongly with
# satisfaction than total GDP does.
correlacao1 = round(np.corrcoef(x1, y)[1, 0], 3)
correlacao2 = round(np.corrcoef(x2, y)[1, 0], 3)

In [None]:
# Report both correlations, one per line
print(
  f"  Correlation between GDP per Capita and Satisfaction Score: {correlacao1}\n"
  f"  Correlation between GDP and Satisfaction Score {correlacao2}"
)

In [None]:
# scikit-learn expects a 2-D feature matrix, so turn x1 into a single column
x1 = np.reshape(x1, (-1, 1))

# Fit an ordinary least-squares line: satisfaction score ~ GDP per capita
modelo = LinearRegression()
modelo.fit(x1, y)

In [None]:
# Show the fitted intercept and the single slope coefficient (3 decimals)
print(
  f"  Intercept:{round(modelo.intercept_, 3)} \n"
  f"  Coeficiente: {round(modelo.coef_[0], 3)}"
)

In [17]:
# Pick the countries to label on the chart, together with their
# (GDP per capita, satisfaction score) coordinates.
# The ranking is computed once and columns are addressed by name, instead of
# re-sorting for every value and relying on positional column indices.
gdp_per_capita_ranked = df.sort_values(
  by="GDP_PerCapita_USD_Thousand", ascending=False
)

# Country with highest GDP Per Capita
first_gdp_per_capita = gdp_per_capita_ranked["Country"].iloc[0]
# Position on chart
first_gdp_per_capita_position = (
  gdp_per_capita_ranked["GDP_PerCapita_USD_Thousand"].iloc[0],
  gdp_per_capita_ranked["Satisfaction_Score"].iloc[0],
)

# Country with second highest GDP Per Capita
second_gdp_per_capita = gdp_per_capita_ranked["Country"].iloc[1]
# Position on chart
second_gdp_per_capita_position = (
  gdp_per_capita_ranked["GDP_PerCapita_USD_Thousand"].iloc[1],
  gdp_per_capita_ranked["Satisfaction_Score"].iloc[1],
)

# Country with lowest GDP Per Capita (nsmallest is kept so that the row
# chosen among ties is the same as before)
lowest_gdp_per_capita_row = df.nsmallest(1, "GDP_PerCapita_USD_Thousand")
last_gdp_per_capita = lowest_gdp_per_capita_row["Country"].iloc[0]
# Position on chart
last_gdp_per_capita_position = (
  lowest_gdp_per_capita_row["GDP_PerCapita_USD_Thousand"].iloc[0],
  lowest_gdp_per_capita_row["Satisfaction_Score"].iloc[0],
)

In [None]:
# Scatter plot of GDP per capita against satisfaction score, with the fitted
# regression line, the correlation in a text box, and arrows labelling the
# two highest and the lowest GDP-per-capita countries.
titlefont1 = {'family':'serif','color':'blue','size':16, 'weight':'bold'}
axis_label_font = {"fontsize":11, "weight":'bold'}

plt.scatter(x1, y)
plt.plot(x1, modelo.predict(x1), color="red")
plt.xlabel("GDP Per Capita - $ Thousands", fontdict=axis_label_font)
plt.ylabel("Satisfaction Score", fontdict=axis_label_font)
plt.title(("Does money bring happiness?").upper(), fontdict=titlefont1, y=1.05)

# Correlation box; x/y are data coordinates chosen by hand
plt.text(
  x=15,
  y=8.1,
  s=f"Correlation: {round(correlacao1, 3)}",
  fontsize=12,
  bbox=dict(facecolor="white", alpha=1)
)

# Arrow tips sit 0.1 below/left of the data point; label positions (xytext)
# are hand-placed data coordinates.
plt.annotate(
  first_gdp_per_capita,
  xy=tuple(coord - 0.1 for coord in first_gdp_per_capita_position),
  fontsize=10,
  xytext=(136,6.9),
  arrowprops=dict(facecolor='red'),
  color='b'
)

plt.annotate(
  second_gdp_per_capita,
  xy=tuple(coord - 0.1 for coord in second_gdp_per_capita_position),
  fontsize=10,
  xytext=(122,6.5),
  arrowprops=dict(facecolor='red'),
  color='b'
)

# Point the arrow at the computed data position rather than a hard-coded
# coordinate, so the label stays on the right marker if the data changes.
plt.annotate(
  last_gdp_per_capita,
  xy=last_gdp_per_capita_position,
  fontsize=10,
  xytext=(26,4.85),
  arrowprops=dict(facecolor='red'),
  color='b'
)
plt.show()

In [None]:
# Residuals plot for the fitted linear model
residuals = ResidualsPlot(modelo)
residuals.fit(x1, y)
# show() is the supported replacement for poof(), which Yellowbrick
# deprecated in its 1.0 release.
residuals.show()
plt.show()

# Create a dataframe for each 'Indicator'

In [20]:
# Lookup table with one row per distinct indicator:
#   Indicator - the label as it appears in better_life
#   Variable  - the same label lower-cased, with spaces replaced by "_"
# Variable is derived from the de-duplicated labels, so every row pairs a
# label with its own variable name. Zipping two independent .unique() calls
# would misalign the pairs if two labels normalised to the same name.
indicator = (
  better_life[["Indicator"]]
  .drop_duplicates()
  .reset_index(drop=True)
  .assign(
    Variable=lambda frame: (
      frame["Indicator"].str.lower().str.replace(" ", "_", regex=False)
    )
  )
  .loc[:, ["Variable", "Indicator"]]
)

In [None]:
# Base frame for the per-indicator split: better_life without its
# metadata/code columns (here the "INEQUALITY" code column is dropped too).
# errors="ignore" is required because an earlier cell already removed most
# of these columns from better_life; without it, running the notebook top
# to bottom raises a KeyError here.
# NOTE: this rebinds `df`, which previously held the merged GDP/satisfaction
# frame.
df = better_life.drop(
  columns=[
    "STRUCTURE", "STRUCTURE_NAME", "ACTION", "STRUCTURE_ID", "MEASURE",
    "Measure", "LOCATION", "INEQUALITY", "Observation Value",
    "Observation Status", "Base reference period", "BASE_PER", "Multiplier",
    "UNIT_MEASURE", "OBS_STATUS", "UNIT_MULT", "INDICATOR"
    ],
  errors="ignore"
  )

In [17]:
# Build one filtered frame per indicator, keyed by the variable name from the
# `indicator` table. The dict makes every frame reachable, including names
# that are not valid Python identifiers (e.g. ones containing "-").
indicator_frames = {
  variable: df[better_life["Indicator"] == label]
  for variable, label in zip(indicator["Variable"], indicator["Indicator"])
}

# Still publish each frame as a notebook-level variable, as before, but via
# globals() rather than by writing through locals().
# NOTE(review): the name derived from "Life satisfaction" is
# `life_satisfaction`, so this overwrites the frame of that name built
# earlier in the notebook.
globals().update(indicator_frames)

In [None]:
# List the variable names generated for the indicators, one per line
for variable_name in indicator["Variable"]:
  print(variable_name)