In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Just run on 1st instance on Local machine
!pip install plotly plotly-geo
!pip install pycountry_convert

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
#import matplotlib.pyplot as plt
#import seaborn as sns
#import squarify
import pycountry_convert as pc
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Shared two-colour palette (hex codes) reused by the figures in this notebook.
green, red = '#009473', '#dd4124'

# Happiness Index

We'll be doing pure **Exploratory Data Analysis** to extract as much information as possible.
We'll also import a population database to look for any correlation between the Happiness Index data & population.
Since the data is already cleaned, there isn't much to do except renaming a few columns & dropping a few others.

I'll be using only the Plotly library to visualize the data, & I hope to learn the library while doing this analysis.

In [None]:
# Load the three raw CSV inputs from the read-only Kaggle input directory.
# 2021 edition of the world happiness report.
happiness2021 = pd.read_csv(
    '../input/world-happiness-report-2021/world-happiness-report-2021.csv'
)
# Companion world-happiness-report file from the same dataset.
happiness = pd.read_csv(
    '../input/world-happiness-report-2021/world-happiness-report.csv'
)
# Population by country (2020).
pop = pd.read_csv(
    '../input/world-bank-data-1960-to-2016-extended/population-by-country-2020/population_by_country_2020.csv'
)

- Making a copy of data.

In [None]:
# Work on copies so the frames as loaded from disk stay untouched.
df1, df2 = happiness2021.copy(), happiness.copy()

## Let's take an overview of the data & fix a few columns

In [None]:
# Preview the first three rows of the 2021 frame.
df1.head(3)

- We need to fix a few column names

In [None]:
# Replace the verbose report column names on the 2021 frame with short labels
# that are used by the rest of the notebook.
df1 = df1.rename(
    columns={
        'Country name': 'Country',
        'Healthy life expectancy': 'Life Expectancy',
        'Freedom to make life choices': 'Freedom',
        'Perceptions of corruption': 'Corruption',
        'Logged GDP per capita': 'GDP per capita',
    }
)

- We don't need some of the columns, so we drop them for now

In [None]:
# Remove the "Explained by" decomposition, the Dystopia reference values and the
# error/whisker columns from the 2021 frame.
df1 = df1.drop(
    columns=[
        'Explained by: Log GDP per capita',
        'Explained by: Social support',
        'Explained by: Healthy life expectancy',
        'Ladder score in Dystopia',
        'Explained by: Freedom to make life choices',
        'Explained by: Generosity',
        'Explained by: Perceptions of corruption',
        'Dystopia + residual',
        'Standard error of ladder score',
        'upperwhisker',
        'lowerwhisker',
    ]
)

In [None]:
# Preview the first three rows of the second happiness frame.
df2.head(3)

 - df2 seems mostly clean, but I can't make sense of a few columns

In [None]:
# Give df2 the same short column labels as df1, then discard the two affect columns.
df2 = (
    df2
    .rename(
        columns={
            'Country name': 'Country',
            'Healthy life expectancy at birth': 'Life Expectancy',
            'Freedom to make life choices': 'Freedom',
            'Perceptions of corruption': 'Corruption',
            'Log GDP per capita': 'GDP per capita',
            'Life Ladder': 'Ladder score',
        }
    )
    .drop(columns=['Positive affect', 'Negative affect'])
)

In [None]:
# Preview the first three rows of the population frame.
pop.head(3)

In [None]:
# Name the country column 'Country', the key used to merge with the happiness frame.
pop = pop.rename(columns={'Country (or dependency)': 'Country'})

## Happiest & Unhappiest Countries in 2021

We are going to take a look at Top 6 & Bottom 6 Countries in the List. Happiness is measured through the Ladder score.

In [None]:
# Rank countries by ladder score explicitly instead of trusting the row order of
# the CSV, then take both ends of the ranking. A stable sort keeps tied rows in
# their original relative order.
df1_ranked = df1.sort_values('Ladder score', ascending=False, kind='mergesort')
t_6 = df1_ranked.head(6)  # six highest ladder scores
b_6 = df1_ranked.tail(6)  # six lowest ladder scores

In [None]:
# Two side-by-side horizontal bar charts: the top-6 frame on the left in green,
# the bottom-6 frame on the right in red.
fig = make_subplots(
    rows=1, cols=2,
    column_width=[0.5, 0.5],
    subplot_titles=['Top 6 Countries in Index', 'Bottom 6 Countries in Index'],
)

# `append_trace` is deprecated in plotly; `add_trace(..., row=, col=)` is the
# supported way to place a trace in a subplot cell. Both panels share the same
# trace layout, so they are built in one loop.
for panel_col, (panel_df, panel_color) in enumerate([(t_6, green), (b_6, red)], start=1):
    fig.add_trace(
        go.Bar(
            x=panel_df['Ladder score'],
            y=panel_df['Country'],
            orientation='h',
            marker={'color': panel_color, 'line': dict(color=panel_color, width=1)},
            name='',
            text=panel_df['Regional indicator'],  # region shown as the bar text
        ),
        row=1, col=panel_col,
    )

fig.update_layout(
    showlegend=False,
    title_text='Overview of World Happiness Index 2021',
    title_font_size=24,
)
fig.update_annotations(yshift=5)  # nudge the subplot titles slightly above the panels
fig.show()

## Let's see Region wise Happiness Score
### Let's see Ladder Score values in a Boxplot & get mean, median

In [None]:
# Horizontal box plot of the ladder score over all countries.
fig = px.box(
    df1,
    x='Ladder score',
    orientation='h',
    width=800,
    height=300,
    labels={'Ladder score': 'Happiness Score'},
)
fig.update_layout(title_text='Box Plot of Ladder Score', title_font_size=24)
fig.show()

# Country-level mean and median of the ladder score; `median_lad` is reused by
# the world-map cell further down.
mean_lad = df1['Ladder score'].mean()
median_lad = df1['Ladder score'].median()
print('mean=', mean_lad)
print('median=', median_lad)

## Region wise Comparison using mean

In [None]:
# Per-region mean of the key indicators, rounded to 2 decimals and ordered from
# the lowest to the highest mean ladder score.
# The columns are selected with a list: indexing a groupby with a bare tuple of
# several column names was deprecated and raises in pandas >= 2.0.
region_score = (
    df1.groupby('Regional indicator')[
        ['Life Expectancy', 'GDP per capita', 'Corruption', 'Freedom', 'Ladder score']
    ]
    .mean()
    .round(decimals=2)
    .sort_values('Ladder score')
    .reset_index()
)

### Let's see the Table

In [None]:
# Display the per-region summary table.
region_score

In [None]:
# One colour per region bar: green when the region's mean ladder score is above
# the mean over all countries in df1, red otherwise.
# (`color_dict` is a plain list despite its name.)
overall_mean = df1['Ladder score'].mean()
color_dict = [
    green if region_mean > overall_mean else red
    for region_mean in region_score['Ladder score']
]

region_bars = go.Bar(
    x=region_score['Ladder score'],
    y=region_score['Regional indicator'],
    orientation='h',
    marker=dict(color=color_dict, line=dict(color=color_dict, width=1)),
)
fig = go.Figure(data=[region_bars])
fig.update_layout(
    title_text='Mean Ladder score In different Region',
    title_font_size=22,
)
fig.show()

In [None]:
# Bubble chart of the regional means: life expectancy on x, ladder score on y,
# bubble size taken from GDP per capita, one colour per region.
fig = px.scatter(
    data_frame=region_score,
    x="Life Expectancy",
    y="Ladder score",
    size="GDP per capita",
    color="Regional indicator",
    labels={"Ladder score": 'Happiness Score'},
    size_max=7,
    title="Ladder Score vs Life Expectancy vs GDP per capita in various regions:<br>(Size of Bubble describes GDP per Capita)",
    opacity=0.7,
    hover_name='Regional indicator',
    hover_data={"Regional indicator": False},  # already shown as the hover title
)
fig.update_traces(marker_sizemode='diameter')
fig.show()

## Region wise Comparison in different Countries

### Let's First see World Map

In [None]:
# Country names are first aligned with the ISO 3166 spellings so that
# pycountry_convert can translate them into alpha-3 codes.
iso_names = {
    'Taiwan Province of China': 'Taiwan, Province of China',
    'Hong Kong S.A.R. of China': 'Hong Kong',
    'Congo (Brazzaville)': 'Congo',
    'Palestinian Territories': 'Palestine, State of',
}
df_country_code = df1.copy()
df_country_code['Country'] = df_country_code['Country'].replace(iso_names)

# Kosovo (its code is not used by agreement with ISO 3166) and North Cyprus
# (not part of the standard) are removed.
no_iso_code = df_country_code['Country'].isin(['Kosovo', 'North Cyprus'])
df_country_code = df_country_code.drop(index=df_country_code.index[no_iso_code])

# Column with the ISO alpha-3 country code.
df_country_code['c_code'] = df_country_code['Country'].map(pc.country_name_to_country_alpha3)

# One colour per remaining row: red when the ladder score is at or above the
# median, green otherwise.
# NOTE(review): this is the opposite of the green-is-higher convention used for
# the bar charts, and the list is later passed as the continuous colour scale of
# the world map -- confirm the intended palette before changing it.
col_code = [
    red if score >= median_lad else green
    for score in df_country_code['Ladder score']
]

In [None]:
# Making the figure
fig = px.choropleth(df_country_code, locations='c_code',
                    color='Ladder score',
                    hover_name='Country',
                    hover_data={'c_code':False,
                               'GDP per capita': True,
                               'Life Expectancy': True,
                               'Freedom': True},
                    color_continuous_scale=col_code,
                    labels={'Ladder score': 'Happiness Score'},
                    title='World Map View'
                   )
fig.update_traces(showlegend=False)
fig.show()

### Happiness Score vs Life Expectancy vs GDP per capita

In [None]:
# Country-level bubble chart: life expectancy on x, ladder score on y, bubble
# size taken from GDP per capita, one colour per region.
fig = px.scatter(
    data_frame=df1,
    x="Life Expectancy",
    y="Ladder score",
    size="GDP per capita",
    color="Regional indicator",
    labels={"Ladder score": 'Happiness Score'},
    size_max=7,
    title="Ladder Score vs Life Expectancy vs GDP per capita in various Countries: \
                 <br>Size of Bubble describes GDP per Capita",
    hover_name='Country',
    opacity=0.7,
)
fig.update_traces(marker_sizemode='diameter')
fig.show()

We can see that Most Western Europe has Higher Life Expectancy with higher Happiness Index. While Sub-Saharan Africa has comparatively lower Life Expectancy & lower Happiness Index Score. Commonwealth of Independent States are clustered mostly in the middle.

The scatterplot looks roughly linear, so we can assume that countries with happier people tend to live longer.

### Happiness index vs Corruption vs Freedom in different Region

In [None]:

# Country-level bubble chart: freedom on x, corruption on y, bubble size taken
# from the ladder score, one colour per region.
fig = px.scatter(
    data_frame=df1,
    x="Freedom",
    y='Corruption',
    size="Ladder score",
    color="Regional indicator",
    labels={"Ladder score": 'Happiness index Score'},
    size_max=7,
    title="Freedom vs Corruption vs Happiness Score in different Countries<br> \
                 Size of Bubble propertional to Happiness Score",
    opacity=0.7,
    hover_name='Country',
)
fig.update_traces(marker_sizemode='diameter')
fig.show()

We can see that most points are in the upper-right part of the plot.
It explains an inverse relation between Corruption & Freedom.

## Population
Let's add population data into account for further study

In [None]:
# Inner join on 'Country': only countries present in both the happiness frame
# and the population frame are kept.
df1pop = df1.merge(pop, how='inner', on='Country')
df1pop.head()

In [None]:
# Column dtypes and non-null counts of the merged frame.
df1pop.info()

We can see that columns like 'Urban Pop %', 'World Share', 'Med. Age', 'Fert. Rate' etc. are of dtype object, while they should be int or float.
We'll have to fix them.

In [None]:
# Let's see inside those objects
print(df1pop['Urban Pop %'].value_counts(dropna=False))
print(df1pop['World Share'].value_counts(dropna=False))
print(df1pop['Population (2020)'].value_counts(dropna=False))
print(df1pop['Yearly Change'].value_counts(dropna=False))
print(df1pop['Med. Age'].value_counts(dropna=False))
print(df1pop['Fert. Rate'].value_counts(dropna=False))

We have to fix objects with NA values & string values to proper datatype

In [None]:
# Fix those % & NA
df1pop = df1pop[df1pop['Urban Pop %'] != 'N.A.']
df1pop['Urban Pop %'] = df1pop['Urban Pop %'].apply(lambda x: float(x.rstrip('%'))/100)
df1pop['World Share'] = df1pop['World Share'].apply(lambda x: float(x.rstrip('%'))/100)
df1pop['Yearly Change'] = df1pop['Yearly Change'].apply(lambda x: float(x.rstrip('%'))/100)
df1pop['Med. Age'] = df1pop['Med. Age'].astype('float')
df1pop['Fert. Rate'] = df1pop['Fert. Rate'].astype('float')

In [None]:
# We need to bin some values to show them in plot
df1pop['pop_quantile'] = pd.qcut(df1pop['Population (2020)'], 10, labels=False)
df1pop['density_quantile'] = pd.qcut(df1pop['Density (P/Km²)'], 10, labels=False)

### Let's see if there is any pattern between Median Age, Happiness score & Population

In [None]:
# Median age on x, ladder score on y, bubble area driven by the population
# decile, one colour per region.
# The decile codes start at 0, so squaring them directly gives the lowest decile
# a marker size of 0 and those countries are not drawn. Shifting the codes by 1
# before squaring keeps every country visible while preserving the ordering.
bubble_size = (df1pop["pop_quantile"] + 1) ** 2
fig = px.scatter(
    df1pop,
    x="Med. Age",
    y='Ladder score',
    size=bubble_size,
    color="Regional indicator",
    labels={"Ladder score": 'Happiness index Score',
            'pop_quantile': 'Population Quantile'},
    size_max=20,
    title="Happiness Score, Median Age and Population         ",
    opacity=0.7,
    hover_name='Country',
)
fig.update_traces(marker_sizemode='area', showlegend=True)
fig.show()

It seems that **happier** countries have older populations. There is a roughly linear pattern between median age & happiness score.

### Between Fertility Rate vs GDP vs Median Age

In [None]:
# Fertility rate on x, GDP per capita on y, bubble area taken from the median
# age, one colour per region.
fig = px.scatter(
    data_frame=df1pop,
    x="Fert. Rate",
    y='GDP per capita',
    size="Med. Age",
    color="Regional indicator",
    labels={
        "Ladder score": 'Happiness Score',
        'pop_quantile': 'Population Quantile',
        'Fert. Rate': 'Fertility Rate',
    },
    size_max=20,
    title="Between Fertility Rate vs GDP vs Median Age",
    opacity=0.7,
    hover_name='Country',
    hover_data=['Fert. Rate', 'GDP per capita', 'Med. Age', 'Ladder score'],
)
fig.update_traces(marker_sizemode='area', showlegend=True)
fig.show()

As we saw before, happier countries have a higher median age. Here median age is encoded as the marker size, & we can see that the bigger bubbles sit toward the top-left of the plot, i.e. high GDP per capita & low fertility rate.
