WORLD HAPPINESS REPORT

In [None]:
# Importing all the project pre-requisites
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
# Activate seaborn's default plot styling for the figures drawn below
sb.set()

# Import essential models and functions from sklearn
# NOTE(review): the first import below is redundant -- the line after it
# imports train_test_split again, together with GridSearchCV.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
import graphviz

# Using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Import essential models and functions from plotly
import plotly
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Allows us to use Plotly offline (figures are rendered inside the notebook via iplot)
init_notebook_mode(connected=True)

In [None]:
# Load the two 2018 source tables from the local data folder
data1, data2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/factor_of_happiness2018.csv",
        "data/factor_of_happiness2018(2).csv",
    )
)

# Quick sanity check on the second table: container type and (rows, columns)
print("Data type : ", type(data2))
print("Data dims : ", data2.shape)

In [None]:
# Preview the first five rows of the second table
data2.head(n=5)

In [None]:
# Information about the Variables:
# prints each column's name, non-null count and dtype for the second table
data2.info()

In [None]:
# Sort the first table alphabetically by country (instead of by Happiness
# Score) and renumber its rows from 0, so that it can be joined side by side
# with the second table. A new frame is assigned rather than sorting in place.
data1 = data1.sort_values(by=["Country"], ascending=True).reset_index(drop=True)

# Keep only the first 156 rows.
# NOTE(review): 156 is presumably the number of rows in data2 -- confirm.
data1 = data1.iloc[:156]

# Column-wise join; rows are paired by index label.
# NOTE(review): this assumes data2 lists the countries in the same
# alphabetical order as the sorted data1 -- verify against the CSV.
data = pd.concat([data1, data2], axis=1)

# Convert all Variable Names to UPPERCASE
data.columns = data.columns.str.upper()

# Normalise the Variable Names: spaces become underscores; dots, colons,
# plus signs and commas are removed.
# regex=False states explicitly that these are literal characters: "." and
# "+" are regex metacharacters, and relying on the implicit default emits a
# FutureWarning on pandas 1.x and depends on a default that changed in 2.0.
data.columns = (
    data.columns
    .str.replace(".", "", regex=False)
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("+", "", regex=False)
    .str.replace(",", "", regex=False)
)

In [None]:
# Columns kept for the analysis: the country label, the target score and
# the explanatory factors (names as produced by the column clean-up step)
extracts = [
    "COUNTRY",
    "HAPPINESS_SCORE",
    "LOG_OF_GDP_PER_PERSON_2015-2017",
    "GDP_PER_PERSON_2015-2017",
    "HEALTHY_LIFE_EXPECTANCY_2015-2017",
    "SOCIAL_SUPPORT_2015-2017",
    "FREEDOM_TO_MAKE_LIFE_CHOICES_2015-2017",
    "GENEROSITY_2015-2017_WITHOUT_ADJUSTMENT_FOR_GDP_PER_PERSON",
    "PERCEPTIONS_OF_CORRUPTION_2015-2017",
]

# Narrow the working frame down to just those columns
data = pd.DataFrame(data[extracts])

In [None]:
# Shorten the verbose column names (drop the "2015-2017" suffixes etc.)
# in a single rename pass for easier readability
data = data.rename(
    columns={
        "GDP_PER_PERSON_2015-2017": "GDP_PER_PERSON",
        "LOG_OF_GDP_PER_PERSON_2015-2017": "LOG_OF_GDP_PER_PERSON",
        "FREEDOM_TO_MAKE_LIFE_CHOICES_2015-2017": "FREEDOM",
        "HEALTHY_LIFE_EXPECTANCY_2015-2017": "HEALTHY_LIFE_EXPECTANCY",
        "SOCIAL_SUPPORT_2015-2017": "SOCIAL_SUPPORT",
        "GENEROSITY_2015-2017_WITHOUT_ADJUSTMENT_FOR_GDP_PER_PERSON": "GENEROSITY",
        "PERCEPTIONS_OF_CORRUPTION_2015-2017": "PERCEPTIONS_OF_CORRUPTION",
    }
)

In [None]:
# Keep only the first occurrence of every column name, then preview the result
data = data.loc[:, ~data.columns.duplicated(keep="first")]
data.head(n=5)

In [None]:
# World map coloured by Happiness Score.
# The trace is bound to its own name instead of `data1`: reusing `data1`
# here would overwrite the DataFrame loaded earlier with a plain dict, so
# re-running the sorting cell afterwards would fail.
choropleth_trace = dict(
    type='choropleth',
    locations=data['COUNTRY'],        # matched against country names
    locationmode='country names',
    z=data['HAPPINESS_SCORE'],        # value that drives the colour scale
    text=data['COUNTRY'],             # hover label
    colorbar={'title': 'Happiness'},
)

# Figure-level settings: title, no frame around the map, Mercator projection
layout = dict(
    title='Happiness Index 2018',
    geo=dict(
        showframe=False,
        projection={'type': 'mercator'},
    ),
)

choromap3 = go.Figure(data=[choropleth_trace], layout=layout)
iplot(choromap3)

In [None]:
# Discard every row that has at least one missing value and renumber the
# remaining rows from 0 so the index is contiguous again
data = data.dropna(axis=0, how='any').reset_index(drop=True)

In [None]:
# Descriptive statistics of every variable, rounded to two decimals
data.describe().round(decimals=2)

In [None]:
# Calculate the complete correlation matrix of the numeric variables.
# The frame also holds the text column COUNTRY; on pandas >= 2.0 calling
# DataFrame.corr() with non-numeric columns raises instead of silently
# skipping them, so the numeric columns are selected explicitly first.
data.select_dtypes(include="number").corr().round(2)

In [None]:
# Heatmap of the Correlation Matrix
f, axes = plt.subplots(1, 1, figsize=(12, 8))

# Only numeric columns can be correlated: the text column COUNTRY makes
# DataFrame.corr() raise on pandas >= 2.0, so it is excluded explicitly.
sb.heatmap(
    data.select_dtypes(include="number").corr(),
    vmin=-1,
    vmax=1,
    annot=True,     # print the coefficient inside each cell
    fmt=".2f",
    ax=axes,        # draw on the axes created above rather than the implicit current one
)
axes.set_title("Correlation Matrix");

In [None]:
# Single-column frame holding just the Happiness Score, plus its summary statistics
happiness_score = data["HAPPINESS_SCORE"].to_frame()
happiness_score.describe().round(decimals=2)

In [None]:
# Distribution of the Happiness Score:
# histogram with a KDE curve (left) and a violin plot (right)
f, axes = plt.subplots(1, 2, figsize=(15, 4))

# sb.distplot is deprecated; histplot with kde=True and stat="density" is
# its replacement and keeps the density-normalised histogram plus KDE overlay.
sb.histplot(
    x=happiness_score["HAPPINESS_SCORE"],
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
    color="g",
    ax=axes[0],
)

# Pass the scores by keyword: a positional first argument is interpreted as
# `x` by older seaborn releases and as `data` by newer ones, which changes
# the orientation of the plot depending on the installed version.
sb.violinplot(x=happiness_score["HAPPINESS_SCORE"], color="r", ax=axes[1]);

In [None]:
# Derive a Happiness Category from every Happiness Score:
#   score < 4        -> "UNHAPPY"
#   4 <= score <= 6  -> "NORMAL"
#   score > 6        -> "HAPPY"
# np.select takes the first condition that matches, so the second condition
# only has to test the upper bound.
# NOTE: a missing (NaN) score matches neither condition and therefore falls
# through to "HAPPY", exactly as the former loop did.
# NOTE(review): presumably rows with missing values have already been
# dropped before this cell runs -- confirm.
happiness = np.select(
    [
        happiness_score["HAPPINESS_SCORE"] < 4,
        happiness_score["HAPPINESS_SCORE"] <= 6,
    ],
    ["UNHAPPY", "NORMAL"],
    default="HAPPY",
).tolist()

# Categorical single-column frame, indexed like the scores it was derived from
pred_happiness = pd.DataFrame(
    {"PREDICTED_HAPPINESS": pd.Categorical(happiness)},
    index=happiness_score.index,
)

# Attach the category to the main dataframe. Assigning the column (instead of
# concatenating) keeps this cell idempotent: running it again overwrites the
# column rather than appending a duplicate one.
data["PREDICTED_HAPPINESS"] = pred_happiness["PREDICTED_HAPPINESS"]

In [None]:
# Keep only the first occurrence of every column name, then preview the result
data = data.loc[:, ~data.columns.duplicated(keep="first")]
data.head(n=5)

In [None]:
# Number of countries in each happiness category
data.loc[:, "PREDICTED_HAPPINESS"].value_counts()

In [None]:
# Bar chart: number of countries per happiness category
f, axes = plt.subplots(1, 1, figsize=(5, 4))

# Pass the column by keyword: a positional first argument is interpreted as
# `x` by older seaborn releases but as `data` by newer ones, so the plot
# would depend on the installed version.
sb.countplot(x=data["PREDICTED_HAPPINESS"], ax=axes)
axes.set_xlabel("Predicted Happiness")
axes.set_ylabel("Number of Countries");