# **Data Visualization final project - World Happiness Report**

In [None]:
#libraries import
import pandas as pd
import numpy as np
import seaborn as sns
from google.colab import files, drive
import io
import matplotlib.pyplot as plt
from collections import Counter
!pip install pycountry_convert
import pycountry_convert as pc

In [None]:
# functions
def Detect_Outliers(df,n,features):
    """Return the index labels of rows that are IQR outliers in more than ``n`` features.

    For every column in ``features`` a value counts as an outlier when it lies
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. Missing values are
    ignored when the quartiles are computed and are never flagged themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is returned only when it is an outlier in strictly more than
        ``n`` of the inspected columns.
    features : iterable
        Column labels of ``df`` to inspect.

    Returns
    -------
    list
        Index labels of the rows that exceed the threshold.
    """
    outlier_counts = Counter()  # index label -> number of columns in which that row is an outlier
    for col in features:
        # np.percentile returns NaN as soon as the column contains a missing
        # value, which makes both comparisons below False and hides every
        # outlier of that column; nanpercentile skips the missing values.
        Q1, Q3 = np.nanpercentile(df[col], [25, 75])
        outlier_step = 1.5 * (Q3 - Q1)  # 1.5 * interquartile range
        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
        outlier_counts.update(df[is_outlier].index)
    return [idx for idx, count in outlier_counts.items() if count > n]

def condition(x):
    """Return the continent for a country label as it is written in the dataset.

    A few dataset labels are first replaced by another name (see the table
    below); every other label is handed to ``country_to_continent`` unchanged.
    """
    # dataset label -> name that is looked up instead
    substitute_names = {
        'Congo (Brazzaville)': 'Congo',
        'Congo (Kinshasa)': 'Congo',
        'Kosovo': 'Serbia',
        'North Cyprus': 'Cyprus',
        'Hong Kong S.A.R. of China': 'Hong Kong',
        'Palestinian Territories': 'Palestine',
        'Somaliland region': 'Somaliland',
        'Taiwan Province of China': 'Taiwan',
    }
    return country_to_continent(substitute_names.get(x, x))

def country_to_continent(country_name):
    """Return the continent name for ``country_name`` via pycountry_convert (``pc``).

    Three lookups are chained: country name -> alpha-2 country code ->
    continent code -> continent name.
    """
    alpha2 = pc.country_name_to_country_alpha2(country_name)
    return pc.convert_continent_code_to_continent_name(
        pc.country_alpha2_to_continent_code(alpha2)
    )

In [None]:
# main code

# data importing: the workbook is picked through the Colab upload dialog
uploaded = files.upload()
workbook_bytes = io.BytesIO(uploaded['WHR2018Chapter2OnlineData.xls'])
happiness_report_origin = pd.read_excel(workbook_bytes)
# all later cells work on a copy, so the frame as loaded stays available
happiness_report = happiness_report_origin.copy()

In [None]:
# data exploring
    # descriptive statistics
# dtype of every column; left as the last expression so the notebook displays it
happiness_report.dtypes

In [None]:
# per-column non-null counts and dtypes
happiness_report.info()

As we can see, most of our variables are of type "float", except for country (text) and year (int).
Moreover, we can see that the non-null count differs between variables. We'll deal with it later.


In [None]:
# first 10 rows of the frame
happiness_report.head(10)

From looking at the first 10 rows of the data we learn that each row represents a specific country in a specific year.
On top of that, already from a preliminary look at the data, we can see that some columns contain null values, which will need to be addressed later.


In [None]:
    # null checks
# number of missing values in each column
happiness_null_values = happiness_report.isna().sum()
print(happiness_null_values)

In [None]:
    # columns drop
# columns that are removed from the working frame
columns_to_drop = [
    'Standard deviation of ladder by country-year',
    'Standard deviation/Mean of ladder by country-year',
    'GINI index (World Bank estimate)',
    'GINI index (World Bank estimate), average 2000-15',
    'gini of household income reported in Gallup, by wp5-year',
]
happiness_report = happiness_report.drop(columns=columns_to_drop)

In [None]:
    # duplicate check
# count the rows that are exact copies of an earlier row
duplicate_message = 'There are {} duplicate rows present in the happiness report dataset'
num_duplicates = happiness_report.duplicated().sum()
print(duplicate_message.format(num_duplicates))

There are 0 duplicate rows present in the happiness report dataset


As we can see, there are no duplicated rows in the dataset.

In [None]:
    # detect outliers
# features that are scanned for outliers
outlier_features = [
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
    'Confidence in national government',
    'Democratic Quality',
    'Delivery Quality',
]
# keep only rows that are flagged in more than 2 of the features
Outliers_to_drop = Detect_Outliers(happiness_report, 2, outlier_features)
happiness_report.loc[Outliers_to_drop]  # Show the outliers rows

As we can see, no outliers were detected in the dataset.

In [None]:
# descriptive statistics on numerical data
# describe() with its default arguments; the summary table is the cell output
happiness_report.describe()

From the descriptive statistics on the numeric data we learn the following facts:
1. The year range of the dataset is 2005-2017.
2. The happiest observation (country-year) has a Life Ladder score of 8.018934 and the least happy one has a score of 2.661718.
3. In at least 75% of the observations the "Perceptions of corruption" score is 0.697359 or higher (25th percentile = 0.697359), i.e. most respondents believe that there is corruption in politics and business.
4. In at least 75% of the observations the "Positive affect" score is 0.621471 or higher (25th percentile = 0.621471), i.e. most respondents said that they experience happiness, joy and laughter.
5. Moreover, in most of the observations the majority of respondents did not experience a feeling of worry, sadness and anger ("Negative affect" is <= 0.311515 in 75% of the observations).
6. In fewer than 25% of the observations is the "Social support" score (the share of respondents who have someone to count on in times of trouble) below 0.748304.


In [None]:
# descriptive statistics on categorical data
# include='object' restricts the summary to the object-dtype columns
happiness_report.describe(include='object')

From the descriptive statistics on the categorical data (the country variable) we learn that 164 countries took part in this report.

### **Univariate Analysis:**

In [None]:
# country frequency: one bar per country, most frequent country first
country_order = happiness_report['country'].value_counts().index
plt.figure(figsize=(7,50))  # tall figure so every country label stays readable
ax = sns.countplot(data=happiness_report, y='country', order=country_order)
ax.set_title('Country frequency')
ax.bar_label(ax.containers[0])  # write the count next to each bar
plt.show()

In [None]:
# new column for continent: every country label is translated by condition()
happiness_report['continent'] = happiness_report['country'].map(condition)

In [None]:
  # continent frequency
# one bar per continent, most frequent continent first
continent_order = happiness_report['continent'].value_counts().index
ax = sns.countplot(data=happiness_report, y='continent', order=continent_order)
ax.set_title('Continent frequency')
ax.bar_label(ax.containers[0])  # write the count next to each bar
plt.show()

In [None]:
# year frequency: number of rows recorded for each year
ax = sns.countplot(data=happiness_report, x='year')
ax.set_title('Years frequency')
ax.bar_label(ax.containers[0])  # write the count above each bar
plt.show()

In [None]:
# Life Ladder density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Life Ladder')
ax.set_title('Life Ladder density')
plt.show()

In [None]:
# Log GDP per capita density; `ax` is read again by the next cell
ax = sns.kdeplot(data=happiness_report, x='Log GDP per capita')
ax.set_title('Log GDP per capita density')
plt.show()

As we can see from the graph, the density has one global maximum and two more local maxima. To get the coordinates of the global maximum we run the following code, which returns the point (9.6214614, 0.3198163).

In [None]:
# (x, y) points of the curve drawn by the previous cell, one row per point
data = ax.lines[0].get_xydata()
peak_density = max(data[:, 1])
# keep the row(s) whose y value equals the highest density
data[data[:, 1] == peak_density]

array([[9.6214614, 0.3198163]])

In [None]:
# Social support density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Social support')
ax.set_title('Social support density')
plt.show()

In [None]:
# Healthy life expectancy at birth density; `ax` is read again by the next cell
ax = sns.kdeplot(data=happiness_report, x='Healthy life expectancy at birth')
ax.set_title('Healthy life expectancy at birth density')
plt.show()

Here again we see several maxima, so we use the same code to find the coordinates of the global maximum.

In [None]:
# (x, y) points of the curve drawn by the previous cell, one row per point
data = ax.lines[0].get_xydata()
peak_density = max(data[:, 1])
# keep the row(s) whose y value equals the highest density
data[data[:, 1] == peak_density]

array([[6.42773597e+01, 6.08071757e-02]])

In [None]:
# Freedom to make life choices density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Freedom to make life choices')
ax.set_title('Freedom to make life choices density')
plt.show()

In [None]:
# Generosity density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Generosity')
ax.set_title('Generosity density')
plt.show()

In [None]:
# Perceptions of corruption density; `ax` is read again by the next cell
ax = sns.kdeplot(data=happiness_report, x='Perceptions of corruption')
ax.set_title('Perceptions of corruption density')
plt.show()

Here again we see two maxima, so we use the same code to find the coordinates of the global maximum.

In [None]:
# (x, y) points of the curve drawn by the previous cell, one row per point
data = ax.lines[0].get_xydata()
peak_density = max(data[:, 1])
# keep the row(s) whose y value equals the highest density
data[data[:, 1] == peak_density]

array([[0.85189995, 3.27023532]])

In [None]:
# Positive affect density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Positive affect')
ax.set_title('Positive affect density')
plt.show()

In [None]:
# Negative affect density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Negative affect')
ax.set_title('Negative affect density')
plt.show()

In [None]:
# Confidence in national government density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Confidence in national government')
ax.set_title('Confidence in national government density')
plt.show()

In [None]:
# Democratic Quality density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Democratic Quality')
ax.set_title('Democratic Quality density')
plt.show()

In [None]:
# Delivery Quality density (kernel density estimate of the column)
ax = sns.kdeplot(data=happiness_report, x='Delivery Quality')
ax.set_title('Delivery Quality density')
plt.show()

### **Bivariate Analysis:**

In [None]:
# Life Ladder vs country: one box per country, highest mean Life Ladder first
country_life = (
    happiness_report.groupby(['country'])['Life Ladder']
    .mean()
    .sort_values(ascending=False)
)
plt.figure(figsize=(7,50))  # tall figure so every country label stays readable
ax = sns.boxplot(data=happiness_report, x='Life Ladder', y='country', order=country_life.index)
ax.set_title('Life Ladder vs Country')
plt.show()

In [None]:
# Life Ladder vs continent: mean Life Ladder per continent, sorted ascending
continent_life_ladder = (
    happiness_report.groupby('continent')['Life Ladder']
    .mean()
    .sort_values()
)

# bar chart of the per-continent means, each bar labelled with its value
plt.figure(figsize=(10, 6))
ax = continent_life_ladder.plot(kind='bar', color='skyblue')
ax.bar_label(ax.containers[0])
ax.set_title('Average Life Ladder by Continent')
ax.set_ylabel('Average Life Ladder')
ax.set_xlabel('Continent')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Life Ladder vs year: mean Life Ladder of all rows of a year, drawn as a line
avg_life_ladder_per_year = (
    happiness_report.groupby('year')['Life Ladder']
    .mean()
    .reset_index()
)
year_axes = sns.lineplot(data=avg_life_ladder_per_year, x='year', y='Life Ladder')
year_axes.set_title('Average Life Ladder vs Year')
year_axes.set_ylabel('Average Life Ladder')
year_axes.set_xlabel('Year')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

In [None]:
# Life Ladder vs Healthy life expectancy at birth
# scatter of the two columns, with a rug of the same columns drawn on the same axes
ax = sns.scatterplot(data=happiness_report, x='Life Ladder', y='Healthy life expectancy at birth')
ax = sns.rugplot(data=happiness_report, x='Life Ladder', y='Healthy life expectancy at birth')
ax.set_title('Life Ladder vs Healthy life expectancy')
plt.show()

In [None]:
# Life Ladder vs Freedom to make life choices (one point per row)
ax = sns.scatterplot(data=happiness_report, x='Life Ladder', y='Freedom to make life choices')
ax.set_title('Life Ladder vs Freedom to make life choices')
plt.show()

In [None]:
# Life Ladder vs Log GDP per capita (one point per row)
ax = sns.scatterplot(data=happiness_report, x='Life Ladder', y='Log GDP per capita')
ax.set_title('Life Ladder vs Log GDP per capita')
plt.show()

In [None]:
# Life Ladder vs Social support (one point per row)
ax = sns.scatterplot(data=happiness_report, x='Life Ladder', y='Social support')
ax.set_title('Life Ladder vs Social support')
plt.show()


In [None]:
# Life Ladder vs Generosity (one point per row)
ax = sns.scatterplot(data=happiness_report, x='Life Ladder', y='Generosity')
ax.set_title('Life Ladder vs Generosity')
plt.show()


In [None]:
# Correlation of Life Ladder with every numeric metric, drawn as a one-column heatmap.
# Only the numeric columns are correlated: 'country' and 'continent' hold
# strings, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
correlation_matrix = happiness_report.select_dtypes(include='number').corr()

plt.figure(figsize=(14,10))
# vmin/vmax pin the colour scale to the full correlation range [-1, 1]
sns.heatmap(correlation_matrix[['Life Ladder']], annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation of Life Ladder with Other Metrics')
plt.show()


In [None]:
# correlation matrix of all numeric variables
# Only the numeric columns are correlated: 'country' and 'continent' hold
# strings, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corrMatrix = happiness_report.select_dtypes(include='number').corr()
plt.figure(figsize=(10,10))
sns.heatmap(corrMatrix, annot=True)  # annot=True writes each coefficient into its cell
plt.title('Correlation Metrix of all numeric variables')
plt.show()

In [None]:
# distribution of all numeric variables
# iloc[:, 2:] passes every column from position 2 onward to the pair plot;
# corner=True draws only the lower triangle of the grid.
pair_grid = sns.pairplot(happiness_report.iloc[:,2:], corner=True)
# Title the figure as a whole. plt.title() would attach the title to a single
# subplot and needs a hand-tuned y offset that depends on how many variables
# are plotted.
pair_grid.figure.suptitle('Multivariate Analysis of numeric data', y=1.02)
plt.show()

### **Multivariate Analysis:**

In [None]:
# life ladder vs log GDP and continent
# joint scatter of the two columns with marginal distributions, coloured by continent
joint_grid = sns.jointplot(data=happiness_report, x='Life Ladder', y='Log GDP per capita', hue='continent')
# jointplot draws several axes (joint plot plus marginals); title the figure
# itself instead of whichever axes happens to be current for plt.title().
joint_grid.figure.suptitle('Life Ladder, log GDP and continent', y=1.02)
plt.show()

In [None]:
# Healthy life expectancy at birth vs log GDP, one bivariate KDE per continent
ax = sns.displot(
    happiness_report,
    x='Log GDP per capita',
    y='Healthy life expectancy at birth',
    hue='continent',
    kind="kde",
)
plt.title('Healthy life expectancy at birth, log GDP and continent')
plt.show()

In [None]:
# Healthy life expectancy at birth vs log GDP, Africa against all other continents
# boolean hue: True for rows whose continent is Africa, False otherwise
ax = sns.displot(
    happiness_report,
    x='Log GDP per capita',
    y='Healthy life expectancy at birth',
    hue=happiness_report['continent']=='Africa',
    kind="kde",
)
plt.title('Africa or not- Healthy life expectancy at birth, log GDP')
plt.show()

In [None]:
# Delivery quality among the years and continents
# mean Delivery Quality for every (year, continent) pair, one line per continent
avg_delivery_continent = (
    happiness_report.groupby(['year', 'continent'])['Delivery Quality']
    .mean()
    .reset_index()
)
ax = sns.lineplot(data=avg_delivery_continent, x='year', y='Delivery Quality', hue='continent')
ax.set_title('Average delivery qulity, years and continent')
ax.set_ylabel('Delivery qulity average')
ax.set_xlabel('Years')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.legend(loc='upper right')
plt.show()

### **Null filling:**

In [None]:
# null filling in positive/negative distributions
first_group_columns = [
    'Social support',
    'Freedom to make life choices',
    'Generosity',
    'Negative affect',
    'Confidence in national government',
]
# every missing value is replaced by the median of its own column
first_group_medians = happiness_report[first_group_columns].median()
happiness_report[first_group_columns] = happiness_report[first_group_columns].fillna(first_group_medians)

In [None]:
# null filling in other distributions
second_group_columns = [
    'Log GDP per capita',
    'Healthy life expectancy at birth',
    'Perceptions of corruption',
    'Positive affect',
    'Democratic Quality',
    'Delivery Quality',
]
# every missing value is replaced by the median of its own column
second_group_medians = happiness_report[second_group_columns].median()
happiness_report[second_group_columns] = happiness_report[second_group_columns].fillna(second_group_medians)

In [None]:
# saving the final dataframe as an Excel workbook on Google Drive
output_path = '/drive/My Drive/happiness_report_final.xlsx'
drive.mount('/drive')  # the output path lives under this mount point
happiness_report.to_excel(output_path, index=False)  # index=False: the row index is not written

Mounted at /drive
