In [None]:
#Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Columns kept from every yearly CSV.
proj_columns = ['Country', 'Ladder Score', 'Freedom', 'GDP', 'Life Expectancy', 'Generosity', 'Year']

# 2021 is read on its own because it is the only file that carries the
# 'Regional indicator' column; the other years get it from the lookup below.
df_2021 = pd.read_csv("Resources/years/2021.csv")[proj_columns + ['Regional indicator']]

# Two-column projection used to build the country -> region lookup.
country_df = df_2021[['Country', 'Regional indicator']]

# Dictionary matching each 2021 country name to its regional indicator.
country_dict = country_df.set_index('Country')['Regional indicator'].to_dict()

# Collect one frame per year and concatenate once at the end. 2021 goes
# first so the row order matches the previous incremental concat.
yearly_frames = [df_2021]
for yr in range(2017, 2021):
    df_yr = pd.read_csv(f"Resources/years/{yr}.csv")[proj_columns]
    # Countries that are not present in the 2021 lookup get an empty string,
    # which the data-check cell later searches for.
    df_yr['Regional indicator'] = df_yr['Country'].map(lambda name: country_dict.get(name, ''))
    yearly_frames.append(df_yr)

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without
# it every year keeps its own 0..n-1 labels, and label-based writes such as
# .at[i, col] would touch one row per year instead of a single row.
happiness_combined_df = pd.concat(yearly_frames, ignore_index=True)

#happiness_combined_df.to_csv('Resources/combined_happiness.csv')
happiness_combined_df.head(5)

In [None]:
# #DATA CLEANING (exploratory code, kept disabled)
# #identify the rows (countries) with no regional indicator
# #happiness_combined_df[happiness_combined_df['Regional indicator'] == ''][['Country','Year']]
# #pd.pivot_table(happiness_combined_df, values='Country', columns='Year', aggfunc='count')
# before_clean_df = happiness_combined_df
# before_clean_df.pivot(index='Country', columns='Year', values='Country').to_csv('Resources/pivoted-for-missing-val.csv')


In [None]:
# Looking at the pivoted data set revealed that some countries were named
# differently in different years. To solve that, those countries are renamed
# to one consistent value across years.
# Also, some countries did not have any data for some years. For instance,
# Belize, Angola, Congo and Qatar are missing for the most recent years. From
# our Google research, this may be due to the political situation and/or high
# crime rate in Belize, Angola, Congo and the Central African Republic.

# Alternate spelling -> canonical country name.
rename_cntry_dict = {
    'Hong Kong S.A.R. of China': 'Hong Kong',
    'Hong Kong S.A.R., China': 'Hong Kong',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    'North Macedonia': 'Macedonia',
    'Northern Cyprus': 'North Cyprus',
    'Taiwan Province of China': 'Taiwan',
}
# Canonical country name -> regional indicator to force for that country.
rename_ri_dict = {
    'Hong Kong': 'East Asia',
    'Trinidad and Tobago': 'Latin America and Caribbean',
    'Macedonia': 'Central and Eastern Europe',
    'North Cyprus': 'Western Europe',
    'Taiwan': 'East Asia',
    'Central African Republic': 'Sub-Saharan Africa',
    'South Sudan': 'Sub-Saharan Africa',
    'Congo (Kinshasa)': 'Sub-Saharan Africa'
}

# The renaming is done column-wise instead of with iterrows() + .at[i, ...].
# The combined frame is built by concatenating yearly frames, so its index
# labels can repeat; .at on a repeated label writes to every row carrying
# that label, which would overwrite countries across years.

# Canonical name for every row; names not in the dictionary are kept as-is.
standardized_country = happiness_combined_df['Country'].map(
    lambda name: rename_cntry_dict.get(name, name)
)
# Region override per row, NaN where the country has no entry in rename_ri_dict.
region_override = standardized_country.map(rename_ri_dict)

# Plain arrays are assigned so the write is positional and does not depend
# on index alignment.
happiness_combined_df['Country'] = standardized_country.to_numpy()
happiness_combined_df['Regional indicator'] = np.where(
    region_override.notna(),
    region_override,
    happiness_combined_df['Regional indicator'],
)

happiness_combined_df


In [None]:
# Persist the cleaned, combined frame (index included) for later use.
cleaned_csv_path = 'Resources/combined_happiness_after_clean.csv'
happiness_combined_df.to_csv(cleaned_csv_path)


In [None]:
# Data check: list the distinct countries whose regional indicator is still
# the empty-string placeholder.
missing_region_mask = happiness_combined_df['Regional indicator'] == ''
happiness_combined_df.loc[missing_region_mask, 'Country'].unique()

In [None]:
# Rank countries by ladder score within each year. Ranking is descending, so
# the highest score gets the smallest rank; tied scores share the largest
# rank of their tie group (method='max'). Indexing and sorting by
# (Year, rank) puts each year's happiest countries first and least happy last.
# NOTE: set_index moves 'Year' and 'rank' out of the columns, so running this
# cell a second time without rebuilding the frame fails at set_index.
ladder_score_by_year = happiness_combined_df.groupby(['Year'])['Ladder Score']
happiness_combined_df['rank'] = ladder_score_by_year.rank(method='max', ascending=False)

year_rank_index = ['Year', 'rank']
happiness_combined_df = happiness_combined_df.set_index(year_rank_index).sort_index()

happiness_combined_df.to_csv('Resources/combined_happiness_indexed.csv')
happiness_combined_df


In [None]:
# Mean life expectancy across all reported countries, one value per year,
# to see how it changes from year to year.
life_expectancy_by_year = happiness_combined_df.groupby('Year')['Life Expectancy']
average_life_expect = life_expectancy_by_year.mean()
average_life_expect

In [None]:
# Bar chart of the yearly average life expectancy computed above.
ax = average_life_expect.plot.bar()
ax.set_xlabel("Year")
# Rotate the year tick labels so they read vertically.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_ylabel("Average Life Expectancy of World Happiness Report")
plt.show()

In [None]:
# Mean life expectancy for every (Year, Regional indicator, Country) group,
# kept as a one-column DataFrame for the plots below.
life_expectancy_keys = ['Year', 'Regional indicator', 'Country']
multi_group = happiness_combined_df.groupby(life_expectancy_keys)[['Life Expectancy']].mean()
multi_group
#clean_group_df = multi_group.dropna(how='all', axis=1)

#clean_multigrp= multi_group.dropna(axis=0, subset=['Country'])


In [None]:
# Scatter of life expectancy per year, coloured by regional indicator.
sns.set_style('dark')
plot = sns.relplot(
    data=multi_group,
    x='Year',
    y='Life Expectancy',
    hue='Regional indicator',
    alpha=0.7,
    edgecolors="orange",
    linewidth=0.1,
)
# FacetGrid.set returns the grid itself, so `plot` keeps pointing at it.
plot = plot.set(xscale="linear")
plot.set_axis_labels("Years", "Life Expectancy", color="brown")
plot.fig.set_size_inches(15, 5)


In [None]:
# Line plot of life expectancy over the years, one line per regional
# indicator, with the legend placed outside the axes on the right.
sns.set(style='whitegrid')

g = sns.lineplot(
    data=multi_group,
    x="Year",
    y="Life Expectancy",
    hue="Regional indicator",
    alpha=0.9,
    linewidth=0.1,
)
g.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)

In [None]:
# Display the combined frame. The groupby expression that used to precede
# this line built a grouped object and discarded it without aggregating, so
# it had no effect on the output and has been removed.
happiness_combined_df

In [None]:
# Show all rows ordered from highest to lowest life expectancy.
happiness_combined_df.sort_values("Life Expectancy", ascending=False)
#df1=happiness_combined_df[:5]
#colormap= df1.plot('Country',['Ladder Score','Freedom','GDP','Life Expectancy'],kind = 'area',
   #     colormap='gist_rainbow')
#colormap.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)

In [None]:
# Average life expectancy per country across all years. The column is
# selected after grouping: passing it as a second positional argument to
# groupby() does not select it (that slot is a different parameter) and
# makes the call fail. The double brackets keep the result a DataFrame, so
# it can be sorted with sort_values(by=['Life Expectancy']).
average_life_exp = happiness_combined_df.groupby('Country')[['Life Expectancy']].mean()
average_life_exp
#average_life_exp.sort_values(by=['Life Expectancy'], ascending=True)
#average_life_exp.sort_values(by =["Life Expectancy"],ascending=[False])
