In [37]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [41]:
# load all world bank data
# columns: 
# Country Name -> Location, 
# Country Code -> SpatialDimValueCode , 
# Year, (ww_value), ExtraFeatures
df_arg = pd.read_csv("Agriculture/agr_land.csv")
df_cpi = pd.read_csv("CPI/ConsumerPriceIndex(2010=100).csv")
df_edu = pd.read_csv("edu/school_enrollment.csv")
df_employ = pd.read_csv("Employment/EmploymentRate.csv")
df_exchange = pd.read_csv("exchange_rate/exchange_to_usd.csv")
df_GDP = pd.read_csv("GDP/GDP_per_capita.csv")
df_green = pd.read_csv("greenhouse/gas_emission.csv")
df_income = pd.read_csv("income/income_per_capita.csv")
df_pop = pd.read_csv("population/pop.csv")
df_tax = pd.read_csv("tax/tax.csv")

In [42]:
# unique countries
countries = df_arg['Country Name'].unique()
num_countries = len(countries)
print(num_countries)
# unique years
print(df_arg.columns) # 1960-2020 => 61
yrs = 61

266
Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020'],
      dtype='object')


In [43]:
def one_wordbank_feature(df,country):
    return df.loc[df['Country Name'] == country, '1960':'2020'].values

In [44]:
# directly create dictionary
d = dict()
d["Location"] = np.repeat(list(df_arg['Country Name']),yrs)
d["SpatialDimValueCode"] = np.repeat(list(df_arg['Country Code']),yrs)
d["Year"] = np.tile(list(df_arg.columns)[4:],num_countries)
d["arg"] = np.concatenate([one_wordbank_feature(df_arg,c) for c in countries]).ravel()
d["cpi"] = np.concatenate([one_wordbank_feature(df_cpi,c) for c in countries]).ravel()
d["edu"] = np.concatenate([one_wordbank_feature(df_edu,c) for c in countries]).ravel()
d["employ"] = np.concatenate([one_wordbank_feature(df_employ,c) for c in countries]).ravel()
d["exchange"] = np.concatenate([one_wordbank_feature(df_exchange,c) for c in countries]).ravel()
d["gdp"] = np.concatenate([one_wordbank_feature(df_GDP,c) for c in countries]).ravel()
d["green"] = np.concatenate([one_wordbank_feature(df_green,c) for c in countries]).ravel()
d["income"] = np.concatenate([one_wordbank_feature(df_income,c) for c in countries]).ravel()
d["pop"] = np.concatenate([one_wordbank_feature(df_pop,c) for c in countries]).ravel()
d["tax"] =  np.concatenate([one_wordbank_feature(df_tax,c) for c in countries]).ravel()

In [2]:
#print(d)
#assert(len(d["Location"]) == len(d["Year"]))
#assert(len(d["arg"]) == len(d["Year"]))

In [45]:
df_wb = pd.DataFrame(d)
#df_wb.to_csv("wb.csv",index=False)

In [46]:
def happy_feature(countries, years, features):
    res = []
    for country in tqdm(countries):
        for year in years:
            try:
                entry = df_happy.loc[((df_happy['Country name'] == country) 
                                      & (df_happy['year'] == int(year))),
                                     features].values[0]
                res.append(entry)
            except:
                entry = np.nan
                res.append(entry)
    return res
    
# merge happiness table
df_happy = pd.read_csv('Happiness/WHR2021.csv')
count_hap = df_happy['Country name'].unique()
count_hap_num = len(count_hap)
dh = dict()
dh['Location'] = np.repeat(count_hap,yrs)
dh['Year'] = np.tile(list(df_arg.columns)[4:],count_hap_num)
dh['LifeLadder'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Life Ladder')
dh['LogGDPPerCapita'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Log GDP per capita')
dh['SocialSupport'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Social support')
dh['Expectancy'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Healthy life expectancy at birth')
dh['Freedom'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Freedom to make life choices')
dh['Generosity'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Generosity')
dh['Corruption'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Perceptions of corruption')
dh['PositiveAffect'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Positive affect')
dh['NegativeAffect'] = happy_feature(count_hap,list(df_arg.columns)[4:],'Negative affect')

100%|██████████| 166/166 [00:15<00:00, 10.46it/s]
100%|██████████| 166/166 [00:10<00:00, 16.11it/s]
100%|██████████| 166/166 [00:14<00:00, 11.13it/s]
100%|██████████| 166/166 [00:10<00:00, 15.33it/s]
100%|██████████| 166/166 [00:13<00:00, 12.07it/s]
100%|██████████| 166/166 [00:11<00:00, 13.99it/s]
100%|██████████| 166/166 [00:19<00:00,  8.32it/s]
100%|██████████| 166/166 [00:18<00:00,  9.03it/s]
100%|██████████| 166/166 [00:12<00:00, 13.41it/s]


In [1]:
#print(dh)
#assert(len(dh["LifeLadder"]) == len(dh["Year"]))

In [52]:
dff_happy = pd.DataFrame(dh)
df_wbb = df_wb.merge(dff_happy, how='left')
df_wbb.to_csv("extra.csv",index=False)

In [106]:
df_wbb.head()

Unnamed: 0,Location,SpatialDimValueCode,Year,arg,cpi,edu,employ,exchange,gdp,green,...,tax,LifeLadder,LogGDPPerCapita,SocialSupport,Expectancy,Freedom,Generosity,Corruption,PositiveAffect,NegativeAffect
0,Aruba,ABW,1960,,,,,,,,...,,,,,,,,,,
1,Aruba,ABW,1961,11.111111,,,,,,,...,,,,,,,,,,
2,Aruba,ABW,1962,11.111111,,,,,,,...,,,,,,,,,,
3,Aruba,ABW,1963,11.111111,,,,,,,...,,,,,,,,,,
4,Aruba,ABW,1964,11.111111,,,,,,,...,,,,,,,,,,


In [51]:
df_ww = pd.read_csv("ProvidedDatasets/tobacco_use_ww.csv")
# group by Location  
df_wbb['Year'] = df_wbb['Year'].astype(str)
df_ww['Year'] = df_ww['Year'].astype(str)
df_www = df_ww.merge(df_wbb,how='left')
df_www.to_csv("extra_ww.csv",index=False)

## EDA

In [32]:
df_www = pd.read_csv("extra_ww.csv")
df_www.head()

Unnamed: 0,ParentLocationCode,ParentLocation,SpatialDimValueCode,Location,Year,Gender,Value,arg,cpi,edu,...,tax,LifeLadder,LogGDPPerCapita,SocialSupport,Expectancy,Freedom,Generosity,Corruption,PositiveAffect,NegativeAffect
0,SEAR,South-East Asia,PRK,Democratic People's Republic of Korea,2018,Female,0.0,,,,...,,,,,,,,,,
1,EUR,Europe,AZE,Azerbaijan,2018,Female,0.2,57.825393,152.9,94.48008,...,12.971272,5.168,9.562,0.781,65.5,0.772,-0.232,0.561,0.593,0.191
2,AFR,Africa,ERI,Eritrea,2018,Female,0.3,75.168317,,47.697109,...,,,,,,,,,,
3,EMR,Eastern Mediterranean,EGY,Egypt,2018,Female,0.4,,,,...,,,,,,,,,,
4,AFR,Africa,GHA,Ghana,2018,Female,0.4,64.967654,231.807235,71.319687,...,12.616039,5.004,8.555,0.761,57.2,0.817,0.062,0.846,0.747,0.25


In [33]:
import plotly

In [34]:
import plotly.graph_objects as go
import plotly.express as px


'''fig1 = px.line(df_www, x="Year", y="green", 
               color='Location', title='annual greenhouse gas', markers=True)
fig1.show()

fig2 = px.line(df_www, x="Year", y="cpi", 
               color='Location', title='annual Consumer Price Index', markers=True)
fig2.show()

fig3 = px.line(df_www, x="Year", y="employ", 
               color='Location', title='annual school enrollment rate', markers=True)
fig3.show()

fig3 = px.line(df_www, x="Year", y="gdp", 
               color='Location', title='annual school gdp per capita', markers=True)
fig3.show()'''

fig3 = px.line(df_www, x="Year", y="LifeLadder", 
               color='Location', title='happiness', markers=True)
fig3.show()


In [35]:
row,col = df_www.shape
print(df_www.isna().sum()/row)
df_www.describe()


ParentLocationCode     0.000000
ParentLocation         0.000000
SpatialDimValueCode    0.000000
Location               0.000000
Year                   0.000000
Gender                 0.000000
Value                  0.000000
arg                    0.122297
cpi                    0.183445
edu                    0.358688
employ                 0.422073
exchange               0.255034
gdp                    0.127517
green                  0.120805
income                 0.170768
pop                    0.125280
tax                    0.378822
LifeLadder             0.457867
LogGDPPerCapita        0.457867
SocialSupport          0.463087
Expectancy             0.457867
Freedom                0.468307
Generosity             0.479493
Corruption             0.486204
PositiveAffect         0.466070
NegativeAffect         0.463833
dtype: float64


Unnamed: 0,Year,Value,arg,cpi,edu,employ,exchange,gdp,green,income,...,tax,LifeLadder,LogGDPPerCapita,SocialSupport,Expectancy,Freedom,Generosity,Corruption,PositiveAffect,NegativeAffect
count,4023.0,4023.0,3531.0,3285.0,2580.0,2325.0,2997.0,3510.0,3537.0,3336.0,...,2499.0,2181.0,2181.0,2160.0,2181.0,2139.0,2094.0,2067.0,2148.0,2157.0
mean,2012.0,24.807159,41.582934,112.075145,86.508486,56.878077,478.605236,14399.1291,242658.7,11388.558477,...,18.168774,5.555164,9.513971,0.819067,64.475873,0.769041,0.005633,0.730441,0.717851,0.270965
std,5.657557,15.745702,21.359801,40.345453,30.00847,10.748094,1443.963331,19629.350747,994579.4,15065.16642,...,8.147519,1.166266,1.180462,0.117282,7.265953,0.133016,0.170545,0.203202,0.104378,0.082741
min,2000.0,0.0,0.930889,14.991313,6.19735,18.32,0.044468,99.74492,20.0,84.451837,...,0.043495,2.702,6.635,0.434,32.3,0.304,-0.335,0.047,0.421,0.103
25%,2010.0,12.6,25.273088,100.0,69.107813,50.66,2.701899,1512.126989,10290.0,1267.380191,...,13.032967,4.607,8.555,0.75075,59.3,0.687,-0.117,0.664,0.63875,0.209
50%,2014.0,23.9,42.857299,108.846334,95.352978,56.56,15.375,5506.820723,43670.0,4429.395435,...,17.443762,5.526,9.699,0.852,66.1,0.789,-0.024,0.8,0.734,0.261
75%,2016.0,34.6,58.063901,121.558878,104.496511,62.9,222.656086,18098.90854,126380.0,14185.895722,...,22.819233,6.467,10.529,0.913,70.5,0.882,0.096,0.867,0.80425,0.321
max,2018.0,91.0,85.464518,481.390314,163.934723,93.97,14236.93877,118823.6484,12355240.0,80639.38778,...,110.182883,8.019,11.645,0.987,76.8,0.985,0.698,0.983,0.944,0.591


In [36]:
df_www.groupby('ParentLocation')
fig = px.line(df_www.get_group('Asia'), x="Year", y="LifeLadder", 
               color='Location', title='happiness', markers=True)
fig.show()

AttributeError: 'DataFrame' object has no attribute 'get_group'