In [157]:
import pandas as pd
import numpy as np
import seaborn as sb

In [158]:
df = pd.read_csv('datasets/emissions.csv')

In [159]:
# drop all columns except for first and last column
df = df.iloc[:, [0, -1]]

In [160]:
df.describe()

Unnamed: 0,2017
count,231.0
mean,17323730000.0
std,111542300000.0
min,0.0
25%,34218870.0
50%,273731900.0
75%,2987929000.0
max,1580000000000.0


In [161]:
df.rename(columns = {'2017':'CO2'}, inplace = True)

In [162]:
df

Unnamed: 0,Country,CO2
0,Afghanistan,1.785029e+08
1,Africa,4.311757e+10
2,Albania,2.772782e+08
3,Algeria,4.107870e+09
4,Americas (other),9.864116e+10
...,...,...
226,Wallis and Futuna Islands,4.295730e+05
227,World,1.580000e+12
228,Yemen,6.194328e+08
229,Zambia,1.873686e+08


In [163]:
# merge two dataframes on country name
temp = pd.read_csv('datasets/population.csv')
temp = temp.iloc[:, [2, 6]]
temp.rename(columns = {'Country/Territory':'Country', '2020 Population' : 'Population'}, inplace = True)
temp

Unnamed: 0,Country,Population
0,Afghanistan,38972230
1,Albania,2866849
2,Algeria,43451666
3,American Samoa,46189
4,Andorra,77700
...,...,...
229,Wallis and Futuna,11655
230,Western Sahara,556048
231,Yemen,32284046
232,Zambia,18927715


In [164]:
# merge without losing data
df = pd.merge(df, temp, on = 'Country', how = 'outer')

In [165]:
df

Unnamed: 0,Country,CO2,Population
0,Afghanistan,1.785029e+08,38972230.0
1,Africa,4.311757e+10,
2,Albania,2.772782e+08,2866849.0
3,Algeria,4.107870e+09,43451666.0
4,Americas (other),9.864116e+10,
...,...,...,...
253,Tokelau,,1827.0
254,United States Virgin Islands,,100442.0
255,Vatican City,,520.0
256,Wallis and Futuna,,11655.0


In [166]:
df.isnull().sum()

Country        0
CO2           27
Population    24
dtype: int64

In [167]:
temp = pd.read_csv('datasets/gdp.csv')
temp = temp.iloc[:, [0, -1]]
temp.rename(columns = {'Country Name' : 'Country', '2020' : 'Happiness'}, inplace = True)
temp

Unnamed: 0,Country,Happiness
0,Africa Eastern and Southern,9.210000e+11
1,Africa Western and Central,7.850000e+11
2,Australia,1.330000e+12
3,Austria,4.330000e+11
4,Burundi,2.841786e+09
...,...,...
115,St. Vincent and the Grenadines,8.074741e+08
116,World,8.470000e+13
117,South Africa,3.350000e+11
118,Zambia,1.811063e+10


In [168]:
df = pd.merge(df, temp, on = 'Country', how = 'outer')

In [169]:
df

Unnamed: 0,Country,CO2,Population,Happiness
0,Afghanistan,1.785029e+08,38972230.0,
1,Africa,4.311757e+10,,
2,Albania,2.772782e+08,2866849.0,
3,Algeria,4.107870e+09,43451666.0,1.450000e+11
4,Americas (other),9.864116e+10,,
...,...,...,...,...
290,Latin America & the Caribbean (IDA & IBRD coun...,,,4.480000e+12
291,South Asia (IDA & IBRD),,,3.390000e+12
292,Sub-Saharan Africa (IDA & IBRD countries),,,1.710000e+12
293,Upper middle income,,,2.320000e+13


In [170]:
df.describe()

Unnamed: 0,CO2,Population,Happiness
count,231.0,234.0,120.0
mean,17323730000.0,33501070.0,5146832000000.0
std,111542300000.0,135589900.0,12929300000000.0
min,0.0,520.0,807474100.0
25%,34218870.0,415284.5,20524490000.0
50%,273731900.0,5493074.0,258000000000.0
75%,2987929000.0,21447980.0,1702500000000.0
max,1580000000000.0,1424930000.0,84700000000000.0


In [171]:
df.isnull().sum()

Country         0
CO2            64
Population     61
Happiness     175
dtype: int64

In [172]:
# filling null vals with mean

df['Happiness'].fillna(df['Happiness'].mean(), inplace = True)
df['CO2'].fillna(df['CO2'].mean(), inplace = True)
df['Population'].fillna(df['Population'].mean(), inplace = True)

df.isnull().sum()

Country       0
CO2           0
Population    0
Happiness     0
dtype: int64

In [173]:
temp = pd.read_csv('datasets/cars.csv')
temp.shape

(234, 14)

In [174]:
temp.columns

Index(['place', 'pop2023', 'growthRate', 'area', 'country', 'cca3', 'cca2',
       'ccn3', 'region', 'subregion', 'landAreaKm', 'density', 'densityMi',
       'Rank'],
      dtype='object')

In [175]:
temp

Unnamed: 0,place,pop2023,growthRate,area,country,cca3,cca2,ccn3,region,subregion,landAreaKm,density,densityMi,Rank
0,356,1.428628e+09,0.00808,3287590.00,India,IND,IN,356,Asia,"Southern Asia, South Central Asia",2973190.00,480.5033,1244.5036,1
1,156,1.425671e+09,-0.00015,9706961.00,China,CHN,CN,156,Asia,Eastern Asia,9424702.90,151.2696,391.7884,2
2,840,3.399966e+08,0.00505,9372610.00,United States,USA,US,840,North America,Northern America,9147420.00,37.1686,96.2666,3
3,360,2.775341e+08,0.00738,1904569.00,Indonesia,IDN,ID,360,Asia,South-Eastern Asia,1877519.00,147.8196,382.8528,4
4,586,2.404857e+08,0.01976,881912.00,Pakistan,PAK,PK,586,Asia,"Southern Asia, South Central Asia",770880.00,311.9625,807.9829,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,500,4.386000e+03,-0.00091,102.00,Montserrat,MSR,MS,500,North America,Caribbean,102.00,43.0000,111.3700,230
230,238,3.791000e+03,0.00291,12173.00,Falkland Islands,FLK,FK,238,South America,"South America, Latin America",12173.00,0.3114,0.8066,231
231,570,1.935000e+03,0.00052,261.00,Niue,NIU,NU,570,Oceania,Polynesia,261.00,7.4138,19.2017,232
232,772,1.893000e+03,0.01176,12.00,Tokelau,TKL,TK,772,Oceania,Polynesia,10.00,189.3000,490.2870,233


In [176]:
temp.describe()

Unnamed: 0,place,pop2023,growthRate,area,ccn3,landAreaKm,density,densityMi,Rank
count,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0
mean,439.08547,34374420.0,0.009734,581450.0,439.08547,557112.3,451.288182,1168.836388,117.5
std,253.295484,137386400.0,0.012348,1761841.0,253.295484,1689972.0,1979.362419,5126.548664,67.694165
min,4.0,518.0,-0.07448,0.44,4.0,0.44,0.138,0.3574,1.0
25%,223.0,422598.2,0.002302,2650.0,223.0,2625.875,39.74765,102.94645,59.25
50%,439.0,5643895.0,0.00817,81199.5,439.0,75689.25,97.481,252.4758,117.5
75%,659.75,23245370.0,0.01687,430425.8,659.75,404787.6,242.92865,629.18535,175.75
max,894.0,1428628000.0,0.0498,17098240.0,894.0,16376870.0,21402.7052,55433.0064,234.0


In [177]:
temp = temp.iloc[:, [1, 3, 4, 10, 11]]
temp.columns

Index(['pop2023', 'area', 'country', 'landAreaKm', 'density'], dtype='object')

In [178]:
#drop all rows with null values
df.dropna(inplace = True)

In [179]:
df

Unnamed: 0,Country,CO2,Population,Happiness
0,Afghanistan,1.785029e+08,3.897223e+07,5.146832e+12
1,Africa,4.311757e+10,3.350107e+07,5.146832e+12
2,Albania,2.772782e+08,2.866849e+06,5.146832e+12
3,Algeria,4.107870e+09,4.345167e+07,1.450000e+11
4,Americas (other),9.864116e+10,3.350107e+07,5.146832e+12
...,...,...,...,...
290,Latin America & the Caribbean (IDA & IBRD coun...,1.732373e+10,3.350107e+07,4.480000e+12
291,South Asia (IDA & IBRD),1.732373e+10,3.350107e+07,3.390000e+12
292,Sub-Saharan Africa (IDA & IBRD countries),1.732373e+10,3.350107e+07,1.710000e+12
293,Upper middle income,1.732373e+10,3.350107e+07,2.320000e+13


In [180]:
df.dropna(inplace=True)

In [181]:
df

Unnamed: 0,Country,CO2,Population,Happiness
0,Afghanistan,1.785029e+08,3.897223e+07,5.146832e+12
1,Africa,4.311757e+10,3.350107e+07,5.146832e+12
2,Albania,2.772782e+08,2.866849e+06,5.146832e+12
3,Algeria,4.107870e+09,4.345167e+07,1.450000e+11
4,Americas (other),9.864116e+10,3.350107e+07,5.146832e+12
...,...,...,...,...
290,Latin America & the Caribbean (IDA & IBRD coun...,1.732373e+10,3.350107e+07,4.480000e+12
291,South Asia (IDA & IBRD),1.732373e+10,3.350107e+07,3.390000e+12
292,Sub-Saharan Africa (IDA & IBRD countries),1.732373e+10,3.350107e+07,1.710000e+12
293,Upper middle income,1.732373e+10,3.350107e+07,2.320000e+13


In [182]:
life = pd.read_csv('datasets/life-expectancy.csv')

In [183]:
# drop all rows where 'Year' is not 2019
life = life[life['Year'] == 2019]

In [184]:
life.rename(columns={'Entity' : 'Country'},inplace=True)
life

Unnamed: 0,Country,Code,Year,Life expectancy at birth (historical)
69,Afghanistan,AFG,2019,63.6
142,Africa,,2019,62.7
214,Albania,ALB,2019,79.3
289,Algeria,DZA,2019,76.5
361,American Samoa,ASM,2019,72.3
...,...,...,...,...
20148,Western Sahara,ESH,2019,70.3
20226,World,OWID_WRL,2019,72.8
20298,Yemen,YEM,2019,65.1
20370,Zambia,ZMB,2019,62.8


In [185]:
temp = temp.iloc[:, [0,-1]]
temp

Unnamed: 0,pop2023,density
0,1.428628e+09,480.5033
1,1.425671e+09,151.2696
2,3.399966e+08,37.1686
3,2.775341e+08,147.8196
4,2.404857e+08,311.9625
...,...,...
229,4.386000e+03,43.0000
230,3.791000e+03,0.3114
231,1.935000e+03,7.4138
232,1.893000e+03,189.3000


In [186]:
df = pd.merge(df, life, on = 'Country', how = 'outer')
df

Unnamed: 0,Country,CO2,Population,Happiness,Code,Year,Life expectancy at birth (historical)
0,Afghanistan,1.785029e+08,3.897223e+07,5.146832e+12,AFG,2019.0,63.6
1,Africa,4.311757e+10,3.350107e+07,5.146832e+12,,2019.0,62.7
2,Albania,2.772782e+08,2.866849e+06,5.146832e+12,ALB,2019.0,79.3
3,Algeria,4.107870e+09,4.345167e+07,1.450000e+11,DZA,2019.0,76.5
4,Americas (other),9.864116e+10,3.350107e+07,5.146832e+12,,,
...,...,...,...,...,...,...,...
312,Saint Barthlemy,,,,,2019.0,80.1
313,Saint Martin (French part),,,,MAF,2019.0,80.0
314,Small Island Developing States (SIDS),,,,,2019.0,72.4
315,Upper-middle-income countries,,,,,2019.0,76.5


In [187]:
df.dropna(inplace = True)

In [188]:
df

Unnamed: 0,Country,CO2,Population,Happiness,Code,Year,Life expectancy at birth (historical)
0,Afghanistan,1.785029e+08,38972230.0,5.146832e+12,AFG,2019.0,63.6
2,Albania,2.772782e+08,2866849.0,5.146832e+12,ALB,2019.0,79.3
3,Algeria,4.107870e+09,43451666.0,1.450000e+11,DZA,2019.0,76.5
5,Andorra,1.371738e+07,77700.0,5.146832e+12,AND,2019.0,83.0
6,Angola,6.237623e+08,33428485.0,5.146832e+12,AGO,2019.0,62.4
...,...,...,...,...,...,...,...
250,San Marino,1.732373e+10,34007.0,5.146832e+12,SMR,2019.0,83.0
253,Tokelau,1.732373e+10,1827.0,5.146832e+12,TKL,2019.0,75.1
254,United States Virgin Islands,1.732373e+10,100442.0,5.146832e+12,VIR,2019.0,75.1
256,Wallis and Futuna,1.732373e+10,11655.0,5.146832e+12,WLF,2019.0,79.6


In [189]:
suicide = pd.read_csv('datasets/share-deaths-suicide.csv')
suicide

Unnamed: 0,Entity,Code,Year,Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)
0,Afghanistan,AFG,1990,0.38
1,Afghanistan,AFG,1991,0.39
2,Afghanistan,AFG,1992,0.41
3,Afghanistan,AFG,1993,0.42
4,Afghanistan,AFG,1994,0.41
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,1.68
6836,Zimbabwe,ZWE,2016,1.75
6837,Zimbabwe,ZWE,2017,1.80
6838,Zimbabwe,ZWE,2018,1.87


In [190]:
# drop all rows where 'Year' is not 2019
suicide = suicide[suicide['Year'] == 2019]
suicide.rename(columns={'Entity' : 'Country'},inplace=True)
suicide

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suicide.rename(columns={'Entity' : 'Country'},inplace=True)


Unnamed: 0,Country,Code,Year,Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)
29,Afghanistan,AFG,2019,0.64
59,African Region (WHO),,2019,0.90
89,Albania,ALB,2019,0.67
119,Algeria,DZA,2019,0.75
149,American Samoa,ASM,2019,1.03
...,...,...,...,...
6719,World Bank Lower Middle Income,,2019,1.44
6749,World Bank Upper Middle Income,,2019,1.24
6779,Yemen,YEM,2019,0.89
6809,Zambia,ZMB,2019,1.13


In [191]:
suicide = suicide.iloc[:, [0,-1]]
suicide.rename(columns={'Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)' : 'SPM'}, inplace=True)
suicide

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suicide.rename(columns={'Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)' : 'SPM'}, inplace=True)


Unnamed: 0,Country,SPM
29,Afghanistan,0.64
59,African Region (WHO),0.90
89,Albania,0.67
119,Algeria,0.75
149,American Samoa,1.03
...,...,...
6719,World Bank Lower Middle Income,1.44
6749,World Bank Upper Middle Income,1.24
6779,Yemen,0.89
6809,Zambia,1.13


In [192]:
suicide.describe()

Unnamed: 0,SPM
count,228.0
mean,1.302675
std,0.741729
min,0.38
25%,0.89
50%,1.14
75%,1.5325
max,7.03


In [193]:
df.rename(columns={'Life expectancy at birth (historical)' : 'Life Expectancy'}, inplace=True)
df.columns

Index(['Country', 'CO2', 'Population', 'Happiness', 'Code', 'Year',
       'Life Expectancy'],
      dtype='object')

In [194]:
df = pd.merge(df, suicide, on = 'Country', how = 'outer')
df

Unnamed: 0,Country,CO2,Population,Happiness,Code,Year,Life Expectancy,SPM
0,Afghanistan,1.785029e+08,38972230.0,5.146832e+12,AFG,2019.0,63.6,0.64
1,Albania,2.772782e+08,2866849.0,5.146832e+12,ALB,2019.0,79.3,0.67
2,Algeria,4.107870e+09,43451666.0,1.450000e+11,DZA,2019.0,76.5,0.75
3,Andorra,1.371738e+07,77700.0,5.146832e+12,AND,2019.0,83.0,1.31
4,Angola,6.237623e+08,33428485.0,5.146832e+12,AGO,2019.0,62.4,1.04
...,...,...,...,...,...,...,...,...
252,Western Pacific Region (WHO),,,,,,,1.28
253,World Bank High Income,,,,,,,1.53
254,World Bank Low Income,,,,,,,0.95
255,World Bank Lower Middle Income,,,,,,,1.44


In [195]:
df.dropna(inplace=True)

In [196]:
df

Unnamed: 0,Country,CO2,Population,Happiness,Code,Year,Life Expectancy,SPM
0,Afghanistan,1.785029e+08,38972230.0,5.146832e+12,AFG,2019.0,63.6,0.64
1,Albania,2.772782e+08,2866849.0,5.146832e+12,ALB,2019.0,79.3,0.67
2,Algeria,4.107870e+09,43451666.0,1.450000e+11,DZA,2019.0,76.5,0.75
3,Andorra,1.371738e+07,77700.0,5.146832e+12,AND,2019.0,83.0,1.31
4,Angola,6.237623e+08,33428485.0,5.146832e+12,AGO,2019.0,62.4,1.04
...,...,...,...,...,...,...,...,...
226,Northern Mariana Islands,1.732373e+10,49587.0,5.146832e+12,MNP,2019.0,77.1,2.24
227,Puerto Rico,1.732373e+10,3271564.0,1.030000e+11,PRI,2019.0,79.1,0.76
228,San Marino,1.732373e+10,34007.0,5.146832e+12,SMR,2019.0,83.0,1.40
229,Tokelau,1.732373e+10,1827.0,5.146832e+12,TKL,2019.0,75.1,1.18


In [197]:
ap = pd.read_csv('datasets/share-deaths-air-pollution.csv')
ap

Unnamed: 0,Entity,Code,Year,Deaths - Cause: All causes - Risk: Air pollution - Sex: Both - Age: Age-standardized (Percent)
0,Afghanistan,AFG,1990,21.33
1,Afghanistan,AFG,1991,21.01
2,Afghanistan,AFG,1992,21.19
3,Afghanistan,AFG,1993,21.37
4,Afghanistan,AFG,1994,21.09
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,10.98
6836,Zimbabwe,ZWE,2016,11.02
6837,Zimbabwe,ZWE,2017,11.00
6838,Zimbabwe,ZWE,2018,11.06


In [198]:
ap.rename(columns={'Deaths - Cause: All causes - Risk: Air pollution - Sex: Both - Age: Age-standardized (Percent)' : 'Percentage of all deaths caused by air pollution'}, inplace=True)
ap

Unnamed: 0,Entity,Code,Year,Percentage of all deaths caused by air pollution
0,Afghanistan,AFG,1990,21.33
1,Afghanistan,AFG,1991,21.01
2,Afghanistan,AFG,1992,21.19
3,Afghanistan,AFG,1993,21.37
4,Afghanistan,AFG,1994,21.09
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,10.98
6836,Zimbabwe,ZWE,2016,11.02
6837,Zimbabwe,ZWE,2017,11.00
6838,Zimbabwe,ZWE,2018,11.06


In [208]:
ap.rename(columns={'Entity' : 'Country'}, inplace=True)
ap
df = pd.merge(df, ap, on = 'Country', how = 'outer')
df

Unnamed: 0,Country,CO2,Population,Happiness,Code,Year,Life Expectancy,SPM,Percentage of all deaths caused by air pollution
0,Afghanistan,1.785029e+08,38972230.0,5.146832e+12,AFG,2019.0,63.6,0.64,16.61
1,Albania,2.772782e+08,2866849.0,5.146832e+12,ALB,2019.0,79.3,0.67,9.54
2,Algeria,4.107870e+09,43451666.0,1.450000e+11,DZA,2019.0,76.5,0.75,10.73
3,Andorra,1.371738e+07,77700.0,5.146832e+12,AND,2019.0,83.0,1.31,2.17
4,Angola,6.237623e+08,33428485.0,5.146832e+12,AGO,2019.0,62.4,1.04,9.52
...,...,...,...,...,...,...,...,...,...
223,Western Pacific Region (WHO),,,,,,,,14.09
224,World Bank High Income,,,,,,,,3.35
225,World Bank Low Income,,,,,,,,15.97
226,World Bank Lower Middle Income,,,,,,,,15.39
