In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install impyute

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.express as px
from impyute.imputation.cs import mice
%matplotlib inline

In [None]:
# Load the raw emissions table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/co2-ghg-emissionsdata/co2_emission.csv')
df.head(5)

## Data Understanding and transformation
### Exploring the data, checking the data types, looking for missing values and renaming/omitting variables

In [None]:
# Show the dtype of every column, followed by the per-column count of missing values.
column_types = df.dtypes
missing_counts = df.isna().sum()
print(column_types, '\n\n', missing_counts)

In [None]:
# Shorten the emissions column name and discard the 'Code' column.
# Written as a non-inplace chain so the cell is idempotent: on a re-run the
# rename is a no-op and errors='ignore' keeps the drop from raising KeyError
# once 'Code' is already gone.
df = (
    df.rename(columns={'Annual CO₂ emissions (tonnes )': 'Emissions'})
      .drop(columns=['Code'], errors='ignore')
)
df.head()

## Measuring overall Emissions footprint for the given Entities

In [None]:
# Total emissions per entity over all available years.
# Only 'Emissions' is aggregated: summing every column would also add up the
# 'Year' values, which is meaningless, and would pull in any other column
# still present in df.
temp = df.groupby('Entity', as_index=False)['Emissions'].sum()

In [None]:
# Treemap of the per-entity emission totals: one tile per entity, sized by 'Emissions'.

fig = px.treemap(data_frame=temp, path=['Entity'], values='Emissions')
fig.show()

Checking for emission values across the time range

In [None]:
# For every year, count how many entities have no emissions value, and plot that
# count against the year.
missing_per_year = (
    df.pivot(index='Entity', columns='Year', values='Emissions')
      .isna()
      .sum()
)
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(missing_per_year)
ax.set_title('Number of countries without data across the years')

As we can see, data is not available for most of the countries/regions prior to the 1960s.
So, let's look at the rows of the dataframe that contain no null values.

In [None]:
# Wide table: one row per entity, one column per year, cells hold emissions.
dfPivot = df.pivot(values='Emissions', index='Entity', columns='Year')
# Keep only the entities that have a value for every single year.
has_full_history = dfPivot.notna().all(axis=1)
dfPivot.loc[has_full_history]

As it turns out, the regions of the world (along with the United States and the United Kingdom) have data since 1751. As 'Asia and Pacific (Other)' has the same footprint as China, we treat China as a region too and add it to the region dataframe.

In [None]:
# China's row, with years that have no recorded value filled with 0.
China = dfPivot.query("Entity == 'China'").fillna(0)
# Entities with a complete history, plus China.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
RegionDF = pd.concat([dfPivot.loc[dfPivot.isna().sum(axis=1) == 0], China])

In [None]:
# Line chart with years on the x-axis and one line per region.
sb.set_style('whitegrid')
RegionDF.transpose().plot.line(figsize=(20, 15))

Let's confine our exploration of emissions to the top 10 countries by emissions in the latest year (2017) and the top 10 countries with the highest growth rate over the years.

In [None]:
# Entities with at least one missing year, plus three explicitly named countries.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
CountryDF = pd.concat([
    dfPivot.loc[dfPivot.isna().sum(axis=1) != 0],
    dfPivot.query("Entity == ['China','United States','United Kingdom']"),
])
# A named country that itself has gaps is matched by both selections above
# (presumably the case for China); keep a single row per entity.
CountryDF = CountryDF.loc[~CountryDF.index.duplicated(keep='first')]
# Restrict to 1960-2017 and transpose: rows are years, columns are entities.
CountryDF = CountryDF.loc[:, 1960:2017].T
CountryDF

In [None]:
# Total number of missing cells across the whole table.
CountryDF.isna().to_numpy().sum()

As many countries are still missing values, let's impute the data so that we can calculate the growth rates of emissions.

In [None]:
# Run impyute's `mice` imputer on the raw value matrix, then restore the original
# row and column labels on the result.
imputed = mice(CountryDF.to_numpy())
CountryDF = pd.DataFrame(data=imputed, index=CountryDF.index, columns=CountryDF.columns)
# Count the missing cells that remain after imputation.
CountryDF.isna().sum().sum()

> ## Creating annual growth rate column taking 1960 as baseline

In [None]:
# Swap rows and columns so that the year labels can be selected as columns below.
CountryDF = CountryDF.transpose()

In [None]:
# Compound annual growth rate of emissions between the 1960 baseline and 2017:
#     CAGR = (E_2017 / E_1960) ** (1 / n_years) - 1
# The RATIO of end to start value is annualised, not their difference.
# Entities whose 1960 value is zero or negative yield inf/NaN here.
n_years = 2017 - 1960
CountryDF['CAGR'] = (CountryDF[2017] / CountryDF[1960]) ** (1 / n_years) - 1
CountryDF

In [None]:
# Simple (non-compounded) average growth per year, expressed in percent of the
# 1960 level: total percentage change from 1960 to 2017 divided by the number of years.
n_years = 2017 - 1960
CountryDF['AnnualGrowthRate'] = (((CountryDF[2017] - CountryDF[1960]) / CountryDF[1960]) * 100) / n_years

# Rank only distinct entities whose rate is a finite number: a zero 1960 baseline
# gives inf/NaN, which would otherwise sort to the top or poison the ranking.
is_distinct = ~CountryDF.index.duplicated()
has_finite_rate = np.isfinite(CountryDF['AnnualGrowthRate'].to_numpy())
rankable = CountryDF.loc[is_distinct & has_finite_rate]

# head(10) returns the ten fastest-growing entities; the previous [1:10] slice
# skipped the top entry and returned only nine rows.
Top10Growth = rankable.sort_values(by=['AnnualGrowthRate'], ascending=False).head(10)
Top10Growth

In [None]:
# Ten largest emitters in 2017.
# Duplicate index labels are removed first so that no entity can occupy two slots.
# head(10) returns ten rows starting with the largest emitter; the previous
# [1:10] slice skipped the top entry and returned only nine rows.
# NOTE(review): if the top row is an aggregate that should be excluded, filter it
# out by name rather than by position.
distinct_entities = CountryDF.loc[~CountryDF.index.duplicated()]
Top10 = distinct_entities.sort_values(by=[2017], ascending=False).head(10)
Top10

In [None]:
# Drop the two derived growth columns so only the yearly values remain, then plot
# one line per entity with the years on the x-axis.
yearly_emissions = Top10.drop(columns=['AnnualGrowthRate', 'CAGR']).transpose()
yearly_emissions.plot(figsize=(18, 12), title='Emissions over the years')

In [None]:
# Bar chart of the 2017 emissions for the entities in Top10.
ax = Top10[2017].plot.bar(figsize=(18, 12))
ax.set_title('Emissions in 2017')

In [None]:
# Bar chart of the base-10 logarithm of the annual growth rate, so that very
# different magnitudes fit on one axis. Note the bars show log10 values, not
# the raw percentages.
log_growth = np.log10(Top10Growth['AnnualGrowthRate'])
log_growth.plot.bar(figsize=(18, 12), title='Emissions growth since 1960')