# Install Libraries

In [1]:
!pip install plotly
!pip install seaborn
!pip install world_bank_data

Collecting plotly
  Downloading plotly-4.13.0-py2.py3-none-any.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 14.5 MB/s 
[?25hCollecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=aaab3b4a41f8e4264d8f5cc937919d44bf8f647f1214fc8bc28663b9cd1a70ba
  Stored in directory: /home/jovyan/.cache/pip/wheels/f9/8d/8d/f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.13.0 retrying-1.3.3
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m
Collecting seaborn
  Downloading seaborn-0.11.0-py3-none-any.whl (283 kB)
[K     |████████████████████████████████| 283 kB 17.1 MB/s 
Installing collected packages: seaborn


# Import libraries

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import world_bank_data as wb
# from kaleido.scopes.plotly import PlotlyScope

# Load data:

In [3]:
# World Bank indicator codes; each one is passed to `wb.get_series` below.
# NOTE(review): the meaning/units of each code are inferred from the constant
# names only — confirm against the World Bank indicator catalogue.
CO2_EMISSION_DATA = "EN.ATM.CO2E.KT"
COUNTRY_POPULATION_SIZE_DATA = "SP.POP.TOTL"
POVERTY_HEADCOUNT_RATIO_DATA = "SI.POV.DDAY"
COUNTRY_LAND_AREA_DATA = "AG.LND.TOTL.K2"
COUNTRY_GDP_DATA = "NY.GDP.MKTP.CD"

## CO2 emissions data:

In [4]:
# Fetch the CO2 emissions indicator, move the index levels into regular
# columns, and give the value column a readable name.
co2_emissions_series = (
    wb.get_series(CO2_EMISSION_DATA, simplify_index=True)
    .reset_index()  # from here on this is a table, despite the "_series" name
    .rename(columns={CO2_EMISSION_DATA: "co2 emissions"})
)

In [5]:
# `co2_emissions_series` is already a DataFrame (it comes out of reset_index),
# so no constructor call is needed. An explicit copy guarantees that later
# in-place edits to this frame cannot touch the fetched object.
co2_emissions_df = co2_emissions_series.copy()

In [6]:
co2_emissions_df.head()

Unnamed: 0,Country,Year,co2 emissions
0,Arab World,1960,56005.299798
1,Arab World,1961,62578.60559
2,Arab World,1962,70562.050726
3,Arab World,1963,85085.751513
4,Arab World,1964,99693.913771


## Country population data:

In [7]:
# year 2019 has the most recent non-missing data
# Fetch population for that single year, move the index into a column, and
# give the value column a readable name.
country_population_series = (
    wb.get_series(COUNTRY_POPULATION_SIZE_DATA, simplify_index=True, date="2019")
    .reset_index()  # from here on this is a table, despite the "_series" name
    .rename(columns={COUNTRY_POPULATION_SIZE_DATA: "population size"})
)

In [8]:
# `country_population_series` is already a DataFrame (it comes out of
# reset_index). An explicit copy guarantees that later in-place edits to this
# frame cannot touch the fetched object.
country_population_df = country_population_series.copy()

In [9]:
country_population_df.head()

Unnamed: 0,Country,population size
0,Arab World,427870300.0
1,Caribbean small states,7401381.0
2,Central Europe and the Baltics,102378600.0
3,Early-demographic dividend,3290291000.0
4,East Asia & Pacific,2340628000.0


## Country poverty percentage data:

In [10]:
# Fetch the poverty headcount ratio indicator for all available years, move
# the index levels into regular columns, and rename the value column.
poverty_percentages_series = (
    wb.get_series(POVERTY_HEADCOUNT_RATIO_DATA, simplify_index=True)
    .reset_index()  # from here on this is a table, despite the "_series" name
    .rename(columns={POVERTY_HEADCOUNT_RATIO_DATA: "poverty percentage"})
)

In [11]:
# `poverty_percentages_series` is already a DataFrame (it comes out of
# reset_index). An explicit copy guarantees that later in-place edits to this
# frame cannot touch the fetched object.
poverty_percentages_df = poverty_percentages_series.copy()

In [12]:
poverty_percentages_df.head()

Unnamed: 0,Country,Year,poverty percentage
0,Arab World,1960,
1,Arab World,1961,
2,Arab World,1962,
3,Arab World,1963,
4,Arab World,1964,


## Country land area data:

In [13]:
# year 2017 has the most recent non-missing data
# Fetch land area for that single year, move the index into a column, and
# give the value column a readable name.
country_land_area_series = (
    wb.get_series(COUNTRY_LAND_AREA_DATA, simplify_index=True, date="2017")
    .reset_index()  # from here on this is a table, despite the "_series" name
    .rename(columns={COUNTRY_LAND_AREA_DATA: "land area"})
)

In [14]:
# `country_land_area_series` is already a DataFrame (it comes out of
# reset_index). An explicit copy guarantees that later in-place edits to this
# frame cannot touch the fetched object.
country_land_area_df = country_land_area_series.copy()

In [15]:
country_land_area_df.head()

Unnamed: 0,Country,land area
0,Arab World,11232650.0
1,Caribbean small states,404850.0
2,Central Europe and the Baltics,1105054.0
3,Early-demographic dividend,33107750.0
4,East Asia & Pacific,24396750.0


## Country GDP data:

In [16]:
# Fetch the GDP indicator for all available years, move the index levels into
# regular columns, and rename the value column.
country_gdp_series = (
    wb.get_series(COUNTRY_GDP_DATA, simplify_index=True)
    .reset_index()  # from here on this is a table, despite the "_series" name
    .rename(columns={COUNTRY_GDP_DATA: "GDP"})
)

In [17]:
# `country_gdp_series` is already a DataFrame (it comes out of reset_index).
# An explicit copy guarantees that later in-place edits to this frame cannot
# touch the fetched object.
country_gdp_df = country_gdp_series.copy()

In [18]:
country_gdp_df.head()

Unnamed: 0,Country,Year,GDP
0,Arab World,1960,
1,Arab World,1961,
2,Arab World,1962,
3,Arab World,1963,
4,Arab World,1964,


## Country income group data:

In [19]:
# Country metadata (id, name, region, incomeLevel, ...) with the index moved
# into a regular column. Aggregate entries are included and are marked with
# region == "Aggregates".
countries_df = wb.get_countries().reset_index()

In [20]:
countries_df.head()

Unnamed: 0,id,iso2Code,name,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,,High income,Not classified,Oranjestad,-70.0167,12.5167
1,AFG,AF,Afghanistan,South Asia,South Asia,Low income,IDA,Kabul,69.1761,34.5228
2,AFR,A9,Africa,Aggregates,,Aggregates,Aggregates,,,
3,AGO,AO,Angola,Sub-Saharan Africa,Sub-Saharan Africa (excluding high income),Lower middle income,IBRD,Luanda,13.242,-8.81155
4,ALB,AL,Albania,Europe & Central Asia,Europe & Central Asia (excluding high income),Upper middle income,IBRD,Tirane,19.8172,41.3317


# Preprocess data:

## Convert `dtypes`:

### CO2 emissions data:

In [21]:
co2_emissions_df.dtypes

Country           object
Year              object
co2 emissions    float64
dtype: object

In [22]:
# Year arrives with object dtype; cast it to int so it can be compared with
# numeric years (e.g. `== 2016`) and sorted numerically later on.
co2_emissions_df["Year"] = co2_emissions_df["Year"].astype(int)

In [23]:
co2_emissions_df.dtypes

Country           object
Year               int64
co2 emissions    float64
dtype: object

### Country population data:

In [24]:
country_population_df.dtypes

Country             object
population size    float64
dtype: object

### Country poverty percentage data:

In [25]:
# Inspect the working frame (the one converted in the next cell), in line with
# the dtype checks for the other tables.
poverty_percentages_df.dtypes

Country                object
Year                   object
poverty percentage    float64
dtype: object

In [26]:
# Year arrives with object dtype; cast it to int so it can be merged on and
# compared as a number.
poverty_percentages_df["Year"] = poverty_percentages_df["Year"].astype(int)

In [27]:
poverty_percentages_df.dtypes

Country                object
Year                    int64
poverty percentage    float64
dtype: object

### Country land area data:

In [28]:
country_land_area_df.dtypes

Country       object
land area    float64
dtype: object

### Country GDP data:

In [29]:
country_gdp_df.dtypes

Country     object
Year        object
GDP        float64
dtype: object

In [30]:
# Year arrives with object dtype; cast it to int so it can be merged on and
# compared with numeric years (e.g. `== 2019`).
country_gdp_df["Year"] = country_gdp_df["Year"].astype(int)

In [31]:
country_gdp_df.dtypes

Country     object
Year         int64
GDP        float64
dtype: object

## Drop non-country rows

In [32]:
# Names of every entry the World Bank classifies as an aggregate rather than
# a single country.
is_aggregate = countries_df["region"] == "Aggregates"
non_countries = countries_df.loc[is_aggregate, "name"].values

In [33]:
# Two more aggregate names to exclude by hand.
# NOTE(review): presumably these are spelled differently in `countries_df`
# than in the indicator tables — confirm.
non_countries = np.append(
    non_countries, ["Latin America & Caribbean", "Sub-Saharan Africa"]
)

In [34]:
# Remove aggregate (non-country) rows from every table, printing each table's
# shape before and after. The frames are modified in place.
for df in (
    co2_emissions_df,
    country_population_df,
    poverty_percentages_df,
    country_land_area_df,
    country_gdp_df,
):
    print(f"{df.columns[-1]} shape before: {df.shape}")
    df.drop(index=df.index[df["Country"].isin(non_countries)], inplace=True)
    print(f"{df.columns[-1]} shape after: {df.shape}")

co2 emissions shape before: (16104, 3)
co2 emissions shape after: (13237, 3)
population size shape before: (264, 2)
population size shape after: (217, 2)
poverty percentage shape before: (16104, 3)
poverty percentage shape after: (13237, 3)
land area shape before: (264, 2)
land area shape after: (217, 2)
GDP shape before: (16104, 3)
GDP shape after: (13237, 3)


## Create aggregated data:

In [35]:
# Total emissions per country, summed over every available year, with the
# largest emitters first.
co2_emissions_agg_df = (
    co2_emissions_df.groupby("Country")[["co2 emissions"]]
    .sum()
    .reset_index()
    .sort_values("co2 emissions", ascending=False)
)

In [36]:
co2_emissions_agg_df.head()

Unnamed: 0,Country,co2 emissions
206,United States,269847800.0
41,China,189678900.0
161,Russian Federation,133252600.0
98,Japan,54256950.0
89,India,43495700.0


## Merge data into two data frames:

### Merge non-time-based statistics:

In [37]:
# Country-metadata columns that are not needed after merging; of the
# `countries_df` columns only "region" and "incomeLevel" are kept.
columns_to_drop = [
    "id",
    "iso2Code",
    "name",  # duplicates "Country" once the merge on name has been done
    "adminregion",
    "lendingType",
    "capitalCity",
    "longitude",
    "latitude",
]

In [38]:
# One row per country: cumulative emissions joined with land area, population
# and country metadata. All joins are inner joins, so a country missing from
# any of the tables is left out.
overall_stats_df = (
    co2_emissions_agg_df.merge(country_land_area_df, how="inner", on="Country")
    .merge(country_population_df, how="inner", on="Country")
    .merge(countries_df, how="inner", left_on="Country", right_on="name")
    .drop(columns_to_drop, axis="columns")
    .sort_values("co2 emissions", ascending=False)
)

In [39]:
overall_stats_df.head()

Unnamed: 0,Country,co2 emissions,land area,population size,region,incomeLevel
0,United States,269847800.0,9147420.0,328239500.0,North America,High income
1,China,189678900.0,9388210.0,1397715000.0,East Asia & Pacific,Upper middle income
2,Russian Federation,133252600.0,16376870.0,144373500.0,Europe & Central Asia,Upper middle income
3,Japan,54256950.0,364560.0,126264900.0,East Asia & Pacific,High income
4,India,43495700.0,2973190.0,1366418000.0,South Asia,Lower middle income


### Merge time-based data frames:

In [40]:
# One row per (country, year): emissions joined with GDP and poverty
# percentage, ordered chronologically.
time_series_df = (
    co2_emissions_df.merge(country_gdp_df, how="inner", on=["Country", "Year"])
    .merge(poverty_percentages_df, how="inner", on=["Country", "Year"])
    .sort_values("Year")
)

In [41]:
time_series_df.head()

Unnamed: 0,Country,Year,co2 emissions,GDP,poverty percentage
0,Afghanistan,1960,414.371,537777800.0,
9272,Papua New Guinea,1960,179.683,230496000.0,
7015,Lithuania,1960,,,
6222,Kiribati,1960,,,
12383,Ukraine,1960,,,


## Create new statistics:

In [42]:
overall_stats_df.head()

Unnamed: 0,Country,co2 emissions,land area,population size,region,incomeLevel
0,United States,269847800.0,9147420.0,328239500.0,North America,High income
1,China,189678900.0,9388210.0,1397715000.0,East Asia & Pacific,Upper middle income
2,Russian Federation,133252600.0,16376870.0,144373500.0,Europe & Central Asia,Upper middle income
3,Japan,54256950.0,364560.0,126264900.0,East Asia & Pacific,High income
4,India,43495700.0,2973190.0,1366418000.0,South Asia,Lower middle income


### CO2 emissions per square km:

In [43]:
# Cumulative emissions (summed over all years) divided by the 2017 land area.
overall_stats_df["emissions per area"] = overall_stats_df["co2 emissions"].div(
    overall_stats_df["land area"]
)

### CO2 emissions per capita:

In [44]:
# Cumulative emissions (summed over all years) divided by the 2019 population,
# i.e. not a per-year per-capita figure.
overall_stats_df["emissions per capita"] = overall_stats_df["co2 emissions"].div(
    overall_stats_df["population size"]
)

# Data visualization:

In [45]:
def get_top_n_countries(
    n: int, by: str, stats_df: "pd.DataFrame | None" = None
) -> np.ndarray:
    """Return the names of the ``n`` countries with the largest ``by`` value.

    Args:
        n: Number of countries to return.
        by: Column to rank by, in descending order.
        stats_df: Table with a "Country" column and the ``by`` column. When
            omitted, the notebook-level ``overall_stats_df`` is used, which
            keeps existing two-argument calls working unchanged.

    Returns:
        Array of "Country" values, largest ``by`` first. It holds fewer than
        ``n`` entries when the table has fewer than ``n`` rows.
    """
    if stats_df is None:
        # Looked up at call time, so columns added to the global frame after
        # this function was defined are visible.
        stats_df = overall_stats_df
    return stats_df.sort_values(by=by, ascending=False)[:n]["Country"].values

## Who emitted the most in the last years?

Let's show the countries that emitted the most in `2016`:

In [46]:
# The 15 largest emitters in 2016, largest first.
px.bar(
    data_frame=(
        time_series_df[time_series_df["Year"] == 2016]
        .sort_values("co2 emissions", ascending=False)
        .head(15)
    ),
    x="Country",
    y="co2 emissions",
)

## Which countries emitted the most CO2 in total?

Let's visualize emissions throughout history:

In [47]:
# Share of cumulative emissions among the ten largest emitters.
fig = px.pie(
    data_frame=overall_stats_df[
        overall_stats_df["Country"].isin(get_top_n_countries(10, "co2 emissions"))
    ],
    names="Country",
    values="co2 emissions",
    color="Country",
    color_discrete_sequence=px.colors.sequential.Reds_r,
)
fig.show()

In [48]:
# Cumulative emissions of the ten largest emitters, bars coloured by value.
px.bar(
    data_frame=overall_stats_df[
        overall_stats_df["Country"].isin(get_top_n_countries(10, "co2 emissions"))
    ],
    x="Country",
    y="co2 emissions",
    color="co2 emissions",
)

## Treemap chart:

In [49]:
# TODO: add treemap chart

## Emissions per capita:

If a country has more people in general, then its total emissions will of course tend to be higher, so it is also worth comparing emissions per capita.

In [50]:
overall_stats_df.head()

Unnamed: 0,Country,co2 emissions,land area,population size,region,incomeLevel,emissions per area,emissions per capita
0,United States,269847800.0,9147420.0,328239500.0,North America,High income,29.499881,0.822106
1,China,189678900.0,9388210.0,1397715000.0,East Asia & Pacific,Upper middle income,20.203943,0.135706
2,Russian Federation,133252600.0,16376870.0,144373500.0,Europe & Central Asia,Upper middle income,8.136636,0.922971
3,Japan,54256950.0,364560.0,126264900.0,East Asia & Pacific,High income,148.828579,0.429707
4,India,43495700.0,2973190.0,1366418000.0,South Asia,Lower middle income,14.629305,0.031832


In [51]:
# Ten countries with the highest cumulative emissions per capita.
px.bar(
    data_frame=overall_stats_df[
        overall_stats_df["Country"].isin(
            get_top_n_countries(n=10, by="emissions per capita")
        )
    ].sort_values("emissions per capita", ascending=False),
    x="Country",
    y="emissions per capita",
)

## Emissions per country area:

In [52]:
# Fifteen countries with the highest cumulative emissions per unit of land area.
px.bar(
    data_frame=overall_stats_df[
        overall_stats_df["Country"].isin(
            get_top_n_countries(n=15, by="emissions per area")
        )
    ].sort_values("emissions per area", ascending=False),
    x="Country",
    y="emissions per area",
)

## How CO2 emissions increased by country

In [53]:
# Yearly emissions of the seven largest cumulative emitters, one line each.
px.line(
    data_frame=time_series_df[
        time_series_df["Country"].isin(get_top_n_countries(n=7, by="co2 emissions"))
    ],
    x="Year",
    y="co2 emissions",
    line_group="Country",
    color="Country",
)

## How are CO2 emissions related to GDP and poverty?

In [54]:
# Compare the three largest cumulative emitters on emissions, GDP and poverty,
# one panel per indicator.
metrics = ["co2 emissions", "GDP", "poverty percentage"]
top_emitters = get_top_n_countries(n=3, by="co2 emissions")

fig = make_subplots(
    rows=len(metrics), cols=1, shared_xaxes=True, subplot_titles=metrics
)

for row, metric in enumerate(metrics, start=1):
    # One trace per country: a single trace over the combined frame would join
    # points from different countries into one zig-zag line.
    for color, country in zip(px.colors.qualitative.Plotly, top_emitters):
        country_df = time_series_df[time_series_df["Country"] == country]
        fig.add_trace(
            go.Scatter(
                x=country_df["Year"],
                y=country_df[metric],
                name=country,
                legendgroup=country,  # toggle a country in all panels at once
                showlegend=row == 1,  # list each country only once in the legend
                mode="lines+markers",  # markers keep isolated data points visible
                line={"color": color},  # same colour per country in every panel
            ),
            row=row,
            col=1,
        )

fig.show()

In [55]:
overall_stats_df.head()

Unnamed: 0,Country,co2 emissions,land area,population size,region,incomeLevel,emissions per area,emissions per capita
0,United States,269847800.0,9147420.0,328239500.0,North America,High income,29.499881,0.822106
1,China,189678900.0,9388210.0,1397715000.0,East Asia & Pacific,Upper middle income,20.203943,0.135706
2,Russian Federation,133252600.0,16376870.0,144373500.0,Europe & Central Asia,Upper middle income,8.136636,0.922971
3,Japan,54256950.0,364560.0,126264900.0,East Asia & Pacific,High income,148.828579,0.429707
4,India,43495700.0,2973190.0,1366418000.0,South Asia,Lower middle income,14.629305,0.031832


In [56]:
# GDP per country for 2019 only, largest economy first.
country_gdp_2019_df = country_gdp_df.loc[
    country_gdp_df["Year"] == 2019, ["Country", "GDP"]
].sort_values("GDP", ascending=False)

In [57]:
# Attach 2019 GDP. A GDP column left over from a previous run of this cell is
# dropped first, so re-running the cell does not produce GDP_x / GDP_y columns.
overall_stats_df = overall_stats_df.drop(columns="GDP", errors="ignore").merge(
    right=country_gdp_2019_df, on="Country"
)

In [58]:
overall_stats_df.head()

Unnamed: 0,Country,co2 emissions,land area,population size,region,incomeLevel,emissions per area,emissions per capita,GDP
0,United States,269847800.0,9147420.0,328239500.0,North America,High income,29.499881,0.822106,21374420000000.0
1,China,189678900.0,9388210.0,1397715000.0,East Asia & Pacific,Upper middle income,20.203943,0.135706,14342900000000.0
2,Russian Federation,133252600.0,16376870.0,144373500.0,Europe & Central Asia,Upper middle income,8.136636,0.922971,1699877000000.0
3,Japan,54256950.0,364560.0,126264900.0,East Asia & Pacific,High income,148.828579,0.429707,5081770000000.0
4,India,43495700.0,2973190.0,1366418000.0,South Asia,Lower middle income,14.629305,0.031832,2875142000000.0


In [59]:
# Cumulative emissions against 2019 GDP for the first 30 rows of the stats
# table, marked by income level.
px.scatter(
    data_frame=overall_stats_df.head(30),
    x="co2 emissions",
    y="GDP",
    color="incomeLevel",
    symbol="incomeLevel",
)

In [60]:
# Countries in the "High income" group only.
high_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "High income"
]

In [61]:
# Cumulative emissions against GDP for high-income countries; hovering a point
# shows the country name.
px.scatter(
    high_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)

In [62]:
# Countries in the "Upper middle income" group only.
upper_middle_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "Upper middle income"
]

In [63]:
upper_middle_income_countries_df.head()

Unnamed: 0,Country,co2 emissions,land area,population size,region,incomeLevel,emissions per area,emissions per capita,GDP
1,China,189678900.0,9388210.0,1397715000.0,East Asia & Pacific,Upper middle income,20.203943,0.135706,14342900000000.0
2,Russian Federation,133252600.0,16376870.0,144373500.0,Europe & Central Asia,Upper middle income,8.136636,0.922971,1699877000000.0
11,South Africa,17146690.0,1213090.0,58558270.0,Sub-Saharan Africa,Upper middle income,14.134725,0.292814,351431600000.0
12,Mexico,16821710.0,1943950.0,127575500.0,Latin America & Caribbean,Upper middle income,8.653364,0.131857,1258287000000.0
14,"Iran, Islamic Rep.",15086400.0,1628760.0,82913910.0,Middle East & North Africa,Upper middle income,9.262507,0.181953,


In [64]:
# Cumulative emissions against GDP for upper-middle-income countries; hovering
# a point shows the country name.
px.scatter(
    upper_middle_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)

In [65]:
# Countries in the "Lower middle income" group only.
lower_middle_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "Lower middle income"
]

In [69]:
# Cumulative emissions against GDP for lower-middle-income countries; hovering
# a point shows the country name.
px.scatter(
    data_frame=lower_middle_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)

In [67]:
# Countries in the "Low income" group only.
low_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "Low income"
]

In [68]:
# Cumulative emissions against GDP for low-income countries; hovering a point
# shows the country name.
px.scatter(
    data_frame=low_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)