# Import libraries

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import world_bank_data as wb
from kaleido.scopes.plotly import PlotlyScope

# Load and preprocess data:

In [None]:
# World Bank indicator codes; each one is passed to `wb.get_series` below and
# then used as the column name to rename in the resulting data frame.
CO2_EMISSION_DATA = "EN.ATM.CO2E.KT"  # CO2 emissions
COUNTRY_POPULATION_SIZE_DATA = "SP.POP.TOTL"  # total population
POVERTY_HEADCOUNT_RATIO_DATA = "SI.POV.DDAY"  # poverty headcount ratio
COUNTRY_LAND_AREA_DATA = "AG.LND.TOTL.K2"  # land area
COUNTRY_GDP_DATA = "NY.GDP.MKTP.CD"  # GDP

## CO2 emissions data:

In [None]:
# Fetch the CO2 emissions series, flatten its index into columns and give the
# indicator-code column a readable label.
co2_series = wb.get_series(CO2_EMISSION_DATA, simplify_index=True)
co2_emissions_df = co2_series.reset_index().rename(
    columns={CO2_EMISSION_DATA: "co2 emissions"}
)

In [None]:
co2_emissions_df.head()

## Country population data:

In [None]:
# 2019 is the most recent year with non-missing population data.
population_series = wb.get_series(
    COUNTRY_POPULATION_SIZE_DATA, date="2019", simplify_index=True
)
country_population_df = population_series.reset_index()
country_population_df = country_population_df.rename(
    columns={COUNTRY_POPULATION_SIZE_DATA: "population size"}
)

In [None]:
country_population_df.head()

## Country poverty percentage data:

In [None]:
# Fetch the poverty headcount ratio series (all available years) as a flat
# data frame with a readable value column.
poverty_series = wb.get_series(POVERTY_HEADCOUNT_RATIO_DATA, simplify_index=True)
poverty_percentages_df = poverty_series.reset_index().rename(
    columns={POVERTY_HEADCOUNT_RATIO_DATA: "poverty percentage"}
)

In [None]:
poverty_percentages_df.head()

## Country land area data:

In [None]:
# 2017 is the most recent year with non-missing land area data.
land_area_series = wb.get_series(
    COUNTRY_LAND_AREA_DATA, date="2017", simplify_index=True
)
country_land_area_df = land_area_series.reset_index()
country_land_area_df = country_land_area_df.rename(
    columns={COUNTRY_LAND_AREA_DATA: "land area"}
)

In [None]:
country_land_area_df.head()

## Country GDP data:

In [None]:
# Fetch the GDP series (all available years) as a flat data frame with a
# readable value column.
gdp_series = wb.get_series(COUNTRY_GDP_DATA, simplify_index=True)
country_gdp_df = gdp_series.reset_index().rename(columns={COUNTRY_GDP_DATA: "GDP"})

In [None]:
country_gdp_df.head()

## Country income group data:

In [None]:
# Country metadata table; `reset_index` moves its index into a regular column.
# Later cells read its "name", "region" and "incomeLevel" columns.
countries_df = wb.get_countries().reset_index()

In [None]:
countries_df.head()

## Create aggregated data:

In [None]:
# Sum the emissions of every row belonging to a country, then order the
# countries from the largest total to the smallest.
emissions_by_country = co2_emissions_df.groupby(by="Country").agg(
    {"co2 emissions": "sum"}
)
co2_emissions_agg_df = emissions_by_country.reset_index().sort_values(
    by="co2 emissions", ascending=False
)

In [None]:
co2_emissions_agg_df.head()

## Merge data into two data frames:

### Merge non-time-based statistics:

In [None]:
# Columns removed from the merged `overall_stats_df` in the next cell.
# "name" duplicates "Country" after the merge (it is the right-hand join key);
# the others are presumably metadata columns of `countries_df` that the
# analysis does not use.
columns_to_drop = [
    "id",
    "iso2Code",
    "name",
    "adminregion",
    "lendingType",
    "capitalCity",
    "longitude",
    "latitude",
]

In [None]:
# Build the per-country table step by step: total emissions + land area
# + population + country metadata, then drop unused columns and order by
# total emissions (largest first).
merged_stats_df = co2_emissions_agg_df.merge(right=country_land_area_df, on="Country")
merged_stats_df = merged_stats_df.merge(right=country_population_df, on="Country")
merged_stats_df = merged_stats_df.merge(
    right=countries_df, left_on="Country", right_on="name"
)
overall_stats_df = merged_stats_df.drop(columns=columns_to_drop).sort_values(
    by="co2 emissions", ascending=False
)

In [None]:
overall_stats_df.head()

### Merge time-based data frames:

In [None]:
# Combine the yearly series on (Country, Year) and order the rows by year.
join_keys = ["Country", "Year"]
emissions_gdp_df = co2_emissions_df.merge(right=country_gdp_df, on=join_keys)
time_series_df = emissions_gdp_df.merge(
    right=poverty_percentages_df, on=join_keys
).sort_values(by="Year")

In [None]:
time_series_df.head()

## Convert `dtypes`:

### `overall_stats_df`:

In [None]:
overall_stats_df.dtypes

### `time_series_df`:

In [None]:
time_series_df.dtypes

In [None]:
# Cast "Year" to an integer dtype so that later integer comparisons such as
# `x["Year"] == 2016` match rows (presumably the API delivers year labels as
# strings — confirm with the `dtypes` output above).
time_series_df = time_series_df.astype({"Year": "int"})

In [None]:
time_series_df.dtypes

## Drop non-country rows

In [None]:
# Names of all entries whose region is "Aggregates" (i.e. not single countries).
is_aggregate = countries_df["region"] == "Aggregates"
non_countries = countries_df.loc[is_aggregate, "name"].values

In [None]:
# Two more names to treat as non-countries in addition to the "Aggregates" entries.
extra_non_countries = ["Latin America & Caribbean", "Sub-Saharan Africa"]
non_countries = np.append(non_countries, extra_non_countries)

### `overall_stats_df`:

In [None]:
overall_stats_df.shape

In [None]:
# Remove, in place, every row whose "Country" is listed in `non_countries`.
is_non_country = overall_stats_df["Country"].isin(non_countries)
rows_to_remove = overall_stats_df.loc[is_non_country].index
overall_stats_df.drop(index=rows_to_remove, inplace=True)

In [None]:
overall_stats_df.shape

### `time_series_df`:

In [None]:
time_series_df.shape

In [None]:
# Remove, in place, every row whose "Country" is listed in `non_countries`.
is_non_country = time_series_df["Country"].isin(non_countries)
rows_to_remove = time_series_df.loc[is_non_country].index
time_series_df.drop(index=rows_to_remove, inplace=True)

In [None]:
time_series_df.shape

## Create new statistics:

In [None]:
overall_stats_df.head()

### CO2 emissions per square km:

In [None]:
# Ratio of each country's total emissions to its land area.
overall_stats_df["emissions per area"] = overall_stats_df["co2 emissions"].div(
    overall_stats_df["land area"]
)

### CO2 emissions per capita:

In [None]:
# Ratio of each country's total emissions to its population size.
overall_stats_df["emissions per capita"] = overall_stats_df["co2 emissions"].div(
    overall_stats_df["population size"]
)

# Data visualization:

In [None]:
def get_top_n_countries(n: int, by: str):
    """Return the names of the ``n`` countries ranking highest on column ``by``.

    Reads the notebook-level ``overall_stats_df``, orders it by ``by`` in
    descending order and returns the ``"Country"`` values of the first ``n``
    rows as an array.
    """
    ranked_df = overall_stats_df.sort_values(by=by, ascending=False)
    top_rows = ranked_df.iloc[:n]
    return top_rows["Country"].values

## Who emitted the most in recent years?

Let's show the countries that emitted the most in `2016`:

In [None]:
# The 15 largest emitters among the rows recorded for 2016.
emissions_2016_df = time_series_df.loc[time_series_df["Year"] == 2016]
top_emitters_2016_df = emissions_2016_df.sort_values(
    by="co2 emissions", ascending=False
).iloc[:15]
px.bar(data_frame=top_emitters_2016_df, x="Country", y="co2 emissions")

## Which countries emitted the most CO2 in total?

Let's visualize emissions throughout history:

In [None]:
# Share of total emissions among the ten largest total emitters.
top_total_emitters = get_top_n_countries(10, "co2 emissions")
top_total_emitters_df = overall_stats_df.loc[
    overall_stats_df["Country"].isin(top_total_emitters)
]
fig = px.pie(
    data_frame=top_total_emitters_df,
    names="Country",
    values="co2 emissions",
    color="Country",
    color_discrete_sequence=px.colors.sequential.Reds_r,
)
fig.show()

In [None]:
# Total emissions of the ten largest total emitters, coloured by magnitude.
top_total_emitters = get_top_n_countries(10, "co2 emissions")
top_total_emitters_df = overall_stats_df.loc[
    overall_stats_df["Country"].isin(top_total_emitters)
]
px.bar(
    data_frame=top_total_emitters_df,
    x="Country",
    y="co2 emissions",
    color="co2 emissions",
)

## Treemap chart:

In [None]:
# TODO: add treemap chart

## Emissions per capita:

If a country has more people in general, then its emissions will of course be higher.

In [None]:
overall_stats_df.head()

In [None]:
# The ten countries with the highest emissions per capita, largest first.
top_per_capita = get_top_n_countries(n=10, by="emissions per capita")
top_per_capita_df = overall_stats_df.loc[
    overall_stats_df["Country"].isin(top_per_capita)
].sort_values(by="emissions per capita", ascending=False)
px.bar(data_frame=top_per_capita_df, x="Country", y="emissions per capita")

## Emissions per country area:

In [None]:
# The fifteen countries with the highest emissions per unit of land area,
# largest first.
top_per_area = get_top_n_countries(n=15, by="emissions per area")
top_per_area_df = overall_stats_df.loc[
    overall_stats_df["Country"].isin(top_per_area)
].sort_values(by="emissions per area", ascending=False)
px.bar(data_frame=top_per_area_df, x="Country", y="emissions per area")

## How CO2 emissions increased by country

In [None]:
# Yearly emissions of the seven largest total emitters, one line per country.
top_seven_emitters = get_top_n_countries(n=7, by="co2 emissions")
top_seven_df = time_series_df.loc[time_series_df["Country"].isin(top_seven_emitters)]
px.line(
    data_frame=top_seven_df,
    x="Year",
    y="co2 emissions",
    line_group="Country",
    color="Country",
)

## How are CO2 emissions related to GDP and poverty:

In [None]:
# Emissions, GDP and poverty over time for the three largest total emitters,
# one subplot row per metric.
metrics = ["co2 emissions", "GDP", "poverty percentage"]
fig = make_subplots(
    rows=len(metrics), cols=1, shared_xaxes=True, subplot_titles=metrics
)

temp_df = time_series_df.loc[
    lambda x: x["Country"].isin(get_top_n_countries(n=3, by="co2 emissions"))
]

# Each country gets its own trace per row: putting all countries into a single
# trace makes plotly join points of different countries into one zig-zag line.
for country, country_df in temp_df.groupby("Country"):
    country_df = country_df.sort_values(by="Year")
    for row, metric in enumerate(metrics, start=1):
        fig.add_trace(
            go.Scatter(
                x=country_df["Year"],
                y=country_df[metric],
                name=country,
                legendgroup=country,
                # One legend entry per country instead of one per trace.
                showlegend=(row == 1),
            ),
            row=row,
            col=1,
        )

fig.show()

In [None]:
overall_stats_df.head()

In [None]:
# GDP per country for 2019, largest first.
# Only `time_series_df` had its "Year" column cast to int; `country_gdp_df`
# still carries the labels as delivered by the API. Casting here makes the
# filter match whether those labels are strings or integers; without it a
# string "2019" never equals the integer 2019 and the result is empty.
country_gdp_2019_df = country_gdp_df.loc[
    lambda x: x["Year"].astype(int) == 2019
].sort_values(by="GDP", ascending=False)[["Country", "GDP"]]

In [None]:
# Add the 2019 GDP column to the per-country table (rows are matched on "Country").
overall_stats_df = pd.merge(overall_stats_df, country_gdp_2019_df, on="Country")

In [None]:
overall_stats_df.head()

In [None]:
# Emissions vs. GDP for the first 30 rows of the table, marked by income level.
first_thirty_df = overall_stats_df.iloc[:30]
px.scatter(
    first_thirty_df,
    x="co2 emissions",
    y="GDP",
    color="incomeLevel",
    symbol="incomeLevel",
)

In [None]:
# Rows whose income level is "High income".
is_high_income = overall_stats_df["incomeLevel"] == "High income"
high_income_countries_df = overall_stats_df.loc[is_high_income]

In [None]:
# Emissions vs. GDP for high-income countries; hovering shows the country name.
px.scatter(
    high_income_countries_df, x="co2 emissions", y="GDP", hover_data=["Country"]
)

In [None]:
# Rows whose income level is "Upper middle income".
is_upper_middle_income = overall_stats_df["incomeLevel"] == "Upper middle income"
upper_middle_income_countries_df = overall_stats_df.loc[is_upper_middle_income]

In [None]:
upper_middle_income_countries_df.head()

In [None]:
# Emissions vs. GDP for upper-middle-income countries; hovering shows the
# country name.
px.scatter(
    upper_middle_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)

In [None]:
# Rows whose income level is "Lower middle income".
is_lower_middle_income = overall_stats_df["incomeLevel"] == "Lower middle income"
lower_middle_income_countries_df = overall_stats_df.loc[is_lower_middle_income]

In [None]:
# Emissions vs. GDP for lower-middle-income countries; hovering shows the
# country name.
px.scatter(
    lower_middle_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)

In [None]:
# Rows whose income level is "Low income".
is_low_income = overall_stats_df["incomeLevel"] == "Low income"
low_income_countries_df = overall_stats_df.loc[is_low_income]

In [None]:
# Emissions vs. GDP for low-income countries; hovering shows the country name.
px.scatter(
    low_income_countries_df, x="co2 emissions", y="GDP", hover_data=["Country"]
)