# Import libraries:

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import world_bank_data as wb
from helpers import save_figure

# Load and preprocess data:

In [None]:
# World Bank indicator codes; each one is passed to `wb.get_series` below
# and is also the name of the value column that gets renamed afterwards.
CO2_EMISSION_DATA = "EN.ATM.CO2E.KT"  # CO2 emissions (presumably in kilotons, per the code suffix)
COUNTRY_POPULATION_SIZE_DATA = "SP.POP.TOTL"  # total population
COUNTRY_LAND_AREA_DATA = "AG.LND.TOTL.K2"  # land area (presumably in square km, per the code suffix)
COUNTRY_GDP_DATA = "NY.GDP.MKTP.CD"  # GDP

## CO2 emissions data:

In [None]:
# Download the CO2 emissions series, turn its index levels into regular
# columns and give the value column a readable name.
co2_emissions_df = wb.get_series(CO2_EMISSION_DATA, simplify_index=True)
co2_emissions_df = co2_emissions_df.reset_index()
co2_emissions_df = co2_emissions_df.rename(
    columns={CO2_EMISSION_DATA: "co2 emissions"}
)

In [None]:
co2_emissions_df.head()

## Country population data:

In [None]:
# Download the population series, turn its index levels into regular
# columns and give the value column a readable name.
country_population_df = wb.get_series(
    COUNTRY_POPULATION_SIZE_DATA, simplify_index=True
)
country_population_df = country_population_df.reset_index()
country_population_df = country_population_df.rename(
    columns={COUNTRY_POPULATION_SIZE_DATA: "population size"}
)

In [None]:
country_population_df.head()

## Country land area data:

In [None]:
# Only a single year is requested for land area: 2017 is the most recent
# year with non-missing data.
country_land_area_df = wb.get_series(
    COUNTRY_LAND_AREA_DATA, simplify_index=True, date="2017"
)
country_land_area_df = country_land_area_df.reset_index()
country_land_area_df = country_land_area_df.rename(
    columns={COUNTRY_LAND_AREA_DATA: "land area"}
)

In [None]:
country_land_area_df.head()

## Country GDP data:

In [None]:
# Download the GDP series, turn its index levels into regular columns and
# give the value column a readable name.
country_gdp_df = wb.get_series(COUNTRY_GDP_DATA, simplify_index=True)
country_gdp_df = country_gdp_df.reset_index()
country_gdp_df = country_gdp_df.rename(columns={COUNTRY_GDP_DATA: "GDP"})

In [None]:
country_gdp_df.head()

## Country income group data:

In [None]:
countries_df = wb.get_countries().reset_index()

In [None]:
countries_df.head()

## Create aggregated data:

In [None]:
# Total CO2 emissions per country over all available years, with the
# largest emitters first.
co2_emissions_agg_df = (
    co2_emissions_df.groupby(by="Country")["co2 emissions"].sum().reset_index()
)
co2_emissions_agg_df = co2_emissions_agg_df.sort_values(
    by="co2 emissions", ascending=False
)

In [None]:
co2_emissions_agg_df.head()

## Merge data into two data frames:

### Merge non-time-based statistics:

In [None]:
columns_to_drop = [
    "id",
    "iso2Code",
    "name",
    "adminregion",
    "lendingType",
    "capitalCity",
    "longitude",
    "latitude",
]

In [None]:
# Combine the per-country emission totals with land area and the country
# metadata, then discard the metadata columns listed in `columns_to_drop`.
overall_stats_df = pd.merge(
    left=co2_emissions_agg_df, right=country_land_area_df, on="Country"
)
overall_stats_df = pd.merge(
    left=overall_stats_df, right=countries_df, left_on="Country", right_on="name"
)
overall_stats_df = overall_stats_df.drop(columns=columns_to_drop)
overall_stats_df = overall_stats_df.sort_values(by="co2 emissions", ascending=False)

In [None]:
overall_stats_df.head()

### Merge time-based data frames:

In [None]:
# One row per (Country, Year) carrying emissions, GDP and population,
# ordered chronologically.
time_series_df = pd.merge(
    left=co2_emissions_df, right=country_gdp_df, on=["Country", "Year"]
)
time_series_df = pd.merge(
    left=time_series_df, right=country_population_df, on=["Country", "Year"]
)
time_series_df = time_series_df.sort_values(by="Year")

In [None]:
time_series_df.head()

## Convert `dtypes`:

### `overall_stats_df`:

In [None]:
overall_stats_df.dtypes

### `time_series_df`:

In [None]:
time_series_df.dtypes

In [None]:
time_series_df = time_series_df.astype({"Year": "int"})

In [None]:
time_series_df.dtypes

## Drop non-country rows

In [None]:
# Names of every entry whose region is "Aggregates", i.e. not a real country.
non_countries = countries_df.loc[countries_df["region"] == "Aggregates", "name"].values

In [None]:
# These two regional groupings are excluded by name, in addition to the
# entries already selected through the "Aggregates" region.
non_countries = np.append(
    non_countries, ["Latin America & Caribbean", "Sub-Saharan Africa"]
)

### `overall_stats_df`:

In [None]:
overall_stats_df.shape

In [None]:
# Remove, in place, every row whose "Country" is listed in `non_countries`.
overall_aggregate_rows = overall_stats_df["Country"].isin(non_countries)
overall_stats_df.drop(
    index=overall_stats_df.index[overall_aggregate_rows], inplace=True
)

In [None]:
overall_stats_df.shape

### `time_series_df`:

In [None]:
time_series_df.shape

In [None]:
# Remove, in place, every row whose "Country" is listed in `non_countries`.
series_aggregate_rows = time_series_df["Country"].isin(non_countries)
time_series_df.drop(
    index=time_series_df.index[series_aggregate_rows], inplace=True
)

In [None]:
time_series_df.shape

## Create new statistics:

In [None]:
overall_stats_df.head()

### CO2 emissions per square kilometer:

In [None]:
# overall_stats_df["emissions per area"] = (
#     overall_stats_df["co2 emissions"] / overall_stats_df["land area"]
# )

### CO2 emissions per capita:

In [None]:
# Emissions divided by population, computed row by row (per country and year).
time_series_df["emissions per capita"] = time_series_df["co2 emissions"].div(
    time_series_df["population size"]
)

# Data visualization:

In [None]:
def get_top_n_countries(n: int, by: str):
    """Return the names of the `n` highest-ranked countries.

    Reads the module-level `overall_stats_df`, orders its rows by the `by`
    column from largest to smallest and returns the "Country" values of the
    first `n` rows as an array.
    """
    ranked = overall_stats_df.sort_values(by=by, ascending=False)
    return ranked.iloc[:n]["Country"].values

## Global emissions throughout the years:

### Yearly global emissions:

Let's first take a look at CO2 emissions, and how they increased year by year:

In [None]:
# Total CO2 emissions per year across all remaining countries.
temp_df = (
    time_series_df.groupby(by="Year")
    # String alias instead of `np.sum`: passing NumPy callables to `agg` is
    # deprecated in recent pandas, and the other aggregations in this
    # notebook already use "sum".
    .agg({"co2 emissions": "sum"})
    .reset_index()
    .sort_values(by="Year")
    # Keep only years with a positive total; a year whose values are all
    # missing sums to 0 and is filtered out here.
    .loc[lambda x: x["co2 emissions"] > 0]
)

In [None]:
fig = px.bar(
    data_frame=temp_df,
    x="Year",
    y="co2 emissions",
    title="Total CO2 emissions every year",
    labels={"co2 emissions": "CO2 emissions"},
    color="co2 emissions",
    color_continuous_scale=["#FAA307", "#DC2F02"],
)

fig.show()

In [None]:
fig_name = "total-co2-emissions-every-year"
save_figure(fig, fig_name)

It's obvious that the emissions are increasing over the years, not at a constant pace, but the difference across each 10-year span is huge.

The emissions do not increase from every year to the next: sometimes they stay the same and sometimes they decrease.

Let's see the difference in total emissions between consecutive years:

In [None]:
temp_df["change in emissions"] = temp_df["co2 emissions"].diff()

In [None]:
temp_df["change in emissions label"] = "increase"

In [None]:
# Years whose total dropped compared with the previous year are relabelled.
emissions_dropped = temp_df["change in emissions"] < 0
temp_df.loc[emissions_dropped, "change in emissions label"] = "decrease"

In [None]:
fig = px.bar(
    data_frame=temp_df,
    x="Year",
    y="change in emissions",
    color="change in emissions label",
    title="Change in CO2 emissions every year",
)

fig.update_layout(legend_title_text="change in emissions")

fig.show()

In [None]:
fig_name = "change-in-co2-emissions-every-year"
save_figure(fig, fig_name)

We can see that in some years the emissions were actually **decreasing**, or increasing at a low pace, while in other years the increase was huge. In `1970` the total emissions increased by more than one million k.t., the same happened in `1991`, and `2010` had the biggest increase, at nearly two million k.t.

### Global emissions every decade:

The previous section showed the total global emissions for every single year, but we can see that there's a huge difference in the total emissions now and in the previous 40 years.

Let's see what the change for every decade (10 years) looks like:

In [None]:
# Total CO2 emissions per decade: each year is floored to the first year of
# its decade (e.g. 1987 -> 1980) and that value is used as the grouping key.
temp_df = (
    time_series_df.groupby(by=time_series_df.Year // 10 * 10)
    # String alias instead of `np.sum`: passing NumPy callables to `agg` is
    # deprecated in recent pandas, and the other aggregations in this
    # notebook already use "sum".
    .agg({"co2 emissions": "sum"})
    .reset_index()
    .sort_values(by="Year")
    # Keep only decades with a positive total.
    .loc[lambda x: x["co2 emissions"] > 0]
)

In [None]:
fig = px.bar(
    data_frame=temp_df,
    x="Year",
    y="co2 emissions",
    title="Total CO2 emissions every decade",
    labels={"co2 emissions": "CO2 emissions"},
    color="co2 emissions",
    color_continuous_scale=["#FAA307", "#DC2F02"],
)

fig.show()

In [None]:
fig_name = "total-co2-emissions-every-decade"
save_figure(fig, fig_name)

Aggregating the carbon dioxide emissions at the decade level reveals new information: the last decade (2000-2010) had the most global emissions.

Let's study the downward/upward trend in the change of carbon dioxide emissions at the decade level.

In [None]:
temp_df["change in emissions"] = temp_df["co2 emissions"].diff()

In [None]:
temp_df["change in emissions label"] = "increase"

In [None]:
# Decades whose total dropped compared with the previous decade are relabelled.
emissions_dropped = temp_df["change in emissions"] < 0
temp_df.loc[emissions_dropped, "change in emissions label"] = "decrease"

In [None]:
fig = px.bar(
    data_frame=temp_df,
    x="Year",
    y="change in emissions",
    color="change in emissions label",
    title="Change in CO2 emissions every decade",
)

fig.update_layout(legend_title_text="Change in emissions")

fig.show()

In [None]:
fig_name = "change-in-co2-emissions-every-decade"
save_figure(fig, fig_name)

The biggest increase in CO2 emissions was between the years `1970` and `1980`; during that time the total emissions increased by nearly 60 million k.t.

## Which countries emit the most carbon dioxide today:

The previous section showed how the carbon dioxide emissions were increasing rapidly, and how different the increase was between decades.

Here, in this section, we'll focus on which *countries* were emitting the most in the last decade (years `2010` to `2016`); then, in the next section, we'll look at the historical emissions and how they lead us to different conclusions.

**Note**: the data set contains only values up to year `2016`, that's why later years are missing from the analysis.

In [None]:
time_series_df.head()

In [None]:
# Average yearly CO2 emissions per country from 2010 onwards, largest
# emitters first.
temp_df = (
    time_series_df[time_series_df.Year >= 2010]
    .groupby(by="Country")
    # String alias instead of `np.mean`: passing NumPy callables to `agg` is
    # deprecated in recent pandas.
    .agg({"co2 emissions": "mean"})
    .reset_index()
    .sort_values(by="co2 emissions", ascending=False)
    # Countries with no emissions data in this period have a NaN mean and
    # are removed.
    .dropna()
)

In [None]:
temp_df.head()

In [None]:
fig = px.bar(
    data_frame=temp_df[:20],
    x="Country",
    y="co2 emissions",
    color="co2 emissions",
    color_continuous_scale=["#FAA307", "#DC2F02"],
    labels={"co2 emissions": "CO2 emissions"},
    title="Top 20 most emitter countries in the last decade",
)

fig.show()

In [None]:
fig_name = "top-20-most-emitter-countries-in-last-decade"
save_figure(fig, fig_name)

China is in the lead with almost 10 million k.t. of CO2 emissions; the U.S. is behind, emitting half of China's emissions.

India, Russia, and Japan each emit more than one million k.t., and the remaining 15 countries each emit less than one million k.t. of CO2.

Let's see the proportional percentages of CO2 emissions for each country alone:

In [None]:
# Emissions of the 11th-ranked country: everything at or below this value is
# outside the top 10.
threshold = temp_df["co2 emissions"].iloc[10]

In [None]:
# Countries emitting no more than the threshold are grouped under a single
# "Rest of the world" label.
outside_top_10 = temp_df["co2 emissions"] <= threshold
temp_df.loc[outside_top_10, "Country"] = "Rest of the world"

In [None]:
fig = px.pie(
    data_frame=temp_df,
    names="Country",
    values="co2 emissions",
    title="Emissions percentages for the top 10 countries in the last decade",
)

fig.show()

Mapping each country's emissions to percentages shows that China is not only emitting the most, it is emitting 30% of total CO2 emissions, followed by the U.S. with 15%.

This chart shows each country's share of emissions for *only* the top 10 countries. China's emissions are equal to more than 200 countries' emissions combined, which clearly makes China a major contributor to climate change.

All this analysis is based on the last decade, so we can't get the full picture of **who's emitting the most**. In the next section, we'll consider all the historical emissions, from year `1960` to year `2016`.

In [None]:
fig_name = "Emissions-percentages-for-the-top-10-countries-in-last-decade"
save_figure(fig, fig_name)

## Which countries emitted the most in total:

To get a full picture of the carbon dioxide emissions, we should analyze emissions data throughout history.

Looking at emissions throughout history will help us understand which countries contributed the most to climate change.

In [None]:
time_series_df.head()

In [None]:
# Total CO2 emissions per country over every year in the data, largest
# emitters first.
temp_df = time_series_df.groupby(by="Country")["co2 emissions"].sum().reset_index()
temp_df = temp_df.sort_values(by="co2 emissions", ascending=False)

In [None]:
fig = px.bar(
    data_frame=temp_df[:20],
    x="Country",
    y="co2 emissions",
    color="co2 emissions",
    color_continuous_scale=["#FAA307", "#DC2F02"],
    labels={"co2 emissions": "CO2 emissions"},
    title="Top 20 most emitter countries throughout history",
)

fig.show()

This chart shows that the `United States` emitted the most carbon dioxide with more than 250 million k.t., `China` in the second place with 180 million k.t.

We can see that many European countries are among the top emitters: `United Kingdom`, `France`, `Germany`, `Italy`, `Poland` and `Spain`.

Even if the European countries contributed only small amounts of CO2 emissions in the last decade, they have contributed a lot in the past, which makes them among those most responsible for climate change. The same goes for the `United States`, which has emitted a tremendously large amount of carbon dioxide.

In [None]:
# Kebab-case name without spaces, matching the other saved figures.
fig_name = "top-20-most-emitter-countries-throughout-history"
save_figure(fig, fig_name)

In [None]:
temp_df.head(11)

In [None]:
threshold = temp_df.iloc[10, 1]

In [None]:
temp_df.loc[lambda x: x["co2 emissions"] <= threshold, "Country"] = "Rest of the world"

In [None]:
fig = px.pie(
    data_frame=temp_df,
    values="co2 emissions",
    names="Country",
    title="Emissions percentages for the top 10 countries throughout history",
)

fig.show()

In [None]:
# This figure is the throughout-history pie chart, so it must not reuse the
# "last decade" name of the earlier chart.
fig_name = "Emissions-percentages-for-the-top-10-countries-throughout-history"
save_figure(fig, fig_name)

## Comparing countries' current and historical emissions:

In [None]:
# top_10_countries = temp_df[:10]["Country"].values

In [None]:
# Average yearly CO2 emissions per country from 2010 onwards, stored under
# the column name "today's emissions".
now_emissions_df = (
    time_series_df[time_series_df.Year >= 2010]
    .groupby(by="Country")
    # String alias instead of `np.mean`: passing NumPy callables to `agg` is
    # deprecated in recent pandas.
    .agg({"co2 emissions": "mean"})
    .reset_index()
    .sort_values(by="co2 emissions", ascending=False)
    .rename(columns={"co2 emissions": "today's emissions"})
    # Countries with no emissions data in this period have a NaN mean and
    # are removed.
    .dropna()
)

In [None]:
# Total CO2 emissions per country over every year in the data, stored under
# the column name "historical emissions".
historical_emissions_df = (
    time_series_df.groupby(by="Country")["co2 emissions"].sum().reset_index()
)
historical_emissions_df = historical_emissions_df.sort_values(
    by="co2 emissions", ascending=False
)
historical_emissions_df = historical_emissions_df.rename(
    columns={"co2 emissions": "historical emissions"}
)

In [None]:
# One row per country with both its recent average and its all-time total,
# ranked by the all-time total first.
temp_df = now_emissions_df.merge(right=historical_emissions_df, on="Country")
temp_df = temp_df.sort_values(
    by=["historical emissions", "today's emissions"], ascending=False
)

In [None]:
# Historical emissions of the 11th-ranked country: everything at or below
# this value is outside the top 10.
threshold = temp_df["historical emissions"].iloc[10]

In [None]:
# Countries whose historical total is no more than the threshold are grouped
# under a single "Rest of the world" label.
outside_top_10 = temp_df["historical emissions"] <= threshold
temp_df.loc[outside_top_10, "Country"] = "Rest of the world"

In [None]:
# Two pie charts side by side: historical totals on the left, recent
# averages on the right. Each pie is titled with the column it displays.
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "domain"}, {"type": "domain"}]])

pie_columns = ("historical emissions", "today's emissions")
for subplot_col, emissions_column in enumerate(pie_columns, start=1):
    fig.add_trace(
        go.Pie(
            labels=temp_df["Country"],
            values=temp_df[emissions_column],
            title=emissions_column,
        ),
        1,
        subplot_col,
    )

fig.update_layout(title_text="Total emissions: now and throughout history")

fig.show()

In [None]:
fig_name = 'today-and-historical-emissions'
save_figure(fig, fig_name)