# Initial EDA of the Dataset


## Plot the global Temperature Increase and CO2 Emission over the years

In [3]:
import pandas as pd
import plotly.express as px
import altair as alt
import numpy as np
from plotly.subplots import make_subplots

# alt.data_transformers.enable("vegafusion")
df = pd.read_csv("../data/raw/owid-co2-data.csv")
df.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,,,,,,,
1,Afghanistan,1851,AFG,3767956.0,,,,,,,...,,0.165,0.0,0.0,0.0,0.0,,,,
2,Afghanistan,1852,AFG,3783940.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
3,Afghanistan,1853,AFG,3800954.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
4,Afghanistan,1854,AFG,3818038.0,,,,,,,...,,0.163,0.0,0.0,0.0,0.0,,,,


In [3]:
# Group by year
df_year = (
    df.groupby("year")
    .aggregate({"co2": "sum", "temperature_change_from_co2": "mean"})
    .reset_index()
)
df_year.head()

Unnamed: 0,year,co2,temperature_change_from_co2
0,1750,55.836,
1,1751,56.442,
2,1752,57.03,
3,1753,57.66,
4,1754,58.404,


In [4]:
# px.line(df_year, x="year", y="co2", title="CO2 emissions over time")
alt.Chart(df_year).mark_line().encode(
    x=alt.X("year", title="Year"), y=alt.Y("co2", title="CO2 Emissions")
).properties(title="Global CO2 emissions over time", width=800, height=400)

In [5]:
alt.Chart(df_year).mark_line().encode(
    x=alt.X("year", title="Year"),
    y=alt.Y("temperature_change_from_co2", title="Temperature Change"),
).properties(title="Global temperature change over time", width=800, height=400)

In [6]:
df_year.shape[0]

273

In [15]:
co2_color = "red"
temp_color = "blue"

start_year = 1850
end_year = 2022
country_codes = None

if country_codes:
    df = df[df.iso_code.isin(country_codes)]
else:
    df = df[df.country == "World"]

df_year = (
    df.query(f"{start_year} <= year <= {end_year}")
    .groupby("year")
    .aggregate({"co2": "sum", "temperature_change_from_co2": "mean"})
    .reset_index()
    .dropna()
)

co2_line = px.line(
    df_year, x="year", y="co2", title="CO2 emissions over time", render_mode="webgl"
)
temp_line = px.line(
    df_year,
    x="year",
    y="temperature_change_from_co2",
    title="Temperature change over time",
    render_mode="webgl",
)
temp_line.update_traces(yaxis="y2")

subfig = make_subplots(specs=[[{"secondary_y": True}]])
subfig.add_traces(co2_line.data + temp_line.data)
subfig.layout.xaxis.title = "Year"
subfig.layout.yaxis.title = "CO2 emissions (kT)"
subfig.layout.yaxis2.title = "Temperature change (Â°C)"
subfig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

subfig.show()

In [8]:
df = pd.read_csv("../data/processed/co2-data.csv")
# df.head()
plot_global_temp_co2(df, None, 1850, 2022)

## World Map plot with interactive selection of countries

In [8]:
df.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,,,,,,,
1,Afghanistan,1851,AFG,3767956.0,,,,,,,...,,0.165,0.0,0.0,0.0,0.0,,,,
2,Afghanistan,1852,AFG,3783940.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
3,Afghanistan,1853,AFG,3800954.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
4,Afghanistan,1854,AFG,3818038.0,,,,,,,...,,0.163,0.0,0.0,0.0,0.0,,,,


In [9]:
start_year = 2000
end_year = 2022
country_codes = ["USA", "CHN", "IND", "RUS",
                 "DEU", "GBR", "FRA", "BRA", "IDN", "IRN"]

df_filtered = df.query(f"{start_year} <= year <= {end_year}")
df_filtered = df_filtered[df_filtered.iso_code.isin(country_codes)]

fig = px.choropleth(
    df_filtered,
    locations="iso_code",
    color="co2",
    color_continuous_scale="reds",
    labels={"co2": "CO2 Emissions"},
    hover_name="country",
    scope="world",
)
fig.update_layout(
    title_text="CO2 emissions by country",
    title_font_size=30,
    margin={"r": 0, "t": 55, "l": 0, "b": 0},
)
fig.update_geos(
    resolution=110,
    showcountries=True,
    showland=True,
    landcolor="lightgrey",
    countrycolor="darkgrey",
)
fig.show()

In [10]:
# Get dictionary of country names and codes
country_codes = df.groupby(["iso_code", "country"]).size().reset_index()
country_codes = (
    country_codes[["iso_code", "country"]].set_index("country")[
        "iso_code"].to_dict()
)
country_codes

{'Aruba': 'ABW',
 'Afghanistan': 'AFG',
 'Angola': 'AGO',
 'Anguilla': 'AIA',
 'Albania': 'ALB',
 'Andorra': 'AND',
 'United Arab Emirates': 'ARE',
 'Argentina': 'ARG',
 'Armenia': 'ARM',
 'Antarctica': 'ATA',
 'Antigua and Barbuda': 'ATG',
 'Australia': 'AUS',
 'Austria': 'AUT',
 'Azerbaijan': 'AZE',
 'Burundi': 'BDI',
 'Belgium': 'BEL',
 'Benin': 'BEN',
 'Bonaire Sint Eustatius and Saba': 'BES',
 'Burkina Faso': 'BFA',
 'Bangladesh': 'BGD',
 'Bulgaria': 'BGR',
 'Bahrain': 'BHR',
 'Bahamas': 'BHS',
 'Bosnia and Herzegovina': 'BIH',
 'Belarus': 'BLR',
 'Belize': 'BLZ',
 'Bermuda': 'BMU',
 'Bolivia': 'BOL',
 'Brazil': 'BRA',
 'Barbados': 'BRB',
 'Brunei': 'BRN',
 'Bhutan': 'BTN',
 'Botswana': 'BWA',
 'Central African Republic': 'CAF',
 'Canada': 'CAN',
 'Switzerland': 'CHE',
 'Chile': 'CHL',
 'China': 'CHN',
 "Cote d'Ivoire": 'CIV',
 'Cameroon': 'CMR',
 'Democratic Republic of Congo': 'COD',
 'Congo': 'COG',
 'Cook Islands': 'COK',
 'Colombia': 'COL',
 'Comoros': 'COM',
 'Cape Verde': '

## Plot of bar chart of the top 10 countries with highest CO2 Emission in the time period

In [11]:
start_year = 2000
end_year = 2022
country_codes = None

df_filtered = df.query(f"{start_year} <= year <= {end_year}")
if country_codes:
    df_filtered = df_filtered[df_filtered.iso_code.isin(country_codes)]
# else:
#     df_filtered = df_filtered[df_filtered.iso_code == "WORLD"]
df_sorted = (
    df_filtered.groupby("country")
    .sum()
    .sort_values("co2", ascending=False)
    .head(11)
    .reset_index()
)
# df_sorted.head().reset_index()

In [12]:
alt.Chart(df_sorted).mark_bar().encode(
    y=alt.Y("country", title="Country").sort("-x"),
    x=alt.X("co2", title="CO2 Emissions"),
    color=alt.Color("country", legend=None),
)

## View the names of the countries in the dataset

In [7]:
df = pd.read_csv("../data/processed/co2-data.csv")
df["iso_code"].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
48053    False
48054    False
48055    False
48056    False
48057    False
Name: iso_code, Length: 48058, dtype: bool

## Total Emission over the years

In [7]:
start_year = 2000
end_year = 2022
country_codes = None

df_filtered = df.query(f"{start_year} <= year <= {end_year}")
if country_codes:
    df_filtered = df_filtered[df_filtered.iso_code.isin(country_codes)]

total_co2 = df_filtered[["co2"]].sum().values[0]
print(f"Total CO2 emissions: {total_co2:,.0f} kT")

Total CO2 emissions: 4,862,430 kT


In [9]:
# Convert to how volume of CO2 released
# src = https://www.icbe.com/carbondatabase/CO2volumecalculation.asp
ton_to_volume = 556200  # m^3/kt of CO2

# src = https://www.esbnyc.com/sites/default/files/esb_fact_sheet_4_9_14_4.pdf
vol_of_empire_state_building = 1047723.3239  # m^3

number_of_empire_state_buildings = int(
    total_co2 * ton_to_volume / vol_of_empire_state_building
)

print(f"Number of Empire State Buildings: {number_of_empire_state_buildings:,}")

Number of Empire State Buildings: 2,581,295


In [10]:
total_co2 * 1e3 / 1.98 / vol_of_empire_state_building

2343.913609691865