# Initial EDA of the Dataset


## Plot the global Temperature Increase and CO2 Emission over the years

In [20]:
import pandas as pd
import plotly.express as px
import altair as alt
import numpy as np
from plotly.subplots import make_subplots
import sys
sys.path.append('..')

# Now you can import from src/hotspot
from src.hotspot_plot import *

# alt.data_transformers.enable("vegafusion")
df = pd.read_csv("../data/raw/owid-co2-data.csv")
df.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,,,,,,,
1,Afghanistan,1851,AFG,3767956.0,,,,,,,...,,0.165,0.0,0.0,0.0,0.0,,,,
2,Afghanistan,1852,AFG,3783940.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
3,Afghanistan,1853,AFG,3800954.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
4,Afghanistan,1854,AFG,3818038.0,,,,,,,...,,0.163,0.0,0.0,0.0,0.0,,,,


In [21]:
import os
print(os.getcwd())

/Users/lirt/Documents/UBC/532/DSCI-532_2024_20_hotspot/notebooks


In [22]:
# Group by year
df_year = (
    df.groupby("year")
    .aggregate({"co2": "sum", "temperature_change_from_co2": "mean"})
    .reset_index()
)
df_year.head()

Unnamed: 0,year,co2,temperature_change_from_co2
0,1750,55.836,
1,1751,56.442,
2,1752,57.03,
3,1753,57.66,
4,1754,58.404,


In [23]:
# px.line(df_year, x="year", y="co2", title="CO2 emissions over time")
alt.Chart(df_year).mark_line().encode(
    x=alt.X("year", title="Year"), y=alt.Y("co2", title="CO2 Emissions")
).properties(title="Global CO2 emissions over time", width=800, height=400)

In [24]:
alt.Chart(df_year).mark_line().encode(
    x=alt.X("year", title="Year"),
    y=alt.Y("temperature_change_from_co2", title="Temperature Change"),
).properties(title="Global temperature change over time", width=800, height=400)

In [25]:
df_year.shape[0]

273

In [26]:
co2_color = "red"
temp_color = "blue"

start_year = 1850
end_year = 2022
country_codes = None

if country_codes:
    df = df[df.iso_code.isin(country_codes)]
else:
    df = df[df.country == "World"]

df_year = (
    df.query(f"{start_year} <= year <= {end_year}")
    .groupby("year")
    .aggregate({"co2": "sum", "temperature_change_from_co2": "mean"})
    .reset_index()
    .dropna()
)

co2_line = px.line(
    df_year, x="year", y="co2", title="CO2 emissions over time", render_mode="webgl"
)
temp_line = px.line(
    df_year,
    x="year",
    y="temperature_change_from_co2",
    title="Temperature change over time",
    render_mode="webgl",
)
temp_line.update_traces(yaxis="y2")

subfig = make_subplots(specs=[[{"secondary_y": True}]])
subfig.add_traces(co2_line.data + temp_line.data)
subfig.layout.xaxis.title = "Year"
subfig.layout.yaxis.title = "CO2 emissions (kT)"
subfig.layout.yaxis2.title = "Temperature change (°C)"
subfig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

subfig.show()

In [27]:
df = pd.read_csv("../data/processed/co2-data.csv")
# df.head()
plot_global_temp_co2(df, 1900, 2022)

{'$schema': 'https://vega.github.io/schema/vega/v5.json',
 'data': [{'name': 'source_0',
   'values': [{'co2': 1952.21,
     'temperature_change_from_co2': 0.0003425925925925926,
     'year': 1900},
    {'co2': 2016.7450000000001,
     'temperature_change_from_co2': 0.0003564814814814815,
     'year': 1901},
    {'co2': 2067.683,
     'temperature_change_from_co2': 0.00037962962962962966,
     'year': 1902},
    {'co2': 2254.099,
     'temperature_change_from_co2': 0.0003935185185185185,
     'year': 1903},
    {'co2': 2279.75,
     'temperature_change_from_co2': 0.0004027777777777777,
     'year': 1904},
    {'co2': 2427.317,
     'temperature_change_from_co2': 0.0004259259259259259,
     'year': 1905},
    {'co2': 2534.733,
     'temperature_change_from_co2': 0.0004398148148148148,
     'year': 1906},
    {'co2': 2888.314,
     'temperature_change_from_co2': 0.000462962962962963,
     'year': 1907},
    {'co2': 2776.4410000000003,
     'temperature_change_from_co2': 0.000467592592592

## World Map plot with interactive selection of countries

In [28]:
df.head()

Unnamed: 0,country,iso_code,year,co2,temperature_change_from_co2,co2_per_capita
0,Afghanistan,AFG,1900,,0.0,
1,Afghanistan,AFG,1901,,0.0,
2,Afghanistan,AFG,1902,,0.0,
3,Afghanistan,AFG,1903,,0.0,
4,Afghanistan,AFG,1904,,0.0,


In [29]:
start_year = 2000
end_year = 2022
country_codes = ["USA", "CHN", "IND", "RUS",
                 "DEU", "GBR", "FRA", "BRA", "IDN", "IRN"]

df_filtered = df.query(f"{start_year} <= year <= {end_year}")
df_filtered = df_filtered[df_filtered.iso_code.isin(country_codes)]

fig = px.choropleth(
    df_filtered,
    locations="iso_code",
    color="co2",
    color_continuous_scale="reds",
    labels={"co2": "CO2 Emissions"},
    hover_name="country",
    scope="world",
)
fig.update_layout(
    title_text="CO2 emissions by country",
    title_font_size=30,
    margin={"r": 0, "t": 55, "l": 0, "b": 0},
)
fig.update_geos(
    resolution=110,
    showcountries=True,
    showland=True,
    landcolor="lightgrey",
    countrycolor="darkgrey",
)
fig.show()

In [30]:
# Get dictionary of country names and codes
country_codes = df.groupby(["iso_code", "country"]).size().reset_index()
country_codes = (
    country_codes[["iso_code", "country"]].set_index("country")[
        "iso_code"].to_dict()
)
country_codes

{'Aruba': 'ABW',
 'Afghanistan': 'AFG',
 'Angola': 'AGO',
 'Anguilla': 'AIA',
 'Albania': 'ALB',
 'Andorra': 'AND',
 'United Arab Emirates': 'ARE',
 'Argentina': 'ARG',
 'Armenia': 'ARM',
 'Antarctica': 'ATA',
 'Antigua and Barbuda': 'ATG',
 'Australia': 'AUS',
 'Austria': 'AUT',
 'Azerbaijan': 'AZE',
 'Burundi': 'BDI',
 'Belgium': 'BEL',
 'Benin': 'BEN',
 'Bonaire Sint Eustatius and Saba': 'BES',
 'Burkina Faso': 'BFA',
 'Bangladesh': 'BGD',
 'Bulgaria': 'BGR',
 'Bahrain': 'BHR',
 'Bahamas': 'BHS',
 'Bosnia and Herzegovina': 'BIH',
 'Belarus': 'BLR',
 'Belize': 'BLZ',
 'Bermuda': 'BMU',
 'Bolivia': 'BOL',
 'Brazil': 'BRA',
 'Barbados': 'BRB',
 'Brunei': 'BRN',
 'Bhutan': 'BTN',
 'Botswana': 'BWA',
 'Central African Republic': 'CAF',
 'Canada': 'CAN',
 'Switzerland': 'CHE',
 'Chile': 'CHL',
 'China': 'CHN',
 "Cote d'Ivoire": 'CIV',
 'Cameroon': 'CMR',
 'Democratic Republic of Congo': 'COD',
 'Congo': 'COG',
 'Cook Islands': 'COK',
 'Colombia': 'COL',
 'Comoros': 'COM',
 'Cape Verde': '

## Plot of bar chart of the top 10 countries with highest CO2 Emission in the time period

In [31]:
start_year = 2000
end_year = 2022
country_codes = None

df_filtered = df.query(f"{start_year} <= year <= {end_year}")
if country_codes:
    df_filtered = df_filtered[df_filtered.iso_code.isin(country_codes)]
# else:
#     df_filtered = df_filtered[df_filtered.iso_code == "WORLD"]
df_sorted = (
    df_filtered.groupby("country")
    .sum()
    .sort_values("co2", ascending=False)
    .head(11)
    .reset_index()
)
# df_sorted.head().reset_index()

In [32]:
alt.Chart(df_sorted).mark_bar().encode(
    y=alt.Y("country", title="Country").sort("-x"),
    x=alt.X("co2", title="CO2 Emissions"),
    color=alt.Color("country", legend=None),
)

In [33]:
df_sorted_co2_per_capita = (
    df_filtered.groupby("country")
    .sum()
    .sort_values("co2_per_capita", ascending=False)
    .head(11)
    .reset_index()
)

In [34]:
alt.Chart(df_sorted_co2_per_capita).mark_bar().encode(
    y=alt.Y("country", title="Country").sort("-x"),
    x=alt.X("co2_per_capita", title="CO2 Emissions per Capita"),
    color=alt.Color("country", legend=None),
)

## View the names of the countries in the dataset

In [35]:
df = pd.read_csv("../data/processed/co2-data.csv")
df["iso_code"].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
26929    False
26930    False
26931    False
26932    False
26933    False
Name: iso_code, Length: 26934, dtype: bool

## Total Emission over the years

In [36]:
start_year = 2000
end_year = 2022
country_codes = None

df_filtered = df.query(f"{start_year} <= year <= {end_year}")
if country_codes:
    df_filtered = df_filtered[df_filtered.iso_code.isin(country_codes)]

total_co2 = df_filtered[["co2"]].sum().values[0]
print(f"Total CO2 emissions: {total_co2:,.0f} kT")

Total CO2 emissions: 728,341 kT


In [37]:
# Convert to how volume of CO2 released
# src = https://www.icbe.com/carbondatabase/CO2volumecalculation.asp
ton_to_volume = 556200  # m^3/kt of CO2

# src = https://www.esbnyc.com/sites/default/files/esb_fact_sheet_4_9_14_4.pdf
vol_of_empire_state_building = 1047723.3239  # m^3

number_of_empire_state_buildings = int(
    total_co2 * ton_to_volume / vol_of_empire_state_building
)

print(f"Number of Empire State Buildings: {number_of_empire_state_buildings:,}")

Number of Empire State Buildings: 386,650


In [38]:
total_co2 * 1e3 / 1.98 / vol_of_empire_state_building

351.0935045008723

## Test the function

In [39]:
def filter_data(df, country_codes, start_year=1900, end_year=2022):
    """
    Filters the data based on the selected countries and year range.
    """
    if country_codes:
        df = df[df.iso_code.isin(country_codes)]
    return df.query(f"{start_year} <= year <= {end_year}")

In [40]:
df = pd.read_csv("../data/processed/co2-data.csv")
df_filtered = filter_data(
    df, ["USA", "CHN", "IND", "RUS", "DEU", "GBR", "FRA", "BRA", "IDN", "IRN"]
)
df_filtered.head()
# df_filtered = df_filtered.groupby(["iso_code"]).sum().reset_index()

Unnamed: 0,country,iso_code,year,co2,temperature_change_from_co2,co2_per_capita
3443,Brazil,BRA,1900,,0.001,
3444,Brazil,BRA,1901,2.103,0.001,0.114
3445,Brazil,BRA,1902,2.506,0.001,0.133
3446,Brazil,BRA,1903,2.44,0.001,0.127
3447,Brazil,BRA,1904,2.62,0.001,0.133


In [41]:
df_filtered.groupby(["iso_code"]).sum("co2").reset_index()

Unnamed: 0,iso_code,year,co2,temperature_change_from_co2,co2_per_capita
0,BRA,241203,17176.783,2.084,113.538
1,CHN,241203,260619.243,3.151,210.337
2,DEU,241203,87921.775,2.335,1175.671
3,FRA,241203,35816.506,0.908,711.947
4,GBR,241203,62504.298,2.41,1190.535
5,IDN,241203,15705.843,1.827,80.953
6,IND,241203,59637.621,1.314,59.744
7,IRN,241203,19852.75,0.234,368.487
8,RUS,241203,118859.749,4.102,881.414
9,USA,241203,416912.845,13.673,2101.363


In [42]:
df_filtered.groupby("iso_code", as_index=False).sum()

Unnamed: 0,iso_code,country,year,co2,temperature_change_from_co2,co2_per_capita
0,BRA,BrazilBrazilBrazilBrazilBrazilBrazilBrazilBraz...,241203,17176.783,2.084,113.538
1,CHN,ChinaChinaChinaChinaChinaChinaChinaChinaChinaC...,241203,260619.243,3.151,210.337
2,DEU,GermanyGermanyGermanyGermanyGermanyGermanyGerm...,241203,87921.775,2.335,1175.671
3,FRA,FranceFranceFranceFranceFranceFranceFranceFran...,241203,35816.506,0.908,711.947
4,GBR,United KingdomUnited KingdomUnited KingdomUnit...,241203,62504.298,2.41,1190.535
5,IDN,IndonesiaIndonesiaIndonesiaIndonesiaIndonesiaI...,241203,15705.843,1.827,80.953
6,IND,IndiaIndiaIndiaIndiaIndiaIndiaIndiaIndiaIndiaI...,241203,59637.621,1.314,59.744
7,IRN,IranIranIranIranIranIranIranIranIranIranIranIr...,241203,19852.75,0.234,368.487
8,RUS,RussiaRussiaRussiaRussiaRussiaRussiaRussiaRuss...,241203,118859.749,4.102,881.414
9,USA,United StatesUnited StatesUnited StatesUnited ...,241203,416912.845,13.673,2101.363
