In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw OWID emissions export into memory.
emissions_data = pd.read_csv(filepath_or_buffer="../data/raw/emissions_owid.csv")

In [3]:
emissions_data.dtypes

country                         object
year                             int64
iso_code                        object
population                     float64
gdp                            float64
                                ...   
temperature_change_from_n2o    float64
total_ghg                      float64
total_ghg_excluding_lucf       float64
trade_co2                      float64
trade_co2_share                float64
Length: 79, dtype: object

In [4]:
# Restrict the wide raw table to the identifier, scale and intensity columns
# that the rest of the notebook works with.
selected_emission_columns = [
    "country", "year", "iso_code", "population", "gdp",
    "co2_per_capita", "co2_per_gdp", "coal_co2_per_capita",
    "energy_per_capita", "energy_per_gdp", "gas_co2_per_capita",
    "land_use_change_co2_per_capita", "oil_co2_per_capita",
]
emissions_data_cleaned = emissions_data[selected_emission_columns]

In [5]:
# Expose the ISO code under the name "country_code".
iso_code_rename = {"iso_code": "country_code"}
emissions_data_cleaned = emissions_data_cleaned.rename(columns=iso_code_rename)

In [153]:
emissions_data_cleaned["country_code"].isna().sum()

0

In [7]:
# Keep only rows that carry a country code.
has_country_code = emissions_data_cleaned["country_code"].notna()
emissions_data_cleaned = emissions_data_cleaned[has_country_code]

In [8]:
# Previous row's land-use-change emissions within each country
# (a one-year lag provided rows are ordered by year — TODO confirm the raw file's ordering).
land_use_by_country = emissions_data_cleaned.groupby("country")["land_use_change_co2_per_capita"]
emissions_data_cleaned["land_use_change_co2_per_capita_lag"] = land_use_by_country.shift(1)

In [9]:
emissions_data_cleaned[['gdp', 'co2_per_capita', 'co2_per_gdp']].isna().sum()

gdp               27032
co2_per_capita    19491
co2_per_gdp       27590
dtype: int64

In [10]:
def _fill_population_gaps(country_population):
    """Forward-fill, then back-fill, one country's population series."""
    return country_population.ffill().bfill()


emissions_data_cleaned["population"] = (
    emissions_data_cleaned
    .groupby("country")["population"]
    .transform(_fill_population_gaps)
)

In [11]:
# Countries with no population value at all are still NaN after the fill; drop them.
has_population = emissions_data_cleaned["population"].notna()
emissions_data_cleaned = emissions_data_cleaned[has_population]

In [12]:
# Work on an independent copy so the column assignment below does not target
# a filtered slice of the previous frame.
emissions_data_cleaned = emissions_data_cleaned.copy()
# Cast to an explicit 64-bit integer: plain `int` resolves to the platform's
# default integer, which can be 32-bit and too narrow for large populations.
emissions_data_cleaned["population"] = emissions_data_cleaned["population"].astype("int64")

In [13]:
# Carbon Tax is more relevant past 1990, so keep 1990 through 2022 (both inclusive).
in_study_window = emissions_data_cleaned["year"].between(1990, 2022)
emissions_data_cleaned = emissions_data_cleaned[in_study_window]

In [14]:
emissions_data_cleaned[emissions_data_cleaned["gdp"].isna()]["country"].unique()

array(['Andorra', 'Anguilla', 'Antigua and Barbuda', 'Aruba', 'Bahamas',
       'Belize', 'Bermuda', 'Bhutan', 'Bonaire Sint Eustatius and Saba',
       'British Virgin Islands', 'Brunei', 'Cook Islands', 'Curacao',
       'East Timor', 'Eritrea', 'Faroe Islands', 'Fiji',
       'French Polynesia', 'Greenland', 'Grenada', 'Guyana', 'Kiribati',
       'Liechtenstein', 'Macao', 'Maldives', 'Marshall Islands',
       'Micronesia (country)', 'Monaco', 'Montserrat', 'Nauru',
       'New Caledonia', 'Niue', 'Palau', 'Papua New Guinea',
       'Saint Helena', 'Saint Kitts and Nevis',
       'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines',
       'Samoa', 'San Marino', 'Sint Maarten (Dutch part)',
       'Solomon Islands', 'Somalia', 'South Sudan', 'Sudan', 'Suriname',
       'Tonga', 'Turks and Caicos Islands', 'Tuvalu',
       'United Arab Emirates', 'Vanuatu', 'Vatican', 'Wallis and Futuna'],
      dtype=object)

In [15]:
# Interpolate 1991 and 1992 ARE gdp from WB Data.
# Each entry is (numerator, denominator) of the scaling ratio for a year where
# both figures are known (presumably OWID GDP over World Bank GDP — TODO confirm
# the source of these literals).
uae_known_gdp_pairs = {
    1990: (40641368064, 165048348648.546),
    1993: (55149662208, 188567499474.299),
    1994: (64654417920, 205875343585.347),
    1995: (75243839488, 224249304379.684),
    1996: (87750959104, 241595714092.327),
}
years = list(uae_known_gdp_pairs)
scaling_factors = [numerator / denominator for numerator, denominator in uae_known_gdp_pairs.values()]
(uae_scaling1990, uae_scaling1993, uae_scaling1994,
 uae_scaling1995, uae_scaling1996) = scaling_factors

# Straight-line least-squares fit of the scaling factor against the year;
# the fitted line is then evaluated at the two missing years.
equation = np.polyfit(years, scaling_factors, 1)
slope, intercept = equation[0], equation[1]
uae_scaling1991 = slope * 1991 + intercept
uae_scaling1992 = slope * 1992 + intercept

In [16]:
# Gap-year GDP figures (WB, per the variable names) multiplied by the fitted
# scaling factors to obtain estimates on the OWID scale.
WB_uae_1991, WB_uae_1992 = 172097530003.326, 181907133314.7
owid_uae_1991 = uae_scaling1991 * WB_uae_1991
owid_uae_1992 = uae_scaling1992 * WB_uae_1992

In [17]:
# Detach the frame from the filtered selections it was built from
# (presumably to avoid chained-assignment warnings on later writes).
emissions_data_cleaned = emissions_data_cleaned.copy()

In [18]:
# Write the two estimated GDP values into the ARE rows for 1991 and 1992.
emissions_data_cleaned = emissions_data_cleaned.copy()
is_uae = emissions_data_cleaned["country_code"] == "ARE"
for gap_year, estimated_gdp in ((1991, owid_uae_1991), (1992, owid_uae_1992)):
    target_rows = is_uae & (emissions_data_cleaned["year"] == gap_year)
    emissions_data_cleaned.loc[target_rows, "gdp"] = estimated_gdp


In [19]:
# Discard every country-year that still has no GDP figure.
emissions_data_cleaned = emissions_data_cleaned.dropna(subset=["gdp"])

In [20]:
# Discard every country-year without a per-capita CO2 value.
emissions_data_cleaned = emissions_data_cleaned.dropna(subset=["co2_per_capita"])

In [21]:
# Force gdp to a numeric dtype; anything unparseable becomes NaN.
numeric_gdp = pd.to_numeric(emissions_data_cleaned["gdp"], errors="coerce")
emissions_data_cleaned = emissions_data_cleaned.assign(gdp=numeric_gdp)

In [22]:
emissions_data_cleaned[[
    "coal_co2_per_capita", "energy_per_capita", "energy_per_gdp", "gas_co2_per_capita", 
    "land_use_change_co2_per_capita_lag", "oil_co2_per_capita"]].isna().sum()

coal_co2_per_capita                   1293
energy_per_capita                      136
energy_per_gdp                         138
gas_co2_per_capita                    1719
land_use_change_co2_per_capita_lag      66
oil_co2_per_capita                       0
dtype: int64

In [23]:
# Rows lacking a coal figure, and how many such rows each country has.
coal_is_missing = emissions_data_cleaned["coal_co2_per_capita"].isna()
missing_coal = emissions_data_cleaned[coal_is_missing]
missing_by_country = missing_coal.groupby("country").size()

In [24]:
# Examining data and external energy reports and emission profiles, these countries have minimal coal usage.
# Empty coal emission cells can be filled with 0.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, which
# pandas warns about and which leaves the frame unchanged under copy-on-write.
emissions_data_cleaned["coal_co2_per_capita"] = emissions_data_cleaned["coal_co2_per_capita"].fillna(0)

In [25]:
# Replace missing land-use-change values (lagged and current) with 0.
# The result is assigned back explicitly; fillna(inplace=True) on a selected
# column is a chained assignment that pandas warns about and that does not
# update the frame under copy-on-write.
for land_use_column in ("land_use_change_co2_per_capita_lag", "land_use_change_co2_per_capita"):
    emissions_data_cleaned[land_use_column] = emissions_data_cleaned[land_use_column].fillna(0)

In [26]:
pd.set_option('display.max_rows', None)
missing_energy = emissions_data_cleaned[emissions_data_cleaned['energy_per_capita'].isna()]
missing_energy.groupby('country').size()

country
Afghanistan                      1
Albania                          1
Angola                           1
Armenia                          3
Bahrain                          1
Barbados                         1
Benin                            1
Bolivia                          1
Bosnia and Herzegovina           3
Botswana                         1
Burkina Faso                     1
Burundi                          1
Cambodia                         1
Cameroon                         1
Cape Verde                       1
Central African Republic         1
Chad                             1
Comoros                          1
Congo                            1
Costa Rica                       1
Cote d'Ivoire                    1
Cuba                             1
Democratic Republic of Congo     1
Djibouti                         1
Dominica                         1
Dominican Republic               1
El Salvador                      1
Equatorial Guinea                1
Eswatini    

In [27]:
pd.reset_option('display.max_rows')

In [28]:
# Remove Montenegro and Serbia rows dated before 2006; every other row is kept.
is_montenegro_or_serbia = emissions_data_cleaned["country"].isin(["Montenegro", "Serbia"])
from_2006_onward = emissions_data_cleaned["year"] >= 2006
emissions_data_cleaned = emissions_data_cleaned[~is_montenegro_or_serbia | from_2006_onward]

In [29]:
def _interpolate_within_country(values):
    """Fill gaps in one country's series with Series.interpolate() defaults."""
    return values.interpolate()


# Apply the same per-country gap filling to both energy-intensity columns.
for energy_column in ("energy_per_capita", "energy_per_gdp"):
    emissions_data_cleaned.loc[:, energy_column] = (
        emissions_data_cleaned
        .groupby("country")[energy_column]
        .transform(_interpolate_within_country)
    )


In [30]:
# Take an independent copy before adding further columns
# (presumably to avoid chained-assignment warnings on the filtered frame).
emissions_data_cleaned = emissions_data_cleaned.copy()

In [31]:
# Treat a missing gas emission value as zero.
emissions_data_cleaned = emissions_data_cleaned.fillna({"gas_co2_per_capita": 0})

In [32]:
# Look the offset values up by calendar year rather than by row position.
# groupby(...).shift(5) moves five ROWS, which equals five YEARS only when a
# country has a row for every consecutive year; rows with missing gdp or
# co2_per_capita were dropped earlier, so gaps are possible.
assert not emissions_data_cleaned.duplicated(["country", "year"]).any(), \
    "expected exactly one row per (country, year)"


def _value_at_year_offset(frame, column, offset):
    """Return `column` of the same country at `year + offset` for every row.

    Rows whose target (country, year) is not present in `frame` get NaN.
    Requires (country, year) to be unique in `frame`.
    """
    by_country_year = frame.set_index(["country", "year"])[column]
    wanted = pd.MultiIndex.from_arrays(
        [frame["country"], frame["year"] + offset], names=["country", "year"]
    )
    # to_numpy() drops the lookup index so the values are assigned positionally
    return by_country_year.reindex(wanted).to_numpy()


for _metric in ("co2_per_capita", "co2_per_gdp"):
    emissions_data_cleaned[f"{_metric}_minus_5"] = _value_at_year_offset(emissions_data_cleaned, _metric, -5)
    emissions_data_cleaned[f"{_metric}_plus_5"] = _value_at_year_offset(emissions_data_cleaned, _metric, 5)

In [33]:
# Average change per step over the five-step window before and after each row,
# for both CO2 intensity metrics.
for metric in ("co2_per_capita", "co2_per_gdp"):
    current_value = emissions_data_cleaned[metric]
    earlier_value = emissions_data_cleaned[f"{metric}_minus_5"]
    later_value = emissions_data_cleaned[f"{metric}_plus_5"]
    emissions_data_cleaned[f"{metric}_prior_trend"] = (current_value - earlier_value) / 5
    emissions_data_cleaned[f"{metric}_future_trend"] = (later_value - current_value) / 5

In [35]:
# Persist the cleaned emissions table (no index column, floats with six decimals).
emissions_data_cleaned.to_csv(
    "../data/cleaned/emissions_owid_cleaned.csv",
    float_format='%.6f',
    index=False,
)

In [36]:
# Both World Bank exports take their column names from the second line of the file.
general_data, price_data = (
    pd.read_csv(raw_path, header=1)
    for raw_path in ("../data/raw/carbon_tax_wb_general.csv", "../data/raw/carbon_tax_wb_prices.csv")
)

In [39]:
general_data.columns

Index(['Unique ID', 'Instrument name', 'Type', 'Status',
       'Jurisdiction covered', 'Share of jurisdiction emissions covered',
       'Price on 1 April', '2020', '2021', '2022', '2023', '2024', '2025',
       'Change', 'Government revenue', '2019', '2020.1', '2021.1', '2022.1',
       '2023.1', '2024.1', 'Change.1', 'Gases covered', 'Electricity and heat',
       'Industry', 'Mining and extractives', 'Transport', 'Aviation',
       'Buildings', 'Agriculture, forestry and fishing fuel use',
       'Agricultural emissions', 'Waste', 'LULUCF', 'Fuels covered',
       'Allocation approaches', 'Price or market management',
       'Point of Regulation', 'Offset eligibility', 'Description',
       'Recent developments', 'Coverage', 'Pricing and allocation',
       'Compliance', 'Relation to other instruments'],
      dtype='object')

In [79]:
# Only the instrument id, its type, its status and the jurisdiction are needed.
general_columns_kept = ["Unique ID", "Type", "Status", "Jurisdiction covered"]
general_data_cleaned = general_data[general_columns_kept]

In [80]:
# Short, lowercase column names that line up with the emissions table.
general_column_names = {
    "Unique ID": "ID",
    "Type": "type",
    "Status": "status",
    "Jurisdiction covered": "country",
}
general_data_cleaned = general_data_cleaned.rename(columns=general_column_names)

In [85]:
general_data_cleaned

Unnamed: 0,ID,type,status,country
0,Tax_AL,Carbon tax,Implemented,Albania
5,Tax_AR,Carbon tax,Implemented,Argentina
7,ETS_AU1,ETS,Abolished,Australia
8,ETS_AU2,ETS,Implemented,Australia
9,ETS_AT,ETS,Implemented,Austria
20,Tax_CA,Carbon tax,Abolished,Canada
21,ETS_CA,ETS,Implemented,Canada
23,Tax_CL,Carbon tax,Implemented,Chile
25,ETS_CN,ETS,Implemented,China
28,Tax_CO,Carbon tax,Implemented,Colombia


In [82]:
# Keep instruments that are, or once were, in force.
status_is_relevant = general_data_cleaned["status"].isin(["Implemented", "Abolished"])
general_data_cleaned = general_data_cleaned[status_is_relevant]

In [83]:
# Countries present in the cleaned emissions data, plus the "EU" jurisdiction label.
valid_countries = set(emissions_data_cleaned["country"].unique()) | {"EU"}

In [84]:
# Drop instruments whose jurisdiction is not one of the accepted country names.
jurisdiction_is_known = general_data_cleaned["country"].isin(valid_countries)
general_data_cleaned = general_data_cleaned[jurisdiction_is_known]

In [89]:
# Use the same key column name ("ID") as the general table, then list the columns.
price_id_rename = {"Unique ID": "ID"}
price_data = price_data.rename(columns=price_id_rename)
price_data.columns

Index(['ID', 'Name of the initiative', 'Instrument Type', 'Region',
       'Income group', 'Metric', '1990', '1991', '1992', '1993', '1994',
       '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003',
       '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022', '2023', '2024', '2025'],
      dtype='object')

In [90]:
# Remove the descriptive columns so only the ID and the yearly columns remain.
price_metadata_columns = ["Name of the initiative", "Instrument Type", "Region", "Income group", "Metric"]
price_data_cleaned = price_data.drop(columns=price_metadata_columns)

In [96]:
# A missing price is read as "no tax in that year" and stored as 0.
price_data_cleaned = price_data_cleaned.fillna(value=0)

In [97]:
# Give the join key the same string type in both tables before merging.
for tax_table in (general_data_cleaned, price_data_cleaned):
    tax_table["ID"] = tax_table["ID"].astype(str)

In [102]:
# Attach the yearly prices to each instrument; instruments without a price row are kept.
carbon_tax_combined_cleaned = general_data_cleaned.merge(
    price_data_cleaned, how="left", on="ID"
)

In [103]:
# Save the merged instrument/price table (no index column, floats with six decimals).
carbon_tax_combined_cleaned.to_csv(
    "../data/cleaned/carbon_tax_wb_cleaned.csv",
    float_format='%.6f',
    index=False,
)

In [101]:
# Countries that each receive a copy of the "EU"-level instrument rows.
# NOTE(review): names must match the emissions table's "country" spelling exactly
# for the later merge (e.g. verify whether it uses "Czech Republic" or "Czechia").
# NOTE(review): the list is static, so accession dates are not reflected — confirm
# that is acceptable for the analysis.
eu_countries = [
    "Austria",
    "Belgium",
    "Bulgaria",
    "Croatia",
    "Cyprus",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "Finland",
    "France",
    "Germany",
    "Greece",
    "Hungary",
    "Ireland",
    "Italy",
    "Latvia",
    "Lithuania",
    "Luxembourg",
    "Malta",
    "Netherlands",
    "Poland",
    "Portugal",
    "Romania",
    "Slovakia",
    "Slovenia",
    "Spain",
    "Sweden",
]

In [110]:
# Separate the instruments registered under the "EU" label from all others.
is_eu_wide = carbon_tax_combined_cleaned["country"] == "EU"
eu_rows = carbon_tax_combined_cleaned[is_eu_wide]
non_eu_rows = carbon_tax_combined_cleaned[~is_eu_wide]

In [111]:
# Duplicate the EU-level rows once per listed country, relabelling "country" each time.
per_member_copies = []
for member_state in eu_countries:
    per_member_copies.append(eu_rows.assign(country=member_state))
expanded_eu_rows = pd.concat(per_member_copies, ignore_index=True)

In [112]:
expanded_eu_rows

Unnamed: 0,ID,type,status,country,1990,1991,1992,1993,1994,1995,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,ETS_EU,ETS,Implemented,Austria,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
1,ETS_EU,ETS,Implemented,Belgium,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
2,ETS_EU,ETS,Implemented,Bulgaria,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
3,ETS_EU,ETS,Implemented,Croatia,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
4,ETS_EU,ETS,Implemented,Cyprus,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
5,ETS_EU,ETS,Implemented,Czech Republic,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
6,ETS_EU,ETS,Implemented,Denmark,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
7,ETS_EU,ETS,Implemented,Estonia,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
8,ETS_EU,ETS,Implemented,Finland,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124
9,ETS_EU,ETS,Implemented,France,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.26372,24.505716,18.53652,49.779548,86.526108,96.298125,61.301547,70.370124


In [113]:
# Stack the national instruments with the per-country copies of the EU ones.
carbon_tax_parts = [non_eu_rows, expanded_eu_rows]
carbon_tax_combined_cleaned = pd.concat(carbon_tax_parts, ignore_index=True)

In [114]:
carbon_tax_combined_cleaned

Unnamed: 0,ID,type,status,country,1990,1991,1992,1993,1994,1995,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Tax_AL,Carbon tax,Implemented,Albania,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,13.111355,13.691905
1,Tax_AR,Carbon tax,Implemented,Argentina,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,8.914348,6.187569,6.342015,5.321599,4.995626,3.218257,0.811554,5.327915
2,ETS_AU1,ETS,Abolished,Australia,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ETS_AU2,ETS,Implemented,Australia,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,21.903750,21.816120
4,ETS_AT,ETS,Implemented,Austria,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,35.343750,48.370500,48.546000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,ETS_EU,ETS,Implemented,Romania,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.263720,24.505716,18.536520,49.779548,86.526108,96.298125,61.301547,70.370124
66,ETS_EU,ETS,Implemented,Slovakia,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.263720,24.505716,18.536520,49.779548,86.526108,96.298125,61.301547,70.370124
67,ETS_EU,ETS,Implemented,Slovenia,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.263720,24.505716,18.536520,49.779548,86.526108,96.298125,61.301547,70.370124
68,ETS_EU,ETS,Implemented,Spain,0.0,0.0,0.0,0.0,0.0,0.0,...,5.921776,5.644848,16.263720,24.505716,18.536520,49.779548,86.526108,96.298125,61.301547,70.370124


In [115]:
# Reshape to one row per (country, type, year). Only the year-named columns are
# unpivoted: without value_vars, melt also pushes the "ID" and "status" strings
# into the value column, which forces tax_price to object dtype and creates
# rows whose "year" is not a year.
year_columns = [
    column for column in carbon_tax_combined_cleaned.columns if str(column).isdigit()
]
carbon_tax_long = carbon_tax_combined_cleaned.melt(
    id_vars=["country", "type"],
    value_vars=year_columns,
    var_name="year",
    value_name="tax_price")

In [120]:
pd.set_option('display.max_rows', None)

In [125]:
# Discard rows that came from the "ID" / "status" columns rather than from a year column.
from_non_year_column = carbon_tax_long["year"].isin(["ID", "status"])
carbon_tax_long = carbon_tax_long[~from_non_year_column]

In [128]:
# Independent copy of the filtered frame so later column assignments act on it directly.
carbon_tax_long = carbon_tax_long.copy()

In [134]:
carbon_tax_long["year"] = carbon_tax_long["year"].astype(int)
# Convert the prices first and keep the converted column. Previously the
# to_numeric result was used only as a row filter, so the `> 0` comparison
# still ran on the unconverted column and would fail on numeric strings.
carbon_tax_long["tax_price"] = pd.to_numeric(carbon_tax_long["tax_price"], errors="coerce")
# NaN > 0 is False, so unparseable prices are dropped together with zero prices.
# copy() detaches the result from the filtered selection.
carbon_tax_long = carbon_tax_long[carbon_tax_long["tax_price"] > 0].copy()

In [135]:
def get_tax_type(types):
    """Encode which instrument types occur in `types` as an integer.

    Returns 1 when only "Carbon tax" occurs, 2 when only "ETS" occurs,
    3 when exactly both occur, and 0 for anything else (including no types).
    Duplicates in `types` are ignored.
    """
    code_by_combination = {
        frozenset({"Carbon tax"}): 1,
        frozenset({"ETS"}): 2,
        frozenset({"Carbon tax", "ETS"}): 3,
    }
    return code_by_combination.get(frozenset(types), 0)

In [136]:
# Collapse to one row per country-year: the highest price among the
# instruments in that year, and a code for the mix of instrument types.
country_year_groups = carbon_tax_long.groupby(["country", "year"])
carbon_tax_long_cleaned = country_year_groups.agg(
    tax_price=pd.NamedAgg(column="tax_price", aggfunc="max"),
    tax_type=pd.NamedAgg(column="type", aggfunc=get_tax_type),
).reset_index()

In [138]:
# Save the per-country-year tax table (no index column, floats with six decimals).
carbon_tax_long_cleaned.to_csv(
    "../data/cleaned/carbon_tax_wb_yearly_cleaned.csv",
    float_format='%.6f',
    index=False,
)

In [139]:
# Left join: every emissions row is kept; country-years without a tax get NaN.
combined_data = pd.merge(
    emissions_data_cleaned,
    carbon_tax_long_cleaned,
    how="left",
    on=["country", "year"],
)

In [141]:
# Country-years with no matching tax row: type code 0 and price 0.
combined_data = combined_data.fillna({"tax_type": 0, "tax_price": 0})
combined_data["tax_type"] = combined_data["tax_type"].astype(int)

In [143]:
def compute_years_since_tax(group):
    """Return a copy of `group` with a `years_since_tax` column added.

    `group` needs the columns "year" and "tax_type". The first year in which
    `tax_type` is greater than 0 counts as year 0; later rows hold the number
    of years elapsed since then, earlier rows hold -1. If `tax_type` is never
    greater than 0, every row holds -1.

    The input frame is not modified: pandas does not support functions that
    mutate the object handed to them by GroupBy.apply.
    """
    result = group.copy()
    active = result["tax_type"] > 0
    if not active.any():
        result["years_since_tax"] = -1
        return result

    first_year = result.loc[active, "year"].min()
    elapsed = result["year"] - first_year
    # Years before the first active year are flagged with -1 instead of a negative count
    result["years_since_tax"] = elapsed.where(elapsed >= 0, -1)
    return result

In [144]:
# Run compute_years_since_tax once per country and stitch the pieces back together.
# The groups are iterated explicitly instead of going through GroupBy.apply, whose
# output depends on the pandas version (group keys added as an extra "country"
# index level, and grouping columns deprecated and later excluded from the frame
# passed to the function). This keeps the flat row index and the "country" column.
combined_data = pd.concat(
    [
        compute_years_since_tax(country_rows)
        for _, country_rows in combined_data.groupby("country", sort=False)
    ]
).sort_index()  # restore the original row order

In [145]:
# Save the merged emissions + carbon-tax table (no index column, floats with six decimals).
combined_data.to_csv(
    "../data/cleaned/combined_data.csv",
    float_format="%.6f",
    index=False,
)

In [149]:
# Read the raw OWID energy export into memory.
energy_data = pd.read_csv(filepath_or_buffer="../data/raw/energy_owid.csv")

In [150]:
# Identifier columns plus the share of each energy source in the mix.
energy_columns_kept = [
    "country", "year", "iso_code",
    "coal_share_energy", "gas_share_energy", "nuclear_share_energy",
    "oil_share_energy", "renewables_share_energy",
]
energy_data_cleaned = energy_data[energy_columns_kept]

In [155]:
# Same key name as the emissions table: "country_code".
energy_iso_rename = {"iso_code": "country_code"}
energy_data_cleaned = energy_data_cleaned.rename(columns=energy_iso_rename)

In [156]:
# Keep only rows that carry a country code.
energy_has_code = energy_data_cleaned["country_code"].notna()
energy_data_cleaned = energy_data_cleaned[energy_has_code]

In [157]:
# Same study window as the emissions data: 1990 through 2022, both inclusive.
energy_in_window = energy_data_cleaned["year"].between(1990, 2022)
energy_data_cleaned = energy_data_cleaned[energy_in_window]

In [161]:
energy_data_cleaned[['coal_share_energy', 'nuclear_share_energy', 'oil_share_energy', "gas_share_energy", "renewables_share_energy"]].isna().sum()

coal_share_energy          3125
nuclear_share_energy       3125
oil_share_energy           3125
gas_share_energy           3125
renewables_share_energy    3125
dtype: int64

In [160]:
# Restrict the energy table to the accepted country names.
energy_country_is_known = energy_data_cleaned["country"].isin(valid_countries)
energy_data_cleaned = energy_data_cleaned[energy_country_is_known]

In [162]:
# Save the cleaned energy table (no index column, floats with six decimals).
energy_data_cleaned.to_csv(
    "../data/cleaned/energy_data_cleaned.csv",
    float_format="%.6f",
    index=False,
)

In [169]:
pd.set_option('display.max_rows', None)
missing_coal = energy_data_cleaned[energy_data_cleaned['coal_share_energy'].isna()]
missing_coal.groupby('country').size()

country
Afghanistan                     33
Albania                         33
Angola                          33
Armenia                         31
Bahrain                         33
Barbados                        33
Benin                           33
Bolivia                         33
Bosnia and Herzegovina          31
Botswana                        33
Burkina Faso                    33
Burundi                         33
Cambodia                        33
Cameroon                        33
Cape Verde                      33
Central African Republic        33
Chad                            33
Comoros                         33
Congo                           33
Costa Rica                      33
Cote d'Ivoire                   33
Cuba                            33
Cyprus                          33
Democratic Republic of Congo    33
Djibouti                        33
Dominica                        33
Dominican Republic              33
El Salvador                     33
Equatorial G

In [171]:
# Flag (1/0) rows where at least one energy-share column is missing.
# The integer flag is stored on energy_data_cleaned itself. It used to be written
# to the raw `energy_data` frame, where index alignment left NaN for every row
# filtered out of the cleaned table, while the cleaned table kept a boolean flag.
energy_share_columns = [
    "coal_share_energy", "nuclear_share_energy", "oil_share_energy",
    "gas_share_energy", "renewables_share_energy",
]
energy_data_cleaned = energy_data_cleaned.assign(
    energy_mix_filled=energy_data_cleaned[energy_share_columns].isna().any(axis=1).astype(int)
)