<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#-Read-the-file-" data-toc-modified-id="-Read-the-file--1"><span class="toc-item-num">1&nbsp;&nbsp;</span> Read the file </a></span></li><li><span><a href="#Mapping" data-toc-modified-id="Mapping-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Mapping</a></span><ul class="toc-item"><li><span><a href="#Pre-processing" data-toc-modified-id="Pre-processing-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Pre-processing</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import math
import os

<h2> Read the file </h2>

In [8]:
# Load the OWID emissions table (territorial emissions) from the local data folder.
# NOTE(review): the remote alternative below points at a GitHub "tree" page; it presumably
# needs the raw-file URL before pandas can parse it — confirm before re-enabling.
#df = pd.read_csv("https://github.com/OpenGeoScales/ogs-data-exploration/tree/main/data/ghg-emissions/owid/owid-co2-data.csv", sep = ";")
path = os.getcwd()
subfolder = '/../data/'
# The local copy is semicolon-separated
df = pd.read_csv(f"{path}{subfolder}owid-co2-data.csv", sep=";")

In [9]:
# Preview the first rows to check that the file was parsed as expected
df.head()

Unnamed: 0,iso_code,country,year,co2,co2_growth_prct,co2_growth_abs,consumption_co2,trade_co2,trade_co2_share,co2_per_capita,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,primary_energy_consumption,energy_per_capita,energy_per_gdp,population,gdp
0,AFG,Afghanistan,1949,0.015,,,,,,0.002,...,,,,,,,,,7663783.0,
1,AFG,Afghanistan,1950,0.084,475.0,0.07,,,,0.011,...,,,,,,,,,7752000.0,19494800000.0
2,AFG,Afghanistan,1951,0.092,8.696,0.007,,,,0.012,...,,,,,,,,,7840000.0,20063850000.0
3,AFG,Afghanistan,1952,0.092,,,,,,0.012,...,,,,,,,,,7936000.0,20742350000.0
4,AFG,Afghanistan,1953,0.106,16.0,0.015,,,,0.013,...,,,,,,,,,8040000.0,22015460000.0


In [10]:
# Order the rows by year, then country, and use that pair as the (multi-)index
df = (
    df
    .sort_values(["year", "country"])
    .set_index(["year", "country"])
)

# Values are expressed in MtCO2

# Show the last rows of the re-indexed table
df.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,iso_code,co2,co2_growth_prct,co2_growth_abs,consumption_co2,trade_co2,trade_co2_share,co2_per_capita,consumption_co2_per_capita,share_global_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,primary_energy_consumption,energy_per_capita,energy_per_gdp,population,gdp
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019,Yemen,YEM,10.255,3.115,0.31,,,,0.352,,0.028,...,,,,,,,,,29162000.0,
2019,Zambia,ZMB,6.72,-3.025,-0.21,,,,0.376,,0.018,...,,,,,,,,,17861000.0,
2019,Zimbabwe,ZWE,10.374,-8.521,-0.966,,,,0.708,,0.028,...,,,,,,,,,14645000.0,


In [11]:
# Keep only the ISO code and the emission series that go into the JSON file
df = df.loc[:, [
    "iso_code",
    "co2", "trade_co2", "consumption_co2",
    "cement_co2", "coal_co2", "flaring_co2",
    "gas_co2", "oil_co2", "other_industry_co2",
    "methane", "nitrous_oxide", "total_ghg",
]]
df

Unnamed: 0_level_0,Unnamed: 1_level_0,iso_code,co2,trade_co2,consumption_co2,cement_co2,coal_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,methane,nitrous_oxide,total_ghg
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1750,EU-28,,9.351,,,,9.351,,,,,,,
1750,Europe,,9.351,,,,9.351,,,,,,,
1750,Europe (excl. EU-27),,9.351,,,,9.351,,,,,,,
1750,United Kingdom,GBR,9.351,,,,9.351,,,,,,,
1750,World,OWID_WRL,9.351,,,,9.351,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,Wallis and Futuna Islands,WLF,0.029,,,,,,,0.029,,,,
2019,World,OWID_WRL,36441.388,,,1563.761,14362.167,429.496,7615.714,12355.129,115.121,,,
2019,Yemen,YEM,10.255,,,0.837,0.337,,1.249,7.832,,,,
2019,Zambia,ZMB,6.720,,,1.046,1.878,,,3.797,,,,


In [12]:
# List the distinct ISO codes to spot non-country entities (missing codes, OWID_* aggregates)
df["iso_code"].unique()

array([nan, 'GBR', 'OWID_WRL', 'CAN', 'DEU', 'POL', 'USA', 'BEL', 'FRA',
       'AUT', 'NOR', 'ARM', 'AZE', 'BLR', 'EST', 'GEO', 'HUN', 'KAZ',
       'KGZ', 'LVA', 'LTU', 'MDA', 'RUS', 'ESP', 'TJK', 'TKM', 'UKR',
       'UZB', 'SWE', 'DNK', 'NLD', 'IRL', 'IND', 'ROU', 'CHE', 'AUS',
       'CZE', 'FIN', 'ITA', 'SVK', 'TUR', 'GRC', 'JPN', 'PRT', 'NZL',
       'BGR', 'PER', 'ZAF', 'BIH', 'HRV', 'MNE', 'MKD', 'SRB', 'SVN',
       'ARG', 'IDN', 'MYS', 'MEX', 'VNM', 'CHL', 'TWN', 'CHN', 'BRA',
       'ZWE', 'VEN', 'PRK', 'KOR', 'IRN', 'PHL', 'TTO', 'EGY', 'NGA',
       'DZA', 'TUN', 'ECU', 'COD', 'PRI', 'COL', 'ABW', 'BES', 'SXM',
       'IRQ', 'MOZ', 'BRB', 'BOL', 'MAR', 'MMR', 'ISR', 'LBN', 'SYR',
       'THA', 'URY', 'ALB', 'BHR', 'BRN', 'MDG', 'ISL', 'SAU', 'HKG',
       'ERI', 'CUB', 'ETH', 'GTM', 'NIC', 'LUX', 'BGD', 'KWT', 'PAK',
       'DOM', 'PAN', 'AFG', 'QAT', 'AGO', 'BHS', 'BLZ', 'BMU', 'BDI',
       'CMR', 'CPV', 'CRI', 'CYP', 'DJI', 'SLV', 'GNQ', 'SWZ', 'FRO',
       'FJI', 'GM

In [13]:
# Keep country rows only: drop entities without an ISO code (continents and other
# groupings) as well as the world aggregate.
# NB: Kosovo had no official iso_code, and we kept 'OWID_KOS'
df = df[df["iso_code"].notna() & df["iso_code"].ne("OWID_WRL")]

df

Unnamed: 0_level_0,Unnamed: 1_level_0,iso_code,co2,trade_co2,consumption_co2,cement_co2,coal_co2,flaring_co2,gas_co2,oil_co2,other_industry_co2,methane,nitrous_oxide,total_ghg
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1750,United Kingdom,GBR,9.351,,,,9.351,,,,,,,
1751,United Kingdom,GBR,9.351,,,,9.351,,,,,,,
1752,United Kingdom,GBR,9.354,,,,9.354,,,,,,,
1753,United Kingdom,GBR,9.354,,,,9.354,,,,,,,
1754,United Kingdom,GBR,9.358,,,,9.358,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,Vietnam,VNM,247.709,,,32.152,135.230,,18.500,61.827,,,,
2019,Wallis and Futuna Islands,WLF,0.029,,,,,,,0.029,,,,
2019,Yemen,YEM,10.255,,,0.837,0.337,,1.249,7.832,,,,
2019,Zambia,ZMB,6.720,,,1.046,1.878,,,3.797,,,,


<h2>Mapping</h2>

This part generates the mapped version of the dataset according to this [model](https://github.com/OpenGeoScales/ogs-connectors/wiki/Data-model). 

<h3>Pre-processing</h3>

The original sectors from the dataset need to be mapped to a standardised name of the sector. The current standard sectors are defined in the Excel sheet <code>sectors mapping</code> in the following Excel [document](https://docs.google.com/spreadsheets/d/1CnTpHjZZZepgJ1o1VuQUN61ZaLLRtM1OhZzJU9HCPaY/edit#gid=1017073866).

Similarly, it is important to look at the gas mapping nomenclature that can be found in the Excel sheet <code>gas mapping</code> in the same Excel document.

In [35]:
# Option to select only a subset of countries/year (to check structure)
#list_country=["Afghanistan","Puerto Rico"]
#list_years=[2014,2015]

# Emission columns exported to the mapped file, in output order
list_sectors = [
    "co2",
    "trade_co2",
    "consumption_co2",
    "cement_co2",
    "coal_co2",
    "flaring_co2",
    "gas_co2",
    "oil_co2",
    "other_industry_co2",
    "methane",
    "nitrous_oxide",
    "total_ghg",
]

# Option to generate the full mapped dataset: every country present in the index
# (level 1 of the index is the country name)
list_country = df.index.get_level_values(1).unique()

In [37]:
# Display the countries that the export loop will iterate over
list_country

Index(['United Kingdom', 'Canada', 'Germany', 'Poland', 'United States',
       'Belgium', 'France', 'Austria', 'Norway', 'Armenia',
       ...
       'Liechtenstein', 'Marshall Islands', 'Namibia', 'Palau', 'Palestine',
       'Turks and Caicos Islands', 'Tuvalu', 'Wallis and Futuna Islands',
       'Timor', 'Kosovo'],
      dtype='object', name='country', length=213)

In [29]:
# Mapping sector: original dataset column -> standardised sector name
mapped_sectors = {
    "co2": "total_emissions",
    "trade_co2": "trade",
    "consumption_co2": "consumption",
    "cement_co2": "fossil_emissions_cement",
    "coal_co2": "fossil_emissions_coal",
    "oil_co2": "fossil_emissions_oil",
    "gas_co2": "fossil_emissions_gas",
    "flaring_co2": "fossil_emissions_flaring",
    "other_industry_co2": "fossil_emissions_other",
    # Non-CO2 series are totals for their respective gas
    "methane": "total_emissions",
    "nitrous_oxide": "total_emissions",
    "total_ghg": "total_emissions",
}

In [16]:
# Mapping gas name: original dataset column -> standardised gas identifier
# Every *_co2 column (and the plain co2 total) refers to carbon dioxide
mapped_gas_name = dict.fromkeys(
    [
        "co2",
        "trade_co2",
        "consumption_co2",
        "cement_co2",
        "coal_co2",
        "oil_co2",
        "gas_co2",
        "flaring_co2",
        "other_industry_co2",
    ],
    "CO2",
)
# Remaining series each have their own gas label
mapped_gas_name.update(
    methane="CH4",
    nitrous_oxide="N2O",
    total_ghg="kyotogases",
)

In [21]:
# Mapping unit: original dataset column -> unit of the exported value.
# The CO2 series of this dataset are expressed in million tonnes of CO2 (MtCO2),
# not in million tonnes of carbon (MtC): the values are exported unconverted, so
# labelling them "MtC" would misstate them by the C -> CO2 mass factor.
mapped_units = {
    "co2": "MtCO2",
    "trade_co2": "MtCO2",
    "consumption_co2": "MtCO2",
    "cement_co2": "MtCO2",
    "coal_co2": "MtCO2",
    "oil_co2": "MtCO2",
    "gas_co2": "MtCO2",
    "flaring_co2": "MtCO2",
    "other_industry_co2": "MtCO2",
    # Non-CO2 gases and the GHG total are labelled in CO2-equivalents
    "methane": "MtCO2e",
    "nitrous_oxide": "MtCO2e",
    "total_ghg": "MtCO2e",
}

Instructions for the nomenclature of the mapped data filename:
<ul>
    <li>The output mapped data must be stored in a sub-directory <code>mapped</code> in the directory associated with the dataset in <code>data/ghg-emissions/</code></li>
    <li>The name of the mapped data set must be <code>mapped_data_<span style="color:red">DataSourceName</span>.json</code>, where <code><span style="color:red">DataSourceName</span></code> is the same name used to characterize the directory of the dataset in <code>data/ghg-emissions/</code></li>
</ul>

<ins>Ex</ins>: For the data of gcp, the mapped datafile is found at <code>data/ghg-emissions/gcp/mapped/mapped_data_gcp.json</code>

In [27]:
# Name of the output mapped datafile, derived from the data-source name
data_name = "owid"
namefile = f"mapped_data_{data_name}.json"

In [36]:
# Write the mapped dataset as JSON Lines: one JSON object per line, one line for each
# (country, year, sector) combination that has a (non-NaN) value.
# NOTE(review): the naming instructions in this notebook ask for the file to live in a
# `mapped/` sub-directory; `namefile` is a bare filename, so it is written to the
# current working directory — confirm this is intended.

############## Entries associated with data_source
# Identical for every record, so it is built once instead of once per value.
dict_data_source = {
    "name": data_name,
    "link": "https://github.com/owid/co2-data",
}

# Create the output file. The context manager guarantees the file is flushed and
# closed even if the loop raises part-way through.
with open(namefile, "w") as file:

    # Loop over geo_component (in this case, countries)
    for country in list_country:

        # All rows of this country; index level 1 holds the country name and the
        # rows keep the order of the dataframe (one row per year).
        country_rows = df.loc[df.index.get_level_values(1) == country]

        ############## Entries associated with geo_component
        # Only depends on the country. The ISO code is read from the country's
        # first row, i.e. its first listed year.
        dict_geo_component = {
            "scale": "country",
            "name": country,
            "identifier": {
                "id": country_rows["iso_code"].iloc[0],
                "type": "alpha3",
            },
        }

        # Loop over years: each row carries all sector values of one year, which
        # avoids one MultiIndex lookup per (year, sector) pair.
        for (yr, _), sector_values in country_rows[list_sectors].iterrows():
            for sect in list_sectors:

                # Get the emission value from dataset; missing values are not exported
                emission_value = sector_values[sect]
                if math.isnan(emission_value):
                    continue

                ############# Entries associated with emission
                dict_emission = {
                    "gas": mapped_gas_name[sect],
                    "value": emission_value,
                    "unit": {"unit_used": mapped_units[sect]},
                    "sector": {
                        "sector_origin_name": sect,
                        "sector_mapped_name": mapped_sectors[sect],
                    },
                }

                ############# Combine everything together
                dict_mapped_entry = {
                    "data_source": dict_data_source,
                    "geo_component": dict_geo_component,
                    "date": str(yr) + "-01-01",
                    "emission": dict_emission,
                }

                # Write the json object to a file and add a line break (every line is a json object)
                json.dump(dict_mapped_entry, file)
                file.write("\n")