In [1]:
import pandas as pd
import numpy as np
import math
import json

In [2]:
# Fetch the raw OECD greenhouse-gas emissions CSV from the project's GitHub repository
# and read it into a DataFrame.

url = "https://raw.githubusercontent.com/OpenGeoScales/ogs-data-exploration/oecd/data/ghg-emissions/oecd/raw/AIR_GHG_07042021221914030.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Shape the dataframe: order the rows by (Year, Country, POL, Variable) and
# promote those four columns to a MultiIndex so records can be looked up by key.
index_columns = ["Year", "Country", "POL", "Variable"]
df = df.sort_values(index_columns).set_index(index_columns)

In [4]:
# Define the range of years, countries, sectors and pollutants from the data that we will map.
# Years and countries are the distinct values of index levels 0 and 1 respectively.
list_years, list_country = (
    df.index.get_level_values(level).unique() for level in (0, 1)
)
# Sector labels (as spelled in the "Variable" index level) that are exported.
# Every label here must also be a key of `mapped_sectors`; the double space in
# the two "Total  emissions ..." labels is intentional and matches that mapping.
list_sectors = [
    "Total  emissions excluding LULUCF",
    "Total  emissions including LULUCF",
    "1 - Energy",
    "1A4 - Residential and other sectors",
    "1A1 - Energy Industries",
    "1B - Fugitive Emissions from Fuels",
    # Fixed: a stray "list_country" had been pasted into this label, so it never
    # matched any row and the sector was silently left out of the output.
    "1A2 - Manufacturing industries and construction",
    "1A5 - Energy - Other",
    "1A3 - Transport",
    "1C - CO2 from Transport and Storage",
    "2- Industrial processes and product use",
    "3 - Agriculture",
    "Land use, land-use change and forestry (LULUCF)",
    "5 - Waste",
    "6 - Other",
]
# Pollutant codes are the distinct values of index level 2.
pollutant_level = df.index.get_level_values(2)
list_gas = pollutant_level.unique()

In [5]:
# Mapping sector: source sector label -> sector name written to the mapped output
mapped_sectors = {
    "Total  emissions excluding LULUCF": "total_excluding_LUCF",
    "Total  emissions including LULUCF": "total_including_LUCF",
    "1 - Energy": "total_energy",
    "1A4 - Residential and other sectors": "building",
    "1A1 - Energy Industries": "electricity_heat",
    "1B - Fugitive Emissions from Fuels": "fugitive_emissions",
    "1A2 - Manufacturing industries and construction": "manufacturing_construction",
    "1A5 - Energy - Other": "other_fuel_combustion",
    "1A3 - Transport": "transportation",
    "1C - CO2 from Transport and Storage": "transport_storage",
    "2- Industrial processes and product use": "industrial_processes",
    "3 - Agriculture": "agriculture",
    "Land use, land-use change and forestry (LULUCF)": "lucf",
    "5 - Waste": "waste",
    "6 - Other": "other",
}

In [6]:
# Mapping gas name: source pollutant code -> gas name written to the mapped output.
# Only "GHG" is renamed; every other code maps to itself.
mapped_gas = {
    "GHG": "kyotogases",
    "CO2": "CO2",
    "CH4": "CH4",
    "N2O": "N2O",
    "HFC": "HFC",
    "HFC_PFC": "HFC_PFC",
    "NF3": "NF3",
    "PFC": "PFC",
    "SF6": "SF6",
}

In [7]:
# Build the location of the mapped output file from its directory and the data source name.
path = "../../../data/ghg-emissions/oecd/mapped/"
data_name = "oecd"
namefile = "{}mapped_data_{}.json".format(path, data_name)

In [8]:
# Create the output file.
#
# One JSON object is written per line, for every (country, year, gas, sector)
# combination that exists in `df` and has a non-missing "Value". The nested
# loops fix the output order: country, then year, then gas, then sector.

# The data-source description is the same for every record, so build it once.
dict_data_source = {
    "name": data_name,
    "link": "https://stats.oecd.org/Index.aspx?DataSetCode=AIR_GHG",
    # Sub-dictionary on data source properties
    "properties": {"description": "", "provider": "oecd"},
}

# `with` guarantees the file is closed even if a record fails part-way through.
with open(namefile, "w", encoding="utf-8") as file:

    # Loop over geo_component
    for i, country in enumerate(list_country, start=1):

        # NOTE(review): the EU aggregate is tagged with id type "alpha2" —
        # confirm this matches the code stored in the "COU" column.
        if country in ["European Union (28 countries)"]:
            scale = "country group"
            id_type = "alpha2"
        elif country in ["OECD - Europe", "OECD - Total"]:
            scale = "country group"
            id_type = "name"
        else:
            scale = "country"
            id_type = "alpha3"

        # Loop over years
        for year in list_years:

            # Loop over gas
            for gas in list_gas:

                # Loop over sectors
                for sector in list_sectors:

                    key = (year, country, gas, sector)

                    # Check if there is an entry for the given index
                    # (index lookup rather than a full scan with isin()).
                    if key not in df.index:
                        continue

                    # Selecting with a list of one tuple always yields a DataFrame,
                    # whether or not the index holds duplicates; the first match is used.
                    rows = df.loc[[key]]

                    # Get the emission value from dataset
                    emission_value = float(rows["Value"].iloc[0])

                    # Check that the value is not missing
                    if math.isnan(emission_value):
                        continue

                    country_code = rows["COU"].iloc[0]

                    ############## Entries associated with geo_component
                    dict_geo_component = {
                        "scale": scale,
                        "name": country,
                        # Sub-dictionary geo-component ID
                        "identifier": {"id": country_code, "type": id_type},
                        # Sub-dictionary on geo component properties
                        "properties": {"data_source_code": country_code},
                    }

                    ############# Entries associated with emission
                    dict_emission = {
                        "gas": mapped_gas[gas],
                        "value": emission_value,
                        # Sub-dictionary on unit
                        "unit": {"unit_used": "kT co2 eq"},
                        # Sub-dictionary on sector
                        "sector": {
                            "sector_origin_name": sector,
                            "sector_mapped_name": mapped_sectors[sector],
                        },
                    }

                    ############# Combine everything together
                    dict_mapped_entry = {
                        "data_source": dict_data_source,
                        "geo_component": dict_geo_component,
                        "date": str(year) + "-01-01",
                        "emission": dict_emission,
                    }

                    # Write the json object to a file and add a line break (every line is a json object)
                    json.dump(dict_mapped_entry, file)
                    file.write("\n")

        # Print progress
        print("{:02d}/{} - {} done!".format(i, len(list_country), country))

01/48 - Argentina done!
02/48 - Australia done!
03/48 - Austria done!
04/48 - Belgium done!
05/48 - Brazil done!
06/48 - Canada done!
07/48 - Chile done!
08/48 - Colombia done!
09/48 - Czech Republic done!
10/48 - Denmark done!
11/48 - Estonia done!
12/48 - European Union (28 countries) done!
13/48 - Finland done!
14/48 - France done!
15/48 - Germany done!
16/48 - Greece done!
17/48 - Hungary done!
18/48 - Iceland done!
19/48 - Ireland done!
20/48 - Italy done!
21/48 - Japan done!
22/48 - Korea done!
23/48 - Latvia done!
24/48 - Lithuania done!
25/48 - Luxembourg done!
26/48 - Mexico done!
27/48 - Netherlands done!
28/48 - New Zealand done!
29/48 - Norway done!
30/48 - OECD - Europe done!
31/48 - OECD - Total done!
32/48 - Poland done!
33/48 - Portugal done!
34/48 - Russia done!
35/48 - Slovak Republic done!
36/48 - Slovenia done!
37/48 - South Africa done!
38/48 - Spain done!
39/48 - Sweden done!
40/48 - Switzerland done!
41/48 - Turkey done!
42/48 - United Kingdom done!
43/48 - Unite