In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import math

<h2> Read the file </h2>

In [5]:
# Load the semicolon-separated CH4 emission file, skipping its first two lines.
df = pd.read_csv(
    "data/CH4/AGS_txt/v50_CH4_1970_AGS.txt",
    sep=";",
    skiprows=2,
)

# Disabled steps inherited from another dataset's workflow.
# NOTE(review): they refer to columns (Year, Country, Total, Coal, ...) that do
# not appear in the preview of this file — confirm before re-enabling.
#
# Sort and index the frame by (Year, Country):
#df.sort_values(["Year","Country"],inplace=True)
#df.set_index(["Year","Country"],inplace=True)
#
# Convert MtCO2 to MtC (for consistency with the values of the other GCP datasets):
#df[["Total","Coal","Oil","Gas","Cement","Flaring","Other","Per Capita"]] = (1.0/3.664)* df[["Total","Coal","Oil","Gas","Cement","Flaring","Other","Per Capita"]]

# Preview the first three rows
df.head(3)

Unnamed: 0,lat,lon,emission 1970 (tons)
0,60.4,85.0,1.45699
1,60.2,86.7,4.62809
2,60.2,87.0,3.04254


<h2>Mapping</h2>

This part generates the mapped version of the dataset according to this [model](https://github.com/OpenGeoScales/ogs-connectors/wiki/Data-model). 

<h3>Pre-processing</h3>

Depending on the structure of the dataset, it might be necessary to adapt how the dataset is read below (e.g. by adding dataset-specific for loops). In the case of GCP, we loop over years, countries and sectors.

<b>For every emission value, we create a single .json object.</b> 

In [3]:
# Years to map. Only 1970 is selected here (useful to check the output structure).
list_years = [1970]

# Optional subset of countries, currently disabled.
# NOTE(review): list_country is only defined in commented-out lines of this cell,
# yet the main mapping loop iterates over it — define it before running that loop.
#list_country=["Afghanistan","Albania"]

# Alternative: derive the full range of years and countries from the index of df
# (this requires df to carry a two-level (year, country) index).
#list_years = df.index.get_level_values(0).unique()
#list_country = df.index.get_level_values(1).unique()

# Sectors (column names of the dataset) to map.
list_sectors = ["AGS"]

The original sectors from the dataset need to be mapped to a standardised name of the sector. The current standard sectors are defined in the Excel sheet <code>sectors mapping</code> in the following Excel [document](https://docs.google.com/spreadsheets/d/1CnTpHjZZZepgJ1o1VuQUN61ZaLLRtM1OhZzJU9HCPaY/edit#gid=1017073866).

Similarly, it is important to look at the gas mapping nomenclature that can be found in the Excel sheet <code>gas mapping</code> in the same Excel document.

In [4]:
# Correspondence between the sector labels of the dataset and the standardised
# sector names.
# NOTE(review): "AGS", the only sector listed in list_sectors, has no entry here,
# so the lookup in the main mapping loop would raise KeyError — add it once its
# standard name is known.
mapped_sectors = {
    "Coal": "fossil_emissions_coal",
    "Oil": "fossil_emissions_oil",
    "Gas": "fossil_emissions_gas",
    "Cement": "fossil_emissions_cement",
    "Flaring": "fossil_emissions_flaring",
    "Other": "fossil_emissions_other",
}

# Standardised gas name. If the dataset covers several gases, look up the
# correspondence in the gas mapping sheet.
# NOTE(review): the path of the loaded file mentions CH4 while this value is
# "CO2" — confirm which gas applies.
mapped_gas_name = "CO2"

Instructions for the nomenclature of the mapped data filename:
<ul>
    <li>The output mapped data must be stored in a sub-directory <code>mapped</code> in the directory associated with the dataset in <code>data/ghg-emissions/</code></li>
    <li>The name of the mapped data set must be <code>mapped_data_<span style="color:red">DataSourceName</span>.json</code>, where <code><span style="color:red">DataSourceName</span></code> is the same name used to characterize the directory of the dataset in <code>data/ghg-emissions/</code></li>
</ul>

<ins>Ex</ins>: For the data of gcp, the mapped datafile is found at <code>data/ghg-emissions/gcp/mapped/mapped_data_gcp.json</code>

In [5]:
# Location of the output mapped datafile:
#   <path>mapped_data_<data_name>.json
path = "../../../data/ghg-emissions/gcp/mapped/"
data_name = "gcp"
namefile = f"{path}mapped_data_{data_name}.json"

<h3>Main mapping loop</h3> 

In [6]:
# Write the mapped dataset: one JSON object per line, one object per
# (country, year, sector) combination that has a non-NaN emission value.
#
# Names read from earlier cells: namefile, list_country, list_years,
# list_sectors, df, mapped_sectors, mapped_gas_name, data_name.
# NOTE(review): list_country is only defined in commented-out lines of the
# selection cell — it must be defined before this cell can run.
# NOTE(review): the df.loc[(yr, country), ...] lookups assume df carries a
# two-level (year, country) index; the set_index call in the loading cell is
# currently commented out — TODO confirm.

# Create the output file; the context manager closes it even if a lookup
# below raises.
with open(namefile, "w") as file:

    # Loop over geo_component (in this case, countries)
    for country in list_country:

        # Loop over years
        for yr in list_years:
            for sect in list_sectors:

                # Get the emission value from dataset
                emission_value = df.loc[(yr, country), sect]

                # Missing values produce no output entry
                if math.isnan(emission_value):
                    continue

                ############## Entries associated with data_source
                dict_data_source = {
                    "name": data_name,
                    "link": "https://www.globalcarbonproject.org/carbonbudget/20/data.htm",
                }

                ############## Entries associated with geo_component
                # Sub-dictionary geo-component ID
                # NOTE(review): the ISO code is always read from the 2019 row,
                # whatever yr is; this raises KeyError if 2019 is absent from df.
                dict_geo_component_id = {
                    "id": df.loc[(2019, country), "ISO 3166-1 alpha-3"],
                    "type": "alpha3",
                }
                dict_geo_component = {
                    "scale": "country",
                    "name": country,
                    "identifier": dict_geo_component_id,
                }

                ############# Entries associated with emission
                # Sub-dictionary on unit
                dict_emission_unit = {"unit_used": "MtC"}

                # Sub-dictionary on sector (raises KeyError when sect has no
                # entry in mapped_sectors)
                dict_emission_sector = {
                    "sector_origin_name": sect,
                    "sector_mapped_name": mapped_sectors[sect],
                }
                dict_emission = {
                    "gas": mapped_gas_name,
                    "value": emission_value,
                    "unit": dict_emission_unit,
                    "sector": dict_emission_sector,
                }

                ############# Combine everything together
                dict_mapped_entry = {
                    "data_source": dict_data_source,
                    "geo_component": dict_geo_component,
                    "date": str(yr) + "-01-01",
                    "emission": dict_emission,
                }

                # Write the json object to the file and add a line break
                # (every line is a json object)
                json.dump(dict_mapped_entry, file)
                file.write("\n")