In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import math

<h2>Mapping</h2>

This part generates the mapped version of the dataset according to this [model](https://github.com/OpenGeoScales/ogs-connectors/wiki/Data-model). 

<h3>Pre-processing</h3>

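Depending on the structure of the dataset, it might be necessary to adapt how the dataset is read below (e.g. by adding dataset-specific <code>for</code> loops). In the case of GCP, we loop over years, countries and sectors; in this notebook (EDGAR), we loop over gases, sectors and years, and then over the rows of each input file.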

<b>For every emission value, we create a single .json object.</b> 

In [37]:
# Optional: restrict the run to a small subset of years, sectors and gases
# (useful for checking the structure of the output quickly).
list_years = ["1970"]
list_sectors = ["ENE"]
list_gas = ["CH4"]

The original sectors from the dataset need to be mapped to a standardised name of the sector. The current standard sectors are defined in the Excel sheet <code>sectors mapping</code> in the following Excel [document](https://docs.google.com/spreadsheets/d/1CnTpHjZZZepgJ1o1VuQUN61ZaLLRtM1OhZzJU9HCPaY/edit#gid=1017073866).

Similarly, it is important to look at the gas mapping nomenclature that can be found in the Excel sheet <code>gas mapping</code> in the same Excel document.

In [22]:
# Sector mapping: sector code used by the dataset -> standardised sector name.
mapped_sectors = {
    "AGS": "agriculture",
    "ENE": "electricity_heat",
}

# Gas mapping: gas name used by the dataset -> standardised gas name.
# When several source names map to the same gas, check the "gas mapping"
# sheet for the correspondence.
mapped_gas = {
    "CH4": "CH4",
    "N2O": "N2O",
    "CO2_org_short-cycle_C": "CO2",
    "CO2_excl_short-cycle_org_C": "CO2",
}


Instructions for the nomenclature of the mapped data filename:
<ul>
    <li>The output mapped data must be stored in a sub-directory <code>mapped</code> in the directory associated with the dataset in <code>data/ghg-emissions/</code></li>
    <li>The name of the mapped dataset must be <code>mapped_data_<span style="color:red">DataSourceName</span>.json</code>, where <code><span style="color:red">DataSourceName</span></code> is the same name used to characterize the directory of the dataset in <code>data/ghg-emissions/</code></li>
</ul>

<ins>Ex</ins>: For the data of gcp, the mapped datafile is found at <code>data/ghg-emissions/gcp/mapped/mapped_data_gcp.json</code>

In [8]:
# Root directory prepended to every input file name built in the main
# mapping loop (the per-gas sub-directories are expected underneath it).
path = "data/"

<h3>Main mapping loop</h3> 

In [23]:
# Gases, sectors and years processed by the main mapping loop.
# These assignments rebind any lists of the same name defined earlier.
list_gas = [
    "CH4",
    "N2O",
    "CO2_org_short-cycle_C",
]
list_sectors = ["ENE"]
list_years = ["2018"]

In [24]:
# Main mapping loop.
#
# For every (gas, sector, year) combination, read the matching input file and
# write one JSON object per row that carries a non-NaN emission value. The
# output is a JSON-lines file: one object per line.
#
# The `with` block guarantees the output file is closed (and flushed) even if
# reading an input file or looking up a mapping raises part-way through.
with open("mapped_data_edgar.json", "w") as file:
    # Loop over gases
    for gas in list_gas:
        # Looked up once per gas: an unmapped gas fails here, before any file is read.
        gas_mapped_name = mapped_gas[gas]

        # Loop over sectors
        for sect in list_sectors:
            # Same fail-fast lookup for the sector.
            sector_mapped_name = mapped_sectors[sect]

            # Loop over years
            for yr in list_years:
                # Input layout: <path><gas>/<sector>_txt/v6.0_<gas>_<year>_<sector>.txt
                namefile = f"{path}{gas}/{sect}_txt/v6.0_{gas}_{yr}_{sect}.txt"

                # Semicolon-separated file. The first two lines are skipped; the
                # line after them is consumed as the header by read_csv and is
                # then replaced by our own column names.
                df = pd.read_csv(namefile, delimiter=";", skiprows=2)
                df.columns = ["lat", "lon", "emission"]

                # itertuples avoids one df.loc lookup per cell and yields plain
                # Python scalars, which json can always serialise.
                for lat, lon, emission_value in df.itertuples(index=False, name=None):
                    # Rows without an emission value are not exported.
                    if math.isnan(emission_value):
                        continue

                    # Final JSON object; key order matches the target data model.
                    dict_mapped_entry = {
                        "data_source": {
                            "name": "edgar",
                        },
                        "geo_component": {
                            "scale": "grid",
                            # A grid cell is identified by its "lat;lon" position.
                            "identifier": {
                                "id": str(lat) + ";" + str(lon),
                                "type": "position",
                            },
                        },
                        "date": str(yr) + "-01-01",
                        "emission": {
                            "gas": gas_mapped_name,
                            "value": emission_value,
                            "unit": {
                                "unit_used": "tons",
                            },
                            "sector": {
                                "sector_origin_name": sect,
                                "sector_mapped_name": sector_mapped_name,
                            },
                        },
                    }

                    # Write the json object followed by a line break
                    # (every line of the output file is one json object).
                    json.dump(dict_mapped_entry, file)
                    file.write("\n")