In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Load the combined CDP dataset generated in "cdp-notebook.ipynb".
# It is a concatenation of each yearly report, with a mapping of columns.
data = pd.read_csv(
    "../../../data/ghg-emissions/cdp/cdp_data_all_years.csv",
    sep=";",
    parse_dates=["Accounting year start", "Accounting year end"],
)

In [3]:
# Overview of the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2929 entries, 0 to 2928
Data columns (total 48 columns):
 #   Column                                                         Non-Null Count  Dtype         
---  ------                                                         --------------  -----         
 0   Organization                                                   2929 non-null   object        
 1   Account number                                                 2929 non-null   int64         
 2   Country                                                        2929 non-null   object        
 3   City                                                           2433 non-null   object        
 4   C40                                                            256 non-null    object        
 5   Reporting year                                                 2929 non-null   int64         
 6   Accounting year                                                2313 non-null   object        
 7

In [5]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Organization,Account number,Country,City,C40,Reporting year,Accounting year,Primary protocol,Primary protocol comment,Total city-wide emissions,...,Land area (in square km),Population,Population Year,Average altitude (m),Average annual temperature (in Celsius),City GDP,GDP currency,Year of GDP,GDP Source,Last update
0,Kaohsiung City Government,31112,Taiwan,Kaohsiung,,2012,2010,2006 IPCC Guidelines for National Greenhouse G...,The methodology we applied is based on 2006 IP...,63624500.0,...,,,,,,,,,,
1,Taipei City Government,31446,Taiwan,Taipei,,2012,2010,Other: IPCC 1995 GWP,GHG emissions counting in Taipei City contain ...,15500000.0,...,,,,,,,,,,
2,Greater London Authority,3422,United Kingdom,London,C40,2012,2010,Other: Defra's 2009 guidance on how to measure...,The London Energy and Greenhouse Gas Inventory...,43400000.0,...,,,,,,,,,,
3,City of Melbourne,31109,Australia,Melbourne,C40,2012,2009,,Melbourne's methodology and boundary for munic...,4870289.0,...,,,,,,,,,,
4,Bogotá Distrito Capital,31154,Colombia,Bogotá,C40,2012,2011,International Standard for Determining Greenho...,"For Greenhouse Gas Inventory in Bogotá, we use...",15921690.22,...,,,,,,,,,,


# Filtering data
- We keep only emissions covering roughly one year (entries with an unknown duration are kept as well).


In [6]:
# Parse the 'Duration' column into timedelta values so it can be compared
# against pd.Timedelta bounds when filtering.
data['Duration'] = pd.to_timedelta(data['Duration'])

In [7]:
# Inspect the converted column (NaT marks rows without a duration).
data['Duration']

0           NaT
1           NaT
2           NaT
3           NaT
4           NaT
         ...   
2924   364 days
2925        NaT
2926   365 days
2927   364 days
2928        NaT
Name: Duration, Length: 2929, dtype: timedelta64[ns]

In [8]:
# Count how often each distinct duration occurs, to choose the one-year filter bounds.
data['Duration'].value_counts()

364 days     1311
365 days      432
366 days       16
363 days        6
729 days        3
362 days        3
11 days         2
30 days         2
-1 days         2
361 days        2
367 days        2
1447 days       1
1778 days       1
211 days        1
2555 days       1
334 days        1
700 days        1
2922 days       1
273 days        1
396 days        1
899 days        1
368 days        1
2921 days       1
1095 days       1
0 days          1
Name: Duration, dtype: int64

In [9]:
# Keep rows whose accounting period is about one year long (strictly between
# 360 and 370 days); rows with an unknown duration (NaT) are kept as well.
duration = data["Duration"]
is_one_year = duration.gt(pd.Timedelta(days=360)) & duration.lt(pd.Timedelta(days=370))
data_to_map = data[duration.isna() | is_one_year]

In [10]:
# Renumber the rows 0..n-1; the previous index is kept as an "index" column.
data_to_map = data_to_map.reset_index()

In [11]:
# (rows, columns) of the filtered frame.
data_to_map.shape

(2907, 49)

# Mapping

Questions
- How should country information be recorded, since each entry is a city?
- How should BASIC and total city-wide emissions be mapped into scope_X? For now I leave them as BASIC/BASIC+ scope.
- Is this the right way to handle gas (several gases are included in each emission datapoint)?

In [12]:
# Output location of the mapped data file: <path>mapped_data_<datasource_name>.json
path = "../../../data/ghg-emissions/cdp/mapped/"
datasource_name = "cdp"
namefile = f"{path}mapped_data_{datasource_name}.json"

In [13]:
# Source emission column -> mapped scope name.
# The three totals do not have a scope mapping yet and carry a placeholder.
mapped_scopes = {
    'Scope 1 generation of grid supplied energy': 'scope_1',
    'Scope 1 excluding generation of grid supplied energy': 'scope_1',
    'Scope 2 generation of grid supplied energy': 'scope_2',
    'Scope 2 excluding generation of grid supplied energy': 'scope_2',
    'Scope 3 generation of grid supplied energy': 'scope_3',
    'Scope 3 excluding generation of grid supplied energy': 'scope_3',
    'Scope 1': 'scope_1',
    'Scope 2': 'scope_2',
    'Scope 3': 'scope_3',
    'TOTAL BASIC emissions (GPC)': 'needs to be mapped',
    'TOTAL BASIC+ emissions (GPC)': 'needs to be mapped',
    'Total city-wide emissions': 'needs to be mapped',
}

# Emission columns to export, in the order the mapping above declares them.
list_scopes = list(mapped_scopes)

In [15]:
def _json_safe(value):
    """Return ``None`` for a missing cell (NaN / NaT / None), else ``value`` unchanged.

    ``json.dump`` writes a float NaN as the bare token ``NaN``, which is not
    valid JSON. Mapping missing cells to ``None`` makes them serialise as ``null``.
    """
    return None if pd.isna(value) else value


def _build_mapped_entry(df, row, scope):
    """Build the JSON-serialisable record for one emission value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns read below.
    row : index label
        Label of the row in ``df`` to export.
    scope : str
        Name of the emission column; must be a key of ``mapped_scopes``.

    Returns
    -------
    dict
        Record with the keys "date", "data_source", "geo_component" and
        "emission", in that order.
    """
    ############## Entries associated with data_source
    data_source = {
        "name": datasource_name,
        "link": "https://data.cdp.net/",
        "properties": {
            "scenario": _json_safe(df.loc[row, "Primary protocol"]),
        },
    }

    ############## Entries associated with geo_component
    geo_component = {
        "scale": "city",
        "name": _json_safe(df.loc[row, "City"]),
        "country": _json_safe(df.loc[row, "Country"]),
        "properties": {
            "geoComponent_datasource_code": int(df.loc[row, "Account number"]),
        },
    }

    ############## Entries associated with emissions
    emission = {
        "unit": {"unit_used": "MTCO2"},
        "scope": {
            "scope_origin_name": scope,
            "scope_mapped_name": mapped_scopes[scope],
        },
        "value": float(df.loc[row, scope]),
        "gas": _json_safe(df.loc[row, "Gases included"]),
    }

    ############## Combine everything together
    return {
        # NOTE(review): a missing start date is still written as the string "NaT",
        # as before — confirm what downstream consumers expect here.
        "date": str(df.loc[row, "Accounting year start"]),
        "data_source": data_source,
        "geo_component": geo_component,
        "emission": emission,
    }


df = data_to_map

# The context manager closes the file even if a row fails to serialise.
with open(namefile, "w", encoding="utf-8") as file:
    # Iterate over the actual index labels so `.loc` works whether or not the
    # index is a clean 0..n-1 range.
    for row in df.index:
        for scope in list_scopes:
            # Skip scopes with no reported value; pd.isna also copes with
            # non-float cells, where np.isnan would raise TypeError.
            if pd.isna(df.loc[row, scope]):
                continue

            # Write the json object to the file and add a line break (every line is a json object)
            json.dump(_build_mapped_entry(df, row, scope), file)
            file.write("\n")