In [46]:
import pymongo
import pandas as pd
import os
import json
from tqdm import tqdm
import logging

# Notebook-specific logger; records at INFO level and above are let through.
logger = logging.getLogger('test_emissions_integration')
logger.setLevel(logging.INFO)

In [47]:
# NOTE(review): `catalog` is not defined in any cell shown in this notebook —
# presumably it is injected by the environment that launches the kernel
# (e.g. a Kedro Jupyter session); TODO confirm, and define or import it explicitly.
catalog.list()

['data_sources',
 'ademe_assessments',
 'ademe_emissions',
 'ademe_legal_units',
 'ademe_scope_items',
 'ademe_texts',
 'ademe_merged',
 'gcp',
 'parameters']

## MongoDB / Collection parameters

In [29]:
# Names of the database and of the collections this notebook works with.
DATABASE_NAME = 'ogs'
GEO_COMPONENTS_COLLECTION_NAME = 'geo_components'
DATA_SOURCES_COLLECTION_NAME = 'data_sources'
EMISSIONS_COLLECTION_NAME = 'emissions'

# Connection string. It can be overridden through the MONGODB_URI environment
# variable so the notebook can target another instance without editing this
# cell; the default is the local instance used so far.
MONGODB_URI = os.environ.get('MONGODB_URI', 'mongodb://localhost:27017/')

client = pymongo.MongoClient(MONGODB_URI)
db = client.get_database(DATABASE_NAME)

In [30]:
# Handles on the three collections: the two referentials that are read, and
# the emissions collection that is written to.
geo_components_collection, data_sources_collection, emissions_collection = (
    db[GEO_COMPONENTS_COLLECTION_NAME],
    db[DATA_SOURCES_COLLECTION_NAME],
    db[EMISSIONS_COLLECTION_NAME],
)

In [31]:
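# Destructive: uncomment to empty the emissions collection before re-running
# the insertion cells below (otherwise each run appends the same documents again).
# db.drop_collection(EMISSIONS_COLLECTION_NAME)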

## Helper functions

In [32]:
def get_geo_components():
    """
    Return every document of the geo_components collection as a list
    """
    cursor = geo_components_collection.find({})
    return list(cursor)

def get_data_sources():
    """
    Return every document of the data_sources collection as a list
    """
    cursor = data_sources_collection.find({})
    return list(cursor)

def cast_json(line):
    """
    Parse one line of text as JSON.

    Returns the decoded object, or an empty dict when the line cannot be
    parsed; in that case the failure is logged together with the offending line.
    """
    try:
        return json.loads(line)
    except (TypeError, ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input
        # and RecursionError covers pathologically nested documents. A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit.
        # The line is passed as an argument for the %s placeholder: without a
        # placeholder the logging module fails to format the record.
        logging.error('Failed to parse to json: %s', line)
        return {}

## Core functions

In [33]:
def find_geo_component(geo_component, ref_geo_components):
    """
    Look a geo_component up in the referential.

    The first referential entry that declares an identifier of the same type
    as the geo_component's, with the same value, wins: its `_id` is returned.
    None is returned when no entry matches.
    """
    id_kind = geo_component['identifier']['type']

    # Lazily scan the referential; entries that do not know this identifier
    # type are filtered out before their value is compared.
    matching_ids = (
        candidate['_id']
        for candidate in ref_geo_components
        if id_kind in candidate['identifiers']
        and candidate['identifiers'][id_kind] == geo_component['identifier']['id']
    )
    return next(matching_ids, None)


def find_data_source(data_source, ref_data_sources):
    """
    Look a data_source up in the referential by name.

    Returns the `_id` of the first referential entry carrying the same name,
    or None when no entry matches.
    """
    matching_ids = (
        known_source['_id']
        for known_source in ref_data_sources
        if data_source['name'] == known_source['name']
    )
    return next(matching_ids, None)

In [34]:
class InsertEmissionError(Exception):
    """Raised when an emission cannot be inserted (malformed record or unknown reference)."""


def insert_emission(emission, ref_geo_components, ref_data_sources):
    """
    Given an emission and referentials, try to insert the emission

    The emission's geo_component and data_source are resolved against the
    referentials, then a flat document is inserted into the emissions collection.

    Raises InsertEmissionError when the emission is missing a required field,
    or when its geo_component or data_source is not found in the referentials.
    """
    # Read every required field up front. A malformed record (for instance an
    # empty dict) would otherwise raise a KeyError/TypeError that batch callers
    # catching InsertEmissionError do not handle, aborting the whole batch.
    try:
        geo_component = emission['geo_component']
        data_source = emission['data_source']
        # Keys the referential lookups below rely on.
        _ = (
            geo_component['identifier']['type'],
            geo_component['identifier']['id'],
            data_source['name'],
        )
        measurement = emission['emission']
        emission_fields = {
            'date': emission['date'],
            'gas': measurement['gas'],
            'value': measurement['value'],
            'unit': measurement['unit'],
            'sector': measurement['sector_name'],
            'subsector': measurement['subsector_name'],
        }
    except (KeyError, TypeError) as error:
        raise InsertEmissionError(
            'malformed emission (%s: %s): %s' % (type(error).__name__, error, emission)
        ) from error

    # Get geo_component id
    geo_component_id = find_geo_component(
        geo_component=geo_component,
        ref_geo_components=ref_geo_components
    )

    if not geo_component_id:
        raise InsertEmissionError('geo_component not found: %s' % emission['geo_component'])

    # Get data_source id
    data_source_id = find_data_source(
        data_source=data_source,
        ref_data_sources=ref_data_sources
    )

    if not data_source_id:
        raise InsertEmissionError('data_source not found: %s' % emission['data_source'])

    # Format document: reference ids first, then the emission's own fields
    emission_document = {
        'geo_component_id': geo_component_id,
        'data_source_id': data_source_id,
        **emission_fields,
    }

    emissions_collection.insert_one(emission_document)
    

def insert_emissions(emissions):
    """
    Insert a list of emissions into the collection, one by one.

    Emissions rejected with an InsertEmissionError are logged and counted;
    they do not stop the remaining insertions. A summary is logged at the end.
    """
    # Both referentials are loaded once and shared by every insertion.
    geo_refs = get_geo_components()
    source_refs = get_data_sources()

    inserted_count, failed_count = 0, 0

    for record in tqdm(emissions):
        try:
            insert_emission(record, geo_refs, source_refs)
        except InsertEmissionError as error:
            logger.error('Failed to insert emission: %s' % error)
            failed_count += 1
        else:
            inserted_count += 1

    logger.info('Succesfully inserted %s emissions.' % inserted_count)
    if failed_count:
        logger.warning('Failed to insert %s emissions. See logs for details.' % failed_count)

## Testing

### Try to find geo_components

In [35]:
# Referential loaded once; later cells reuse it for their own lookups.
ref_geo_components = get_geo_components()

# Country identified by its alpha-3 code.
geo_component = {'type': 'country', 'identifier': {'id': 'AFG', 'type': 'alpha-3'}}

find_geo_component(geo_component, ref_geo_components)

ObjectId('609c10bf3814bdc62fbb9b62')

In [36]:
# 'FR' is only two letters long, so no alpha-3 identifier is expected to match.
geo_component = {'type': 'country', 'identifier': {'id': 'FR', 'type': 'alpha-3'}}

# print() is used on purpose: a bare None would display nothing.
print(find_geo_component(geo_component, ref_geo_components))

None


### Insert some emissions (gcp)

In [37]:
# Read gcp file (one JSON document per line).
# The encoding is given explicitly so decoding does not depend on the
# platform default; whitespace-only lines are skipped so they are not sent
# to the parser and turned into empty records.
with open('./../data/01_raw/gcp-carbon-v2.json', 'r', encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]

# Transform to json
emissions = [cast_json(line) for line in lines]

In [38]:
# Peek at the first five parsed records
emissions[:5]

[{'data_source': {'name': 'GCP'},
  'geo_component': {'type': 'country',
   'identifier': {'id': 'AFG', 'type': 'alpha-3'}},
  'date': '1949-01-01',
  'emission': {'gas': 'CO2',
   'value': 0.004,
   'unit': 'MtC',
   'sector_name': 'Territorial Emissions',
   'subsector_name': 'Coal'}},
 {'data_source': {'name': 'GCP'},
  'geo_component': {'type': 'country',
   'identifier': {'id': 'AFG', 'type': 'alpha-3'}},
  'date': '1949-01-01',
  'emission': {'gas': 'CO2',
   'value': 0.0,
   'unit': 'MtC',
   'sector_name': 'Territorial Emissions',
   'subsector_name': 'Oil'}},
 {'data_source': {'name': 'GCP'},
  'geo_component': {'type': 'country',
   'identifier': {'id': 'AFG', 'type': 'alpha-3'}},
  'date': '1949-01-01',
  'emission': {'gas': 'CO2',
   'value': 0.0,
   'unit': 'MtC',
   'sector_name': 'Territorial Emissions',
   'subsector_name': 'Gas'}},
 {'data_source': {'name': 'GCP'},
  'geo_component': {'type': 'country',
   'identifier': {'id': 'AFG', 'type': 'alpha-3'}},
  'date': '194

In [39]:
# Insert first 5000 emissions
# NOTE: insert_emission appends with insert_one and performs no duplicate
# check, so re-running this cell inserts the same records again.
insert_emissions(emissions[:5000])

100%|██████████| 5000/5000 [00:06<00:00, 763.97it/s]

2021-05-12 19:44:58,852 - test_emissions_integration - INFO - Succesfully inserted 5000 emissions.





### KO emissions

In [40]:
# Sample batch mixing one valid emission with two that must be rejected
KO_emissions = [
    # OK
    {
        'data_source': {'name': 'GCP'},
        'geo_component': {
            'type': 'country',
            'identifier': {'id': 'AFG', 'type': 'alpha-3'},
        },
        'date': '1949-01-01',
        'emission': {
            'gas': 'CO2',
            'value': 0.004,
            'unit': 'MtC',
            'sector_name': 'Territorial Emissions',
            'subsector_name': 'Coal',
        },
    },
    # Wrong geo component
    {
        'data_source': {'name': 'GCP'},
        'geo_component': {
            'type': 'country',
            'identifier': {'id': 'FRRRRRRRRRRRRRRRRRR', 'type': 'alpha-3'},
        },
        'date': '1949-01-01',
        'emission': {
            'gas': 'CO2',
            'value': 0.0,
            'unit': 'MtC',
            'sector_name': 'Territorial Emissions',
            'subsector_name': 'Oil',
        },
    },
    # Wrong data source
    {
        'data_source': {'name': 'OKF'},
        'geo_component': {
            'type': 'country',
            'identifier': {'id': 'AFG', 'type': 'alpha-3'},
        },
        'date': '1949-01-01',
        'emission': {
            'gas': 'CO2',
            'value': 0.0,
            'unit': 'MtC',
            'sector_name': 'Territorial Emissions',
            'subsector_name': 'Gas',
        },
    },
]
 

In [41]:
# Only the first record resolves against the referentials; the other two are
# expected to be rejected and logged as errors without stopping the batch.
insert_emissions(KO_emissions)

  0%|          | 0/3 [00:00<?, ?it/s]

2021-05-12 19:44:58,893 - test_emissions_integration - ERROR - Failed to insert emission: geo_component not found: {'type': 'country', 'identifier': {'id': 'FRRRRRRRRRRRRRRRRRR', 'type': 'alpha-3'}}
2021-05-12 19:44:58,894 - test_emissions_integration - ERROR - Failed to insert emission: data_source not found: {'name': 'OKF'}


100%|██████████| 3/3 [00:00<00:00, 653.59it/s]

2021-05-12 19:44:58,895 - test_emissions_integration - INFO - Succesfully inserted 1 emissions.





## Collection exploration


All emissions:

In [42]:
# Preview only: thousands of documents were inserted above, so cap what is
# fetched and displayed instead of materialising the whole collection.
list(emissions_collection.find({}).limit(5))

[{'_id': ObjectId('609c1414549ec4de45ae6a38'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.004,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Coal'},
 {'_id': ObjectId('609c1414549ec4de45ae6a39'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.0,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Oil'},
 {'_id': ObjectId('609c1414549ec4de45ae6a3a'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.0,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Gas'},
 {'_id': ObjectId('609c1414549ec4de45ae6a3b'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62

Emissions on 1949-01-01

In [43]:
# Dates are stored as 'YYYY-MM-DD' strings, hence the exact string match.
# Preview only: cap the number of documents fetched and displayed.
list(emissions_collection.find({'date': '1949-01-01'}).limit(5))

[{'_id': ObjectId('609c1414549ec4de45ae6a38'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.004,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Coal'},
 {'_id': ObjectId('609c1414549ec4de45ae6a39'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.0,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Oil'},
 {'_id': ObjectId('609c1414549ec4de45ae6a3a'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.0,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Gas'},
 {'_id': ObjectId('609c1414549ec4de45ae6a3b'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62

Afghanistan's emissions

In [44]:
# Resolve Afghanistan's referential id from its alpha-3 code...
geo_component = {'type': 'country', 'identifier': {'id': 'AFG', 'type': 'alpha-3'}}
afg_id = find_geo_component(geo_component, ref_geo_components)

# ...then fetch every emission attached to that id.
afg_cursor_filter = {'geo_component_id': afg_id}
afg_emissions = [document for document in emissions_collection.find(afg_cursor_filter)]

In [45]:
# Displays every document fetched for Afghanistan in the previous cell
afg_emissions

[{'_id': ObjectId('609c1414549ec4de45ae6a38'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.004,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Coal'},
 {'_id': ObjectId('609c1414549ec4de45ae6a39'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.0,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Oil'},
 {'_id': ObjectId('609c1414549ec4de45ae6a3a'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'data_source_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'date': '1949-01-01',
  'gas': 'CO2',
  'value': 0.0,
  'unit': 'MtC',
  'sector': 'Territorial Emissions',
  'subsector': 'Gas'},
 {'_id': ObjectId('609c1414549ec4de45ae6a3b'),
  'geo_component_id': ObjectId('609c10bf3814bdc62fbb9b62

In [25]:
# Number of emission documents fetched for Afghanistan
print(len(afg_emissions))

356
