In [None]:
import pymongo
import pandas as pd
import os
import json
from tqdm import tqdm
import logging

# Notebook-wide logger; records at INFO level and above are let through by it.
logger = logging.getLogger('test_emissions_integration')
logger.setLevel(logging.INFO)

In [None]:
# NOTE(review): `catalog` is never defined or imported in this notebook, so this
# cell raises NameError on a plain kernel. Presumably it is injected by the
# project's notebook tooling (e.g. a Kedro session) — confirm, or remove the cell.
catalog.list()

## MongoDB / Collection parameters

In [None]:
# Connection string of the MongoDB server used by this notebook
MONGO_URI = "mongodb://localhost:27017/"

# Database and collection names
DATABASE_NAME = 'ogs'
GEO_COMPONENTS_COLLECTION_NAME = 'geo_components'
DATA_SOURCES_COLLECTION_NAME = 'data_sources'
EMISSIONS_COLLECTION_NAME = 'emissions'

# Client and database handle shared by every cell below
client = pymongo.MongoClient(MONGO_URI)
db = client.get_database(DATABASE_NAME)

In [None]:
# Collection handles: the two referentials, then the emissions being written
geo_components_collection = db.get_collection(GEO_COMPONENTS_COLLECTION_NAME)
data_sources_collection = db.get_collection(DATA_SOURCES_COLLECTION_NAME)
emissions_collection = db.get_collection(EMISSIONS_COLLECTION_NAME)

In [None]:
# Uncomment to drop the emissions collection (destructive) before re-running the inserts below
# db.drop_collection(EMISSIONS_COLLECTION_NAME)

## Helper functions

In [None]:
def get_geo_components():
    """
    Return every document of the geo_components collection as a list
    """
    cursor = geo_components_collection.find({})
    return list(cursor)

def get_data_sources():
    """
    Return every document of the data_sources collection as a list
    """
    cursor = data_sources_collection.find({})
    return list(cursor)

def cast_json(line):
    """
    Parse one line of text as JSON

    Returns the decoded value, or an empty dict when the line cannot be
    parsed. A parsing failure is logged together with the offending line
    and is never raised to the caller.
    """
    try:
        return json.loads(line)
    # ValueError covers json.JSONDecodeError (and undecodable bytes input),
    # TypeError covers input that is not str/bytes. Anything else, such as
    # KeyboardInterrupt, is left to propagate.
    except (ValueError, TypeError):
        # `line` is passed as a logging argument, so the message needs a
        # %s placeholder for it to be rendered
        logging.error('Failed to parse to json: %s', line)
        return {}

## Core functions

In [None]:
def find_geo_component(geo_component, ref_geo_components):
    """
    Look a geo_component up in the referential.

    The match is made on the geo_component's identifier: a referential entry
    matches when it knows the identifier type and holds the same id for it.
    Returns the `_id` of the first matching entry, None if there is none.
    """
    identifier = geo_component['identifier']
    id_kind = identifier['type']

    matching_ids = (
        candidate['_id']
        for candidate in ref_geo_components
        # Entries that do not know this identifier type are skipped
        if id_kind in candidate['identifiers']
        and candidate['identifiers'][id_kind] == identifier['id']
    )

    return next(matching_ids, None)


def find_data_source(data_source, ref_data_sources):
    """
    Look a data_source up in the referential by name.

    Returns the `_id` of the first referential entry carrying the same
    name, None if there is none.
    """
    matching_ids = (
        candidate['_id']
        for candidate in ref_data_sources
        if data_source['name'] == candidate['name']
    )

    return next(matching_ids, None)

In [None]:
class InsertEmissionError(Exception):
    """
    Raised when an emission cannot be inserted because its geo_component
    or its data_source is not found in the referentials
    """


def insert_emission(emission, ref_geo_components, ref_data_sources):
    """
    Given an emission and referentials, try to insert the emission

    The emission's geo_component and data_source are resolved to their
    referential ids, then a flat document is written to the emissions
    collection with insert_one.

    Raises InsertEmissionError when the geo_component or the data_source is
    not found in the referentials; nothing is inserted in that case.
    """
    # Get geo_component id
    geo_component_id = find_geo_component(
        geo_component=emission['geo_component'],
        ref_geo_components=ref_geo_components
    )

    # "Not found" is signalled with None; test for None explicitly so that a
    # falsy but valid id (e.g. 0) is not mistaken for a miss
    if geo_component_id is None:
        raise InsertEmissionError('geo_component not found: %s' % emission['geo_component'])

    # Get data_source id
    data_source_id = find_data_source(
        data_source=emission['data_source'],
        ref_data_sources=ref_data_sources
    )

    if data_source_id is None:
        raise InsertEmissionError('data_source not found: %s' % emission['data_source'])

    # Format document: referential ids plus the flattened 'emission' sub-document
    measure = emission['emission']
    emission_document = {
        'geo_component_id': geo_component_id,
        'data_source_id': data_source_id,
        'date': emission['date'],
        'gas': measure['gas'],
        'value': measure['value'],
        'unit': measure['unit'],
        'sector': measure['sector_name'],
        'subsector': measure['subsector_name']
    }

    emissions_collection.insert_one(emission_document)
    

def insert_emissions(emissions):
    """
    Given a list of emissions, insert them into the collection

    The referentials are loaded once, then each emission is inserted in turn.
    An emission that cannot be inserted (unknown geo_component or data_source,
    or a malformed record) is logged and counted as failed instead of being
    raised, so one bad record does not abort the rest of the batch. Other
    exceptions, such as errors raised by the database driver, are not caught.
    """
    ref_geo_components = get_geo_components()
    ref_data_sources = get_data_sources()

    nb_inserted = 0
    nb_failed = 0

    for emission in tqdm(emissions):
        try:
            insert_emission(emission, ref_geo_components, ref_data_sources)
            nb_inserted += 1
        except InsertEmissionError as e:
            logger.error('Failed to insert emission: %s' % e)
            nb_failed += 1
        except (KeyError, TypeError) as e:
            # Malformed record, e.g. the empty dict cast_json returns for a
            # line it could not parse: a required key is missing
            logger.error('Failed to insert emission: malformed record %r (%r)', emission, e)
            nb_failed += 1

    logger.info('Succesfully inserted %s emissions.' % nb_inserted)
    if nb_failed:
        logger.warning('Failed to insert %s emissions. See logs for details.' % nb_failed)

## Testing

### Try to find geo_components

In [None]:
# Load the referential once; later cells reuse `ref_geo_components`
ref_geo_components = get_geo_components()

# Look a country up by its alpha-3 identifier
geo_component = {
    'type': 'country',
    'identifier': {
        'id': 'AFG', 
        'type': 'alpha-3'
    }
}

# Cell output is the matching referential _id (nothing is shown when the result is None)
find_geo_component(geo_component, ref_geo_components)

In [None]:
# 'FR' is declared as an alpha-3 identifier here — presumably a deliberate
# mismatch to exercise the not-found path (TODO confirm against the referential)
geo_component = {
    'type': 'country',
    'identifier': {
        'id': 'FR', 
        'type': 'alpha-3'
    }
}

# print() is used because a bare None result would not be displayed by the notebook
print(find_geo_component(geo_component, ref_geo_components))

### Insert some emissions (GCP)

In [None]:
# Read gcp file: each non-blank line is parsed as one JSON document.
# The encoding is explicit so decoding does not depend on the platform default.
with open('./../data/01_raw/gcp-carbon-v2.json', 'r', encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]

# Transform to json, dropping the lines that failed to parse
# (cast_json logs them and returns an empty dict, which is not a usable emission)
emissions = [
    emission for emission in map(cast_json, lines) if emission
]

In [None]:
# Preview the first parsed records
emissions[:5]

In [None]:
# Insert first 5000 emissions
# NOTE(review): not idempotent — insert_emission writes with insert_one and no
# duplicate check is visible, so re-running this cell inserts the same documents again.
insert_emissions(emissions[:5000])

### KO emissions (records expected to fail)

In [None]:
# Define some wrong emissions

# One valid record followed by two records that must be rejected
KO_emissions = [
    # OK
    {
        'data_source': {'name': 'GCP'},
        'geo_component': {
            'type': 'country',
            'identifier': {'id': 'AFG', 'type': 'alpha-3'},
        },
        'date': '1949-01-01',
        'emission': {
            'gas': 'CO2',
            'value': 0.004,
            'unit': 'MtC',
            'sector_name': 'Territorial Emissions',
            'subsector_name': 'Coal',
        },
    },
    # Wrong geo component
    {
        'data_source': {'name': 'GCP'},
        'geo_component': {
            'type': 'country',
            'identifier': {'id': 'FRRRRRRRRRRRRRRRRRR', 'type': 'alpha-3'},
        },
        'date': '1949-01-01',
        'emission': {
            'gas': 'CO2',
            'value': 0.0,
            'unit': 'MtC',
            'sector_name': 'Territorial Emissions',
            'subsector_name': 'Oil',
        },
    },
    # Wrong data source
    {
        'data_source': {'name': 'OKF'},
        'geo_component': {
            'type': 'country',
            'identifier': {'id': 'AFG', 'type': 'alpha-3'},
        },
        'date': '1949-01-01',
        'emission': {
            'gas': 'CO2',
            'value': 0.0,
            'unit': 'MtC',
            'sector_name': 'Territorial Emissions',
            'subsector_name': 'Gas',
        },
    },
]
 

In [None]:
# Per the comments on KO_emissions, the expected outcome is 1 inserted and 2 failed
# (assumes the referentials contain AFG/alpha-3 and the GCP data source — TODO confirm)
insert_emissions(KO_emissions)

## Collection exploration


All emissions:

In [None]:
# NOTE(review): materialises and displays the whole collection, which can be
# several thousand documents after the inserts above — consider bounding the
# cursor (e.g. with .limit(n)) before sharing the notebook.
list(emissions_collection.find({}))

Emissions on 1949-01-01

In [None]:
# Exact match on the stored 'date' value; insert_emission stores the date as
# provided in the input record (a string in the KO_emissions examples)
list(emissions_collection.find({'date': '1949-01-01'}))

Afghanistan's emissions

In [None]:
# Same alpha-3 look-up as in the 'Try to find geo_components' section
geo_component = {
    'type': 'country',
    'identifier': {
        'id': 'AFG', 
        'type': 'alpha-3'
    }
}

# Relies on `ref_geo_components` loaded in an earlier cell
afg_id = find_geo_component(geo_component, ref_geo_components)

# Emissions documents reference their geo component through 'geo_component_id'
afg_emissions = list(emissions_collection.find({'geo_component_id': afg_id}))

In [None]:
# Display the documents fetched in the previous cell
afg_emissions

In [None]:
# Number of emission documents found for the AFG geo component
print(len(afg_emissions))