In [1]:
# Kedro line magic. Per the log it emits, it defines the global variables
# `context`, `session` and `catalog` used by the cells below.
%reload_kedro

2021-05-12 19:30:15,277 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2021-05-12 19:30:15,326 - root - INFO - ** Kedro project OGS Connectors
2021-05-12 19:30:15,327 - root - INFO - Defined global variable `context`, `session` and `catalog`
2021-05-12 19:30:15,331 - root - INFO - Registered line magic `run_viz`


In [2]:
import pymongo
import pandas as pd
import os
import json

In [3]:
# Show the names of the datasets registered in the Kedro catalog.
catalog.list()

['data_sources',
 'ademe_assessments',
 'ademe_emissions',
 'ademe_legal_units',
 'ademe_scope_items',
 'ademe_texts',
 'ademe_merged',
 'gcp',
 'parameters']

In [9]:
# Names of the MongoDB database and of the two collections that the cells
# below drop and recreate.
DATABASE_NAME = 'ogs'
DATA_SOURCES_COLLECTION_NAME = 'data_sources'
GEO_COMPONENTS_COLLECTION_NAME = 'geo_components'

# Connect to the MongoDB server on localhost and keep a handle to the database.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client[DATABASE_NAME]

In [10]:
def cast_json(line):
    """Parse one line of JSON text, returning ``{}`` when it cannot be parsed.

    Args:
        line: A JSON document as ``str`` (``json.loads`` also accepts
            ``bytes`` and ``bytearray``).

    Returns:
        The decoded JSON value, or an empty dict if ``line`` is not valid
        JSON or is not a type ``json.loads`` accepts. On failure the
        offending line is printed so it shows up in the cell output.
    """
    try:
        return json.loads(line)
    # ValueError covers json.JSONDecodeError (and UnicodeDecodeError for bytes
    # input); TypeError covers non-text input such as None. A bare ``except``
    # would also swallow KeyboardInterrupt / SystemExit.
    except (ValueError, TypeError):
        print('failed to parse to json:', line)
        return {}

# geo_components

Delete the existing `geo_components` collection and recreate it empty.

In [11]:
# Destructive reset: drop any existing collection, recreate it, then keep a
# handle to the new collection for the inserts and queries below.
db.drop_collection(GEO_COMPONENTS_COLLECTION_NAME)
db.create_collection(GEO_COMPONENTS_COLLECTION_NAME)
geo_components_collection = db[GEO_COMPONENTS_COLLECTION_NAME]

In [12]:
## Create some geo_components from GCP data
def get_sample_countries_geo_components(gcp_path='./../data/01_raw/gcp-carbon-v2.json'):
    """Build one 'Country' geo_component document per distinct id in the GCP file.

    The file is read one JSON document per line, each parsed with
    ``cast_json``. Blank lines, lines that ``cast_json`` could not parse (it
    returns ``{}`` for those) and records without a ``geo_component`` key are
    skipped instead of raising ``KeyError``.

    Args:
        gcp_path: Path of the GCP file to read. Defaults to the location this
            notebook originally hard-coded.

    Returns:
        A list of dicts of the form
        ``{'type': 'Country', 'identifiers': {'alpha-3': <id>}}``, one per
        distinct ``geo_component['identifier']['id']`` value, in order of
        first appearance in the file.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a ``geo_component`` lacks ``identifier`` or ``id``.
    """
    # Stream the file instead of materialising every line first.
    with open(gcp_path, 'r') as f:
        records = [cast_json(line) for line in f if line.strip()]

    # Keep only records that actually carry a geo_component.
    country_ids = [
        record['geo_component']['identifier']['id']
        for record in records
        if isinstance(record, dict) and 'geo_component' in record
    ]

    # dict.fromkeys de-duplicates while preserving first-appearance order.
    distinct_ids = dict.fromkeys(country_ids)

    return [
        {
            'type': 'Country',
            'identifiers': {
                'alpha-3': country_id
            }
        }
        for country_id in distinct_ids
    ]

In [13]:
# Build the country documents from the GCP file.
countries = get_sample_countries_geo_components()

# Bulk-insert them into the freshly recreated geo_components collection.
geo_components_collection.insert_many(countries)

<pymongo.results.InsertManyResult at 0x1068af100>

# data_sources

In [14]:
# Destructive reset: drop any existing collection, recreate it, then keep a
# handle to the new collection for the inserts and queries below.
db.drop_collection(DATA_SOURCES_COLLECTION_NAME)
db.create_collection(DATA_SOURCES_COLLECTION_NAME)
data_sources_collection = db[DATA_SOURCES_COLLECTION_NAME]

In [15]:
# Load the raw text of the `data_sources` catalog entry (one JSON document per line).
data_sources_text = catalog.load('data_sources')

# Parse every non-blank line. A trailing newline in the text would otherwise
# yield an empty string, which cast_json turns into an empty `{}` document.
parsed_data_sources = (
    cast_json(line) for line in data_sources_text.split('\n') if line.strip()
)

# Drop the empty results cast_json returns for lines it could not parse, so
# no empty document reaches the database insert.
data_sources = [data_source for data_source in parsed_data_sources if data_source]

2021-05-12 19:30:41,550 - kedro.io.data_catalog - INFO - Loading data from `data_sources` (TextDataSet)...


In [16]:
# Inspect the parsed documents before inserting them.
data_sources

[{'name': 'GCP', 'url': 'https://www.globalcarbonproject.org'},
 {'name': 'ADEME', 'url': 'https://www.ademe.fr'},
 {'name': 'WRI', 'url': 'https://www.wri.org'}]

In [17]:
# Bulk-insert the parsed documents into the freshly recreated data_sources collection.
data_sources_collection.insert_many(data_sources)

<pymongo.results.InsertManyResult at 0x133dcffc0>

# Exploration

In [18]:
# Read back every document of the geo_components collection (the empty filter
# matches all documents) to check what was inserted.
list(geo_components_collection.find({}))

[{'_id': ObjectId('609c10bf3814bdc62fbb9b62'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'AFG'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b63'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'ALB'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b64'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'DZA'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b65'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'AND'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b66'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'AGO'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b67'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'AIA'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b68'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'ATA'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b69'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'ATG'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b6a'),
  'type': 'Country',
  'identifiers': {'alpha-3': 'ARG'}},
 {'_id': ObjectId('609c10bf3814bdc62fbb9b6b'),

In [19]:
# Read back every document of the data_sources collection (the empty filter
# matches all documents) to check what was inserted.
list(data_sources_collection.find({}))

[{'_id': ObjectId('609c10c33814bdc62fbb9c3e'),
  'name': 'GCP',
  'url': 'https://www.globalcarbonproject.org'},
 {'_id': ObjectId('609c10c33814bdc62fbb9c3f'),
  'name': 'ADEME',
  'url': 'https://www.ademe.fr'},
 {'_id': ObjectId('609c10c33814bdc62fbb9c40'),
  'name': 'WRI',
  'url': 'https://www.wri.org'}]