In [1]:
%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
import pandas as pd
import datetime as dt
import gmaps
import gmaps.geojson_geometries

import sqlalchemy as sqlalchemy_package
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import relationship
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.declarative import DeferredReflection
from sqlalchemy import *

import requests
import pickle
import config as creds
from urllib.parse import urlencode, urlparse, parse_qsl

from my_func import GoogleMapClient
# Import my GoogleMap object with following methods:
        # extract_lat_lng(location)
        # search(keyword, radius = 1000, location=None)
        # detail(place_id, fields=["name", "rating", "formatt

In [2]:
# Set up a connection to the postgres server
class postgre_sql():
    def __init__(self, dbase = creds.PGDATABASE, dbschema = 'countries'):
        DATABASE_URL = f"postgres://{creds.PGUSER}:{creds.PGPASSWORD}@{creds.PGHOST}:5433/{dbase}"
        engine = sqlalchemy_package.create_engine(DATABASE_URL,
                connect_args={'options': '-csearch_path={}'.format(dbschema)})
        self.engine = engine
    def declare(self):
        # Declare a Base with explicit schema using `automap_base()`
        Base = declarative_base()
        return Base

# US Energy Information Administration API Query Browser, Open Data Source
class EIA_Client():
    api_key=None
    def __init__(self, api_key=creds.eai_key, data='category', *args, **kwargs):
        super().__init__(*args, **kwargs)
        if api_key == None:
            raise Exception('Api key is required')
        self.api_key = api_key
        self. eia_url = f"http://api.eia.gov/{data}/"
    def category(self, cat = 1293027):
        params = {'api_key': self.api_key, 'category_id':cat}
        params_url = urlencode(params)
        url = f"{self.eia_url}?{params_url}"
        response = requests.get(url).json()
        return response
    def series(self, ser = 'INTL.55-1-AFG-TBPD.M'):
        params = {'api_key': self.api_key, 'series_id':ser}
        params_url = urlencode(params)
        url = f"{self.eia_url}?{params_url}"
        response = requests.get(url).json()
        return response

In [3]:
# Creating objects
countries_db = postgre_sql()
base_db = countries_db.declare()
eia_client = EIA_Client()

## Clean, Transform and Normalization of Countries Data from EIA

In [5]:
remove_list = ['Africa', 'Asia & Oceania', 'Central & South America', 'European Union', 'Eurasia', 'Europe', 'IEA',
              'Middle East', 'North America', 'Non-OECD', 'OECD - Asia And Oceania', 'OECD', 'OECD - Europe', 'OECD - North America',
               'OPEC - Africa', 'OPEC', 'Non-OPEC', 'OPEC - South America', 'Persian Gulf', 'Former Serbia and Montenegro', 'Former U.S.S.R.',
               'World', 'Australia and New Zealand', 'IEO - Africa', 'Mexico and Chile', 'IEO - Middle East', 'IEO OECD - Europe',
               'Other Non-OECD - America', 'Other Non-OECD - Asia', 'Other Non-OECD - Europe and Eurasia']
rename_dict = {'Aruba': 'Netherlands', 'Bermuda': 'United Kingdom', 'Cote d?Ivoire': 'Ivory Coast', 'Congo-Kinshasa': 'Democratic Republic of the Congo',
               'Congo-Brazzaville': 'Republic of Congo', 'Cook Islands': 'New Zealand', 'Cabo Verde':'Guinea Bissau', 'Gibraltar':'United Kingdom',
               'Guadeloupe':'France', 'Guinea-Bissau':'Guinea Bissau', 'French Guiana':'France', 'Hawaiian Trade Zone': 'United States of America',
               'Hong Kong':'Hong Kong S.A.R.', 'Macau':'China', 'Maldives':'United Kingdom', 'North Macedonia':'Macedonia', 'Burma':'Myanmar',
               'Montserrat':'United Kingdom', 'Martinique':'France', 'Netherlands Antilles':'Netherlands', 'Nauru':'Australia', 
               'Palestinian Territories':'Palestine','Reunion':'France', 'Serbia':'Republic of Serbia', 'Eswatini':'Swaziland',
               'Seychelles':'United Kingdom', 'Turks and Caicos Islands':'United Kingdom', 'Timor-Leste':'United Kingdom',
               'Tanzania':'United Republic of Tanzania', 'U.S. Virgin Islands':'United States Virgin Islands', 'United States':'United States of America',
               'U.S. Pacific Islands':'United States of America', 'Saint Vincent/Grenadines':'Saint Vincent and the Grenadines', 'British Virgin Islands':'United Kingdom',
               'Wake Island':'United States of America'
                  }
norm_countries = pickle.load(open('../outputs/norm_countries.pkl', 'rb'))

In [6]:
# Cleaning records for countries considering remove_list, rename_dict
def clean_data(data_list, remove_list=remove_list, rename_dict=rename_dict):
    clean_data = []
    for record in data_list:
        if record['code'] in rename_dict.keys():
            record['code'] = rename_dict[record['code']]
        if record['code'] not in remove_list:
            clean_data.append(record)
    return clean_data

# Normalize Data Country_Code
def norm_data(data_list):
    for record in data_list:
        for row in norm_countries:
            if record['code'] == row['country']:
                record['code'] = row['country_code']
    return data_list

#### Creating Countries List

In [7]:
# Transform Countries Data from CSV
countries_df = pd.read_csv('../countries/data/en/countries.csv')
countries =[]
for result in countries_df.itertuples():
    row = {}
    row['country_code'] = result.alpha3.upper()
    row['country'] = result.name
    countries.append(row)
print('List of official names of 206 world countries:\n', [row['country'] for row in countries])

List of official names of 206 world countries:
 ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Congo, Democratic Republic of the', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Hond

In [8]:
# List of 217 World Countries in gmaps built in geometry JSON file
countries_geojson = gmaps.geojson_geometries.load_geometry('countries')
gmaps_list = []
for feature in countries_geojson['features']:
    gmaps_list.append(feature['properties']['name'])
len(gmaps_list)

217

In [9]:
# Intersection between countries list (gmaps and CSV countries)
world_list = []
for row in countries:
    world_list.append(row['country'])
len([x for x in gmaps_list if x not in world_list])

56

#### Extracting Oil, NGPL and other Liquids Production Data by Countries

In [10]:
# Oil Production Data by Countries
eia_client.__init__(data='category')
production_json = eia_client.category(cat=2134947)
production_list = production_json['category']['childseries']
production_data = []
eia_client.__init__(data='series')
for record in production_list:
    series_id, series_name, series_units = record.get('series_id'), record.get('name'), record.get('units')
    series_name = series_name.split(", ", 5)
    if (series_name[4] == 'Monthly') & (series_name[3] not in [row['country'] for row in production_data]):
        series_json = eia_client.series(ser = series_id)
        series_data = series_json['series'][0]['data']
        for ser in series_data:
            row = {}
            row['code'] = series_name[3]
            row['country'] = series_name[3]
            row['year'] = int(str(ser[0])[:4])
            row['month'] = int(str(ser[0])[-2:])
            row['oil_prod'] = [0 if isinstance(ser[1], str) else ser[1]][0] 
            production_data.append(row)
        num_of_records = len(production_data)
        if num_of_records % 100 == 0: 
            print('Processing records: appended ', num_of_records,' of countries records')

Processing records: appended  3300  of countries records
Processing records: appended  6600  of countries records
Processing records: appended  9900  of countries records
Processing records: appended  13200  of countries records
Processing records: appended  16500  of countries records
Processing records: appended  19800  of countries records
Processing records: appended  23100  of countries records
Processing records: appended  26400  of countries records


In [11]:
# Check inconsistency in countries with monthly production data
not_in_list = []
for id in production_data:
    if (id['country'] not in [row['country'] for row in countries]) and (id['country'] not in not_in_list):
        not_in_list.append(id['country'])
print('Number of inconsistent records: ', len(not_in_list))
print('\nList of records, which are not consistent with the country list:\n', [not_in_list[i].title() for i in range(len(not_in_list))])

Number of inconsistent records:  84

List of records, which are not consistent with the country list:
 ['Aruba', 'Africa', 'American Samoa', 'Asia & Oceania', 'Antarctica', 'The Bahamas', 'Bermuda', 'Bolivia', 'Brunei', 'Cote D?Ivoire', 'Congo-Kinshasa', 'Congo-Brazzaville', 'Cook Islands', 'Central & South America', 'Cayman Islands', 'Czech Republic', 'Western Sahara', 'European Union', 'Eurasia', 'Europe', 'Falkland Islands', 'Faroe Islands', 'United Kingdom', 'Gibraltar', 'Guadeloupe', 'Greenland', 'French Guiana', 'Guam', 'Hawaiian Trade Zone', 'Hong Kong', 'Iran', 'South Korea', 'Laos', 'Macau', 'Moldova', 'Middle East', 'Burma', 'Montserrat', 'Martinique', 'New Caledonia', 'Niue', 'Netherlands Antilles', 'North America', 'Non-Oecd', 'Oecd - Asia And Oceania', 'Oecd', 'Oecd - Europe', 'Oecd - North America', 'Opec - Africa', 'Opec', 'Non-Opec', 'Opec - South America', 'Persian Gulf', 'Puerto Rico', 'North Korea', 'Palestinian Territories', 'French Polynesia', 'Reunion', 'Russia', 

In [12]:
# Check inconsistency in countries with monthly production data
not_in_list = []
for id in production_data:
    if (id['country'] not in gmaps_list) and (id['country'] not in not_in_list):
        not_in_list.append(id['country'])
print('Number of inconsistent records: ', len(not_in_list))
print('\nList of records, which are not consistent with the country list:\n', not_in_list)

Number of inconsistent records:  64

List of records, which are not consistent with the country list:
 ['Aruba', 'Africa', 'Asia & Oceania', 'Bermuda', 'Cote d?Ivoire', 'Congo-Kinshasa', 'Congo-Brazzaville', 'Cook Islands', 'Cabo Verde', 'Central & South America', 'European Union', 'Eurasia', 'Europe', 'Gibraltar', 'Guadeloupe', 'Guinea-Bissau', 'French Guiana', 'Hawaiian Trade Zone', 'Hong Kong', 'Macau', 'Maldives', 'Middle East', 'North Macedonia', 'Burma', 'Montserrat', 'Martinique', 'Netherlands Antilles', 'North America', 'Non-OECD', 'Nauru', 'OECD - Asia And Oceania', 'OECD', 'OECD - Europe', 'OECD - North America', 'OPEC - Africa', 'OPEC', 'Non-OPEC', 'OPEC - South America', 'Persian Gulf', 'Palestinian Territories', 'Reunion', 'Former Serbia and Montenegro', 'Serbia', 'Former U.S.S.R.', 'Eswatini', 'Seychelles', 'Turks and Caicos Islands', 'Timor-Leste', 'Tanzania', 'United States', 'U.S. Pacific Islands', 'Saint Vincent/Grenadines', 'British Virgin Islands', 'U.S. Virgin Isla

In [13]:
# Cleaning Data
clean_production_data = clean_data(production_data, remove_list, rename_dict)
# Adding country code in production data
norm_production_data = norm_data(clean_production_data)
len(norm_production_data)

71724

In [14]:
production_data[1000]

{'code': 'AGO',
 'country': 'Angola',
 'year': 2019,
 'month': 8,
 'oil_prod': 1519.241}

In [15]:
# Check inconsistency in countries with monthly production data
not_in_list = []
for id in norm_production_data:
    if (id['code'] not in [row['country_code'] for row in norm_countries]) and (id['code'] not in not_in_list):
        not_in_list.append(id['country'])
print('Number of inconsistent records: ', len(not_in_list))
print('\nList of records, which are not consistent with the country list:\n', not_in_list)

Number of inconsistent records:  0

List of records, which are not consistent with the country list:
 []


In [16]:
# Calculate the number of countries in normalized production data
eia_countries_list = []
for id in norm_production_data:
    if (id['country'] not in eia_countries_list):
        eia_countries_list.append(id['country'])
len(eia_countries_list)

218

In [17]:
# Intersection between countries list (gmaps and CSV countries)
not_eia_list = []
for row in gmaps_list:
    if row not in eia_countries_list:
        not_eia_list.append(row)
len(not_eia_list)

34

In [17]:
"""
norm_countries =[]
for country in gmaps_list:
    row = {}
    for record in countries:
        if country == record['country']:
            row['country_code'] = record['country_code']
            row['country'] = country
            norm_countries.append(row)
    if country not in world_list:
        code = input(f'Input the country code for {country}: ')
        row['country_code'] = code
        row['country'] = country
        norm_countries.append(row)
len(norm_countries)
"""
"""
for record in norm_countries:
    if record['country'] == 'Somaliland': record['country_code'] = 'SOL'
with open('outputs/norm_countries.pkl', 'wb') as f:
    pickle.dump(norm_countries, f)
"""

"\nfor record in norm_countries:\n    if record['country'] == 'Somaliland': record['country_code'] = 'SOL'\nwith open('outputs/norm_countries.pkl', 'wb') as f:\n    pickle.dump(norm_countries, f)\n"

#### Extracting Petroleum Production Data by Countries

In [18]:
# Refined Petroleum Products Data by Countries (category list by API request)
eia_client.__init__(data='category')
petrol_json = eia_client.category(cat=2134915)
petrol_list = petrol_json['category']['childseries']
# Series list by API request
eia_client.__init__(data='series')
petrol_data = []
for record in petrol_list:
    series_id, series_name, series_units = record.get('series_id'), record.get('name'), record.get('units')
    series_name = series_name.split(", ", 3)
    if (series_name[2] == 'Monthly') & (series_name[1] not in [row['country'] for row in petrol_data]):
        series_json = eia_client.series(ser = series_id)
        series_data = series_json['series'][0]['data']
        for ser in series_data:
            row = {}
            row['code'] = series_name[1]
            row['country'] = series_name[1]
            row['year'] = int(str(ser[0])[:4])
            row['month'] = int(str(ser[0])[-2:])
            row['petrol_prod'] = [0 if isinstance(ser[1], str) else ser[1]][0] 
            petrol_data.append(row)
        num_of_records = len(petrol_data)
        if num_of_records % 100 == 0: 
            print('Processing records: appended ', num_of_records,' of countries records')
            
print(len(petrol_data))

Processing records: appended  3300  of countries records
Processing records: appended  6600  of countries records
Processing records: appended  9900  of countries records
Processing records: appended  13200  of countries records
Processing records: appended  16500  of countries records
Processing records: appended  19800  of countries records
Processing records: appended  23100  of countries records
Processing records: appended  26400  of countries records
80976


In [19]:
# Cleaning Data
clean_petrol_data = clean_data(petrol_data, remove_list, rename_dict)
# Adding country code in production data
norm_petrol_data = norm_data(clean_petrol_data)
len(norm_petrol_data)

71724

## Define Schema and Tables for Countries Data

In [20]:
# Creating Schema Countries
class Countries(base_db):
    __tablename__ = "countries"
    country_code = Column(String, primary_key=True)
    country = Column(String)
    countries_oil = relationship('Oil_Production', backref = 'countries')
    countries_petrol = relationship('Petrol_Production', backref = 'countries')

class Oil_Production(base_db):
    __tablename__ = "oil_production"
    __table_args__ = {'schema': 'countries'}
    id = Column(Integer, Sequence('user_id_seq'), primary_key=True)
    code = Column(String, ForeignKey('countries.country_code'))
    country = Column(String)
    year = Column(Integer)
    month = Column(Integer)
    oil_prod = Column(Integer)
    
class Petrol_Production(base_db):
    __tablename__ = "petrol_production"
    __table_args__ = {'schema': 'countries'}
    id = Column(Integer, Sequence('user_id_seq'), primary_key=True)
    code = Column(String, ForeignKey('countries.country_code'))
    country = Column(String)
    year = Column(Integer)
    month = Column(Integer)
    petrol_prod = Column(Integer)

In [21]:
Countries.__table__.create(bind=countries_db.engine, checkfirst=True)
Oil_Production.__table__.create(bind=countries_db.engine, checkfirst=True)
Petrol_Production.__table__.create(bind=countries_db.engine, checkfirst=True)

## Load Data into the Database via Session API

In [22]:
Session = sessionmaker(bind=countries_db.engine)
session = Session()

# Uploading countries
for country in norm_countries:
    row = Countries(**country)
    session.add(row)

# Uploading production
for record in norm_petrol_data:
    row = Petrol_Production(**record)
    session.add(row)
    
# Uploading production
for record in norm_production_data:
    row = Oil_Production(**record)
    session.add(row)

session.commit()

#### Extracting Crude Oil Production Data by Countries

In [23]:
# Crude Oil Data by Countries (category list by API request)
eia_client.__init__(data='category')
crude_json = eia_client.category(cat=2134979)
crude_list = crude_json['category']['childseries']

In [24]:
# Series list by API request
eia_client.__init__(data='series')
crude_data = []
for record in crude_list:
    series_id, series_name, series_units = record.get('series_id'), record.get('name'), record.get('units')
    series_name = series_name.split(", ", 3)
    if (series_name[2] == 'Monthly') & (series_name[1] not in [row['country'] for row in crude_data]):
        series_json = eia_client.series(ser = series_id)
        series_data = series_json['series'][0]['data']
        for ser in series_data:
            row = {}
            row['code'] = series_name[1]
            row['country'] = series_name[1]
            row['year'] = int(str(ser[0])[:4])
            row['month'] = int(str(ser[0])[-2:])
            row['crude_prod'] = [0 if isinstance(ser[1], str) else ser[1]][0] 
            crude_data.append(row)
        num_of_records = len(crude_data)
        if num_of_records % 100 == 0: 
            print('Processing records: appended ', num_of_records,' of countries records')
            
print(len(crude_data))

Processing records: appended  5100  of countries records
Processing records: appended  8400  of countries records
Processing records: appended  11700  of countries records
Processing records: appended  12600  of countries records
Processing records: appended  13500  of countries records
Processing records: appended  17700  of countries records
Processing records: appended  33000  of countries records
Processing records: appended  38100  of countries records
Processing records: appended  43200  of countries records
Processing records: appended  47400  of countries records
Processing records: appended  50700  of countries records
Processing records: appended  55800  of countries records
85428


In [25]:
# Cleaning Data
clean_crude_data = clean_data(crude_data, remove_list, rename_dict)
# Adding country code in production data
norm_crude_data = norm_data(clean_crude_data)
len(norm_crude_data)

77004

In [26]:
class Crude_Production(base_db):
    __tablename__ = "crude_production"
    __table_args__ = {'schema': 'countries'}
    id = Column(Integer, Sequence('user_id_seq'), primary_key=True)
    code = Column(String)
    country = Column(String)
    year = Column(Integer)
    month = Column(Integer)
    crude_prod = Column(Integer)
    
Crude_Production.__table__.create(bind=countries_db.engine, checkfirst=True)    

In [27]:
Session = sessionmaker(bind=countries_db.engine)
session = Session()

# Uploading crude oil production
for record in norm_crude_data:
    row = Crude_Production(**record)
    session.add(row)
    
session.commit()

In [2]:
creds.PGHOST

'localhost'