In [1]:
import sqlalchemy as sqlalchemy_package
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import relationship
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.declarative import DeferredReflection
from sqlalchemy import *
import pandas as pd
import seaborn as sns
import warnings
import config as creds
from openpyxl import load_workbook
import pickle

sns.set()
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 40)
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# Set up a connection to the postgres server
DATABASE_URL = f"postgres://{creds.PGUSER}:{creds.PGPASSWORD}@{creds.PGHOST}:5433/{creds.PGDATABASE}"
engine = sqlalchemy_package.create_engine(DATABASE_URL)

## Define Schema and Tables for Global Usage Table

In [3]:
Base = declarative_base()

# Creating the schema

class Consumption(Base):
    __tablename__ = "bp_overall_consumption"
    __table_args__ = {'schema': 'countries'}
    id = Column(Integer, Sequence('user_id_seq'), primary_key=True)
    code = Column(String)
    country = Column(String)
    year = Column(Integer)
    oil_consumed = Column(Float)
    natural_gas_consumed = Column(Float)
    coal_consumed = Column(Float)
    nuclear_consumed = Column(Float)
    hydroelectric_consumed = Column(Float)
    renewables_consumed = Column(Float)
    

In [4]:
# Set up a connection to the postgres server
class postgre_sql():
    def __init__(self, dbase = creds.PGDATABASE, dbschema = 'countries'):
        DATABASE_URL = f"postgres://{creds.PGUSER}:{creds.PGPASSWORD}@{creds.PGHOST}:5433/{dbase}"
        engine = sqlalchemy_package.create_engine(DATABASE_URL,
                connect_args={'options': '-csearch_path={}'.format(dbschema)})
        self.engine = engine
    def declare(self):
        # Declare a Base with explicit schema using `automap_base()`
        Base = declarative_base()
        return Base

In [5]:
# Creating objects
countries_db = postgre_sql()
base_db = countries_db.declare()

In [6]:
Consumption.__table__.create(bind=countries_db.engine, checkfirst=True)

## Clean, Transform and Normalize Global Usage Data from BP

In [7]:
# Open the excel file using openpyxl

filepath = (r"..\resources\bp-stats-review-2020-all-data.xlsx")
wb = load_workbook(filename = filepath, read_only=True)
ws = wb["Primary Energy - Cons by fuel"]
df = pd.DataFrame(ws.values)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Primary energy: Consumption by fuel*,,,,,,,,,,,,,,,,Contents
1,,,,,,,,2018,,,,,,,2019,,
2,Exajoules,Oil,Natural Gas,Coal,Nuclear energy,Hydro electric,Renew- ables,Total,Oil,Natural Gas,Coal,Nuclear energy,Hydro electric,Renew- ables,Total,,
3,,,,,,,,,,,,,,,,,
4,Canada,4.59,4.26,0.65,0.90,3.45,0.50,14.35,4.50,4.33,0.56,0.90,3.41,0.52,14.21,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,European Union,26.49,16.46,9.37,7.40,3.12,6.97,69.81,26.39,16.90,7.69,7.33,2.94,7.54,68.81,,
94,,,,,,,,,,,,,,,,,
95,"* In this review, primary energy com...",,,,,,,,,,,,,,,,
96,Energy from all sources of non-fossi...,,,,,,,,,,,,,,,,


### First get clean data for 2018

In [8]:
# isolate 2018 and remove unneeded rows from top and bottom
df_2018 = df.iloc[3:88,:7]

# Rename columns
df18_renamecolumns = df_2018.rename(columns={0: "country",
                                     1: "oil_consumed",
                                     2: "natural_gas_consumed",
                                     3: "coal_consumed",
                                     4: "nuclear_consumed",
                                     5: "hydroelectric_consumed",
                                     6: "renewables_consumed"})

# Remove the 'none' rows
df18_removenone = df18_renamecolumns.loc[~df18_renamecolumns["country"].str.contains("None", na = True)]

# Remove the 'total' rows
df18_clean = df18_removenone.loc[~df18_removenone["country"].str.contains("Total", na = True)]

# add the year to the clean data
df18_clean["year"] = 2018
df18_final = df18_clean.reset_index(drop = True)
df18_final

Unnamed: 0,country,oil_consumed,natural_gas_consumed,coal_consumed,nuclear_consumed,hydroelectric_consumed,renewables_consumed,year
0,Canada,4.59,4.26,0.65,0.90,3.45,0.50,2018
1,Mexico,3.48,3.15,0.57,0.12,0.29,0.22,2018
2,US,37.11,29.52,13.28,7.60,2.59,5.50,2018
3,Argentina,1.20,1.75,0.05,0.06,0.37,0.10,2018
4,Brazil,4.69,1.29,0.70,0.14,3.48,1.83,2018
...,...,...,...,...,...,...,...,...
67,Sri Lanka,0.23,0,0.06,0,0.06,0.01,2018
68,Taiwan,2.04,0.85,1.70,0.25,0.04,0.06,2018
69,Thailand,2.68,1.80,0.80,0,0.07,0.24,2018
70,Vietnam,1.02,0.35,1.59,0,0.76,0.00,2018


### Repeat for 2019

In [9]:
# isolate 2019 and remove unneeded rows from top and bottom
df_2019 = df.iloc[3:88,[0,8,9,10,11,12,13]]
                  
# Rename columns
df19_renamecolumns = df_2019.rename(columns={0: "country",
                                     8: "oil_consumed",
                                     9: "natural_gas_consumed",
                                     10: "coal_consumed",
                                     11: "nuclear_consumed",
                                     12: "hydroelectric_consumed",
                                     13: "renewables_consumed"})

# Remove the 'none' rows
df19_removenone = df19_renamecolumns.loc[~df19_renamecolumns["country"].str.contains("None", na = True)]

# Remove the 'total' rows
df19_clean = df19_removenone.loc[~df19_removenone["country"].str.contains("Total", na = True)]

# add the year to the clean data
df19_clean["year"] = 2019
df19_final = df19_clean.reset_index(drop = True)
df19_final

Unnamed: 0,country,oil_consumed,natural_gas_consumed,coal_consumed,nuclear_consumed,hydroelectric_consumed,renewables_consumed,year
0,Canada,4.50,4.33,0.56,0.90,3.41,0.52,2019
1,Mexico,3.29,3.26,0.51,0.10,0.21,0.35,2019
2,US,36.99,30.48,11.34,7.60,2.42,5.83,2019
3,Argentina,1.19,1.71,0.02,0.08,0.33,0.14,2019
4,Brazil,4.73,1.29,0.66,0.14,3.56,2.02,2019
...,...,...,...,...,...,...,...,...
67,Sri Lanka,0.25,0,0.06,0,0.04,0.01,2019
68,Taiwan,1.93,0.84,1.63,0.29,0.05,0.07,2019
69,Thailand,2.72,1.83,0.71,0,0.06,0.29,2019
70,Vietnam,1.07,0.35,2.07,0,0.58,0.04,2019


### Combine the datasets

In [10]:
df_bothyears = pd.merge(df18_final, df19_final, how="outer")
df_bothyears

Unnamed: 0,country,oil_consumed,natural_gas_consumed,coal_consumed,nuclear_consumed,hydroelectric_consumed,renewables_consumed,year
0,Canada,4.59,4.26,0.65,0.90,3.45,0.50,2018
1,Mexico,3.48,3.15,0.57,0.12,0.29,0.22,2018
2,US,37.11,29.52,13.28,7.60,2.59,5.50,2018
3,Argentina,1.20,1.75,0.05,0.06,0.37,0.10,2018
4,Brazil,4.69,1.29,0.70,0.14,3.48,1.83,2018
...,...,...,...,...,...,...,...,...
139,Sri Lanka,0.25,0.00,0.06,0.00,0.04,0.01,2019
140,Taiwan,1.93,0.84,1.63,0.29,0.05,0.07,2019
141,Thailand,2.72,1.83,0.71,0.00,0.06,0.29,2019
142,Vietnam,1.07,0.35,2.07,0.00,0.58,0.04,2019


## Attempt to normalize the country names

In [11]:
# Transform Countries Data from CSV
countries_df = pd.read_csv('../countries/data/en/countries.csv')
countries =[]
for result in countries_df.itertuples():
    row = {}
    row['country_code'] = result.alpha3.upper()
    row['country'] = result.name
    countries.append(row)
print('List of official names of 206 world countries:\n', [row['country'] for row in countries])

List of official names of 206 world countries:
 ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Congo, Democratic Republic of the', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Hond

In [12]:
not_in_list = []
for r in df19_final.itertuples():
    if (r[1] not in [row['country'] for row in countries]) and (r[1] not in not_in_list):
        not_in_list.append(r[1])
print('Number of inconsistent records: ', len(not_in_list))
print('\nList of records, which are not consistent with the country list:\n', not_in_list)

Number of inconsistent records:  16

List of records, which are not consistent with the country list:
 ['US', 'Trinidad & Tobago', 'Venezuela', 'Other S. & Cent. America', 'Czech Republic', 'United Kingdom', 'Other Europe', 'Other CIS', 'Iran', 'Other Middle East', 'Other Africa', 'China Hong Kong SAR', 'South Korea', 'Taiwan', 'Vietnam', 'Other Asia Pacific']


In [13]:
# Import list of countries with codes
df_countrycode = pd.read_csv("../outputs/data_countries.csv")
countries_clean =[]
for result in df_countrycode.itertuples():
    row = {}
    row['country_code'] = result.country_code
    row['country'] = result.country
    countries_clean.append(row)
print('List of country names for code lookup:\n', [row['country'] for row in countries])

List of country names for code lookup:
 ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Congo, Democratic Republic of the', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', '

In [14]:
not_in_list2 = []
for r in df19_final.itertuples():
    if (r[1] not in [row['country'] for row in countries_clean]) and (r[1] not in not_in_list):
        not_in_list2.append(r[1])
print('Number of inconsistent records: ', len(not_in_list))
print('\nList of records, which are not consistent with the country list:\n', not_in_list)

Number of inconsistent records:  16

List of records, which are not consistent with the country list:
 ['US', 'Trinidad & Tobago', 'Venezuela', 'Other S. & Cent. America', 'Czech Republic', 'United Kingdom', 'Other Europe', 'Other CIS', 'Iran', 'Other Middle East', 'Other Africa', 'China Hong Kong SAR', 'South Korea', 'Taiwan', 'Vietnam', 'Other Asia Pacific']


In [15]:
remove_list = []
rename_dict = {'Aruba': 'Netherlands', 'Bermuda': 'United Kingdom', 'Cote d?Ivoire': 'Ivory Coast', 'Congo-Kinshasa': 'Democratic Republic of the Congo',
               'Congo-Brazzaville': 'Republic of Congo', 'Cook Islands': 'New Zealand', 'Cabo Verde':'Guinea Bissau', 'Gibraltar':'United Kingdom',
               'Guadeloupe':'France', 'Guinea-Bissau':'Guinea Bissau', 'French Guiana':'France', 'Hawaiian Trade Zone': 'United States of America',
               'Hong Kong':'Hong Kong S.A.R.', 'Macau':'China', 'Maldives':'United Kingdom', 'North Macedonia':'Macedonia', 'Burma':'Myanmar',
               'Montserrat':'United Kingdom', 'Martinique':'France', 'Netherlands Antilles':'Netherlands', 'Nauru':'Australia', 
               'Palestinian Territories':'Palestine','Reunion':'France', 'Serbia':'Republic of Serbia', 'Eswatini':'Swaziland',
               'Seychelles':'United Kingdom', 'Turks and Caicos Islands':'United Kingdom', 'Timor-Leste':'United Kingdom',
               'Tanzania':'United Republic of Tanzania', 'U.S. Virgin Islands':'United States Virgin Islands', 'United States':'United States of America', 'US': 'United States of America',
               'U.S. Pacific Islands':'United States of America', 'Saint Vincent/Grenadines':'Saint Vincent and the Grenadines', 'British Virgin Islands':'United Kingdom',
               'Wake Island':'United States of America', 'Trinidad & Tobago': 'Trinidad and Tobago', 'Russian Federation': 'Russia', 'China Hong Kong SAR': 'Hong Kong S.A.R.'}
norm_countries = pickle.load(open('../outputs/norm_countries.pkl', 'rb'))

In [16]:
# new_df = pd.DataFrame(columns = df_bothyears.columns)
# for row in df_bothyears.itertuples(index=False):
#     country = row[0]
#     if country in rename_dict.keys():
#         country = rename_dict[row[0]]
#         newrow = pd.DataFrame([(country, row[1], row[2], row[3], row[4], row[5], row[6], row[7])], columns = df_bothyears.columns)
#     else:
#         newrow = pd.DataFrame([row], columns = df_bothyears.columns)
#     print(type(newrow))
#     new_df = pd.merge(new_df, newrow, how="outer")
# new_df

In [17]:
# Cleaning records table using dataframes instead of lists
def clean_dataframe(dataframe, rename_dict=rename_dict):
    new_df = pd.DataFrame(columns = dataframe.columns)
    for row in dataframe.itertuples(index=False):
        country = row[0]
        if country in rename_dict.keys():
            country = rename_dict[row[0]]
            newrow = pd.DataFrame([(country, row[1], row[2], row[3], row[4], row[5], row[6], row[7])], columns = dataframe.columns)
        else:
            newrow = pd.DataFrame([row], columns = dataframe.columns)
        new_df = pd.merge(new_df, newrow, how="outer")
    return new_df

# Normalize Data Country_Code, using dataframes instead of lists
def norm_dataframe(data_list):
    new_df = pd.DataFrame(columns = dataframe.columns)
    for record in dataframe.itertuples(index=False):
        country = row[0]
        for row in norm_countries:
            if record['code'] == row['country']:
                record['code'] = row['country_code']
    return data_list

In [18]:
df_bothyearsclean = clean_dataframe(df_bothyears)
df_bothyearsclean

Unnamed: 0,country,oil_consumed,natural_gas_consumed,coal_consumed,nuclear_consumed,hydroelectric_consumed,renewables_consumed,year
0,Canada,4.59,4.26,0.65,0.90,3.45,0.50,2018
1,Mexico,3.48,3.15,0.57,0.12,0.29,0.22,2018
2,United States of America,37.11,29.52,13.28,7.60,2.59,5.50,2018
3,Argentina,1.20,1.75,0.05,0.06,0.37,0.10,2018
4,Brazil,4.69,1.29,0.70,0.14,3.48,1.83,2018
...,...,...,...,...,...,...,...,...
139,Sri Lanka,0.25,0.00,0.06,0.00,0.04,0.01,2019
140,Taiwan,1.93,0.84,1.63,0.29,0.05,0.07,2019
141,Thailand,2.72,1.83,0.71,0.00,0.06,0.29,2019
142,Vietnam,1.07,0.35,2.07,0.00,0.58,0.04,2019


## Add the country code to our dataframe

In [19]:
df_final = pd.merge(df_bothyearsclean, df_countrycode[['country','country_code']], how='left', on = "country", copy = False )
df_final

Unnamed: 0,country,oil_consumed,natural_gas_consumed,coal_consumed,nuclear_consumed,hydroelectric_consumed,renewables_consumed,year,country_code
0,Canada,4.59,4.26,0.65,0.90,3.45,0.50,2018,CAN
1,Mexico,3.48,3.15,0.57,0.12,0.29,0.22,2018,MEX
2,United States of America,37.11,29.52,13.28,7.60,2.59,5.50,2018,USA
3,Argentina,1.20,1.75,0.05,0.06,0.37,0.10,2018,ARG
4,Brazil,4.69,1.29,0.70,0.14,3.48,1.83,2018,BRA
...,...,...,...,...,...,...,...,...,...
139,Sri Lanka,0.25,0.00,0.06,0.00,0.04,0.01,2019,LKA
140,Taiwan,1.93,0.84,1.63,0.29,0.05,0.07,2019,TWN
141,Thailand,2.72,1.83,0.71,0.00,0.06,0.29,2019,THA
142,Vietnam,1.07,0.35,2.07,0.00,0.58,0.04,2019,VNM


## Write the dataframe to our Postgres Database

In [20]:
engine = countries_db.engine
df_final.to_sql('bp_overall_consumption',engine, if_exists='replace')