In [54]:
import os
import sys
import pandas as pd
import logging
from datetime import datetime, date
from google.cloud import bigquery
from hashlib import md5

# Path of the emissions CSV file, relative to the notebook's working directory.
# The load cell asserts that this file exists and is larger than its header row.
filename = './data/Emissions_by_Country_2002-2022.csv'
# NOTE(review): presumably the BigQuery project and dataset ids of the load
# target — neither constant is referenced in the visible cells; confirm usage.
PROJECT_NAME = "emissions-team-project"
DATASET_NAME = "emissions"

# **** TABLE SCHEMAS ****

# Table name and BigQuery schema for each destination table, keyed by table.
TABLE_METADATA = {
    'fct_emissions': {
        'table_name': 'fct_emissions',
        'schema': [
            # the dataframe index is only written when it is named in the schema
            bigquery.SchemaField('country_code_year', 'string', mode='REQUIRED', max_length=128),
            bigquery.SchemaField('country_code', 'string', mode='REQUIRED'),
            bigquery.SchemaField('year', 'date', mode='REQUIRED'),
            # emission figures, one float column per source
            bigquery.SchemaField('total_em', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('coal_em', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('oil_em', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('gas_em', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('cement_em', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('flaring_em', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('other_em', 'float64', mode='REQUIRED'),
            # audit columns: the notebook fills created_at with the load date and
            # leaves modified_at empty on first load, so they are dates (not
            # floats) and modified_at must accept NULL
            bigquery.SchemaField('created_at', 'date', mode='REQUIRED'),
            bigquery.SchemaField('modified_at', 'date', mode='NULLABLE'),
        ],
    }
}

# **** SETUP LOGGING ****
# Configure the root logger: messages go to stdout so they show up in the
# cell output, prefixed with level, timestamp, module and line number.
logging.basicConfig(
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)
# Alias the root logger as `logger`. getLogger() without a name always returns
# the root logger; getLogger('root') only does so on newer Python versions and
# creates a separate child logger called "root" on older ones.
logger: logging.Logger = logging.getLogger()
# programmatically reassign the logging level (overrides the INFO set above)
logger.setLevel(logging.DEBUG)


# **** BIGQUERY CLIENT ****
# The client is created with no arguments, so project and credentials are
# whatever the BigQuery library resolves by default.
# NOTE(review): PROJECT_NAME is defined above but not passed here — confirm
# the default project is the intended one.
logger.debug("Creating bigquery client")
client = bigquery.Client()

logger.info("Setup Completed")

[DEBUG][2023-01-18 09:09:09,051][3564589987:0049] : Creating bigquery client
[INFO ][2023-01-18 09:09:09,073][3564589987:0052] : Setup Completed


## Loading the Emissions_by_Country CSV File

In [69]:
# *** always perform checks first ***
# check if the file exists
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# check if the file contains any data. Header row alone is 78 bytes. size > 78
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# load into dataframe
# 'Year' is read as text and converted to a datetime only after the schema
# check below: asking read_csv to parse it would raise a low-level error for a
# missing 'Year' column before the friendlier assertion could report it.
df = pd.read_csv(
    filename,
    header=0,
    dtype={'Year': str},
    on_bad_lines='warn',
)
logger.info(f"loaded {len(df.index)} rows from {filename}")

# *** always perform check at the end ***
# check schema: contains all expected columns? (all missing columns are reported at once)
expected_columns = ['Country', 'ISO 3166-1 alpha-3', 'Year', 'Total', 'Coal', 'Oil', 'Gas', 'Cement', 'Flaring', 'Other', 'Per Capita']
missing_columns = [col for col in expected_columns if col not in df.columns]
assert not missing_columns, f"Data file missing required column: {', '.join(missing_columns)}"

# convert the four-digit year into a datetime (January 1st of that year)
df['Year'] = pd.to_datetime(df['Year'], format='%Y')

# convert column names to lower case, then apply the warehouse column names
df = (
    df
    .rename(columns=str.lower)
    .rename(columns={
        'iso 3166-1 alpha-3': 'country_code',
        'total': 'total_em',
        'coal': 'coal_em',
        'oil': 'oil_em',
        'gas': 'gas_em',
        'cement': 'cement_em',
        'flaring': 'flaring_em',
        'other': 'other_em',
    })
)

# log data column data types
logger.debug(df.dtypes)
display(df.head(5))

[INFO ][2023-01-18 09:22:47,537][2264186883:0015] : loaded 63104 rows from ./data/Emissions_by_Country_2002-2022.csv
[DEBUG][2023-01-18 09:22:47,544][2264186883:0028] : country                 object
country_code            object
year            datetime64[ns]
total_em               float64
coal_em                float64
oil_em                 float64
gas_em                 float64
cement_em              float64
flaring_em             float64
other_em               float64
per capita             float64
dtype: object


Unnamed: 0,country,country_code,year,total_em,coal_em,oil_em,gas_em,cement_em,flaring_em,other_em,per capita
0,Afghanistan,AFG,1750-01-01,0.0,,,,,,,
1,Afghanistan,AFG,1751-01-01,0.0,,,,,,,
2,Afghanistan,AFG,1752-01-01,0.0,,,,,,,
3,Afghanistan,AFG,1753-01-01,0.0,,,,,,,
4,Afghanistan,AFG,1754-01-01,0.0,,,,,,,


In [70]:
# drop rows where any of the required columns is NaN
required_columns = ['country', 'country_code', 'year', 'total_em', 'coal_em', 'oil_em', 'gas_em', 'cement_em', 'flaring_em', 'other_em']

# add created_at and modified_at columns
# - created_at holds today's date as a real datetime (midnight), the same
#   representation the `year` column uses, instead of a formatted string
# - modified_at is a genuine missing value (NaT) rather than the text 'None',
#   so it is stored as an empty value and not as a four-letter string
# .assign() builds a new frame, which avoids pandas' SettingWithCopyWarning
# when adding columns to the result of dropna().
df = df.dropna(subset=required_columns).assign(
    created_at=pd.Timestamp('today').normalize(),
    modified_at=pd.NaT,
)

display(df.head(20))

Unnamed: 0,country,country_code,year,total_em,coal_em,oil_em,gas_em,cement_em,flaring_em,other_em,per capita,created_at,modified_at
3232,Australia,AUS,1990-01-01,278.154156,141.879819,88.84209,34.454816,3.462872,7.272496,2.242063,16.315938,2023-01-18,
3233,Australia,AUS,1991-01-01,279.52851,146.08284,88.245572,32.786243,3.183033,7.001201,2.229622,16.184767,2023-01-18,
3234,Australia,AUS,1992-01-01,284.525345,150.051381,87.916828,33.970472,2.923411,7.303701,2.359551,16.293502,2023-01-18,
3235,Australia,AUS,1993-01-01,288.870537,150.098575,90.386578,35.670002,3.004698,7.136743,2.573941,16.383765,2023-01-18,
3236,Australia,AUS,1994-01-01,293.696553,151.376241,91.924087,37.032005,3.484276,6.880148,2.999795,16.494706,2023-01-18,
3237,Australia,AUS,1995-01-01,305.002996,155.718942,96.126544,39.830983,3.35775,7.060714,2.908064,16.941788,2023-01-18,
3238,Australia,AUS,1996-01-01,311.886129,160.375191,98.697016,39.765246,3.243616,6.761052,3.044008,17.125455,2023-01-18,
3239,Australia,AUS,1997-01-01,320.28263,167.670194,98.388038,41.074314,3.184589,6.687345,3.27815,17.396973,2023-01-18,
3240,Australia,AUS,1998-01-01,334.075978,177.596365,99.146315,43.488447,3.475413,6.901959,3.46748,17.959464,2023-01-18,
3241,Australia,AUS,1999-01-01,343.488633,183.215379,100.399487,45.548715,3.519991,7.165139,3.639921,18.269805,2023-01-18,


In [71]:
from hashlib import md5

# Names of the two columns that are hashed into the row key in this cell.
# NOTE(review): this list is not read anywhere in the visible code (get_hash's
# own `*cols` parameter shadows it inside the function) — confirm whether a
# later cell uses it before removing.
cols = ['country_code', 'year']
# generic helper: returns the md5 hash for any combination of values
def get_hash(*cols) -> str:
    """Return the md5 hex digest of all positional arguments.

    Every argument is converted with ``str``, the pieces are joined with
    ``'-'`` and the joined text is encoded as UTF-16 before hashing.

    NOTE(review): the 'utf-16' codec writes a byte-order mark in the host's
    native byte order, so digests may differ between little- and big-endian
    machines. Left unchanged so that already generated keys stay stable.
    """
    joined = '-'.join(map(str, cols))
    digest = md5(joined.encode(encoding='utf-16'))
    return digest.hexdigest()

logger.info("assigning country_code_year: using md5 hash of country_code and year")

# country_code_year = md5 hash of country_code and year
# The hashes are built by walking the two columns in parallel rather than with
# a row-wise DataFrame.apply: apply(axis=1) hands back a DataFrame instead of a
# Series when the frame has no rows, which breaks the column assignment, and it
# is also much slower on large frames. Each value reaches get_hash exactly as
# before (the year as a pandas Timestamp).
country_code_year = [
    get_hash(country_code, year)
    for country_code, year in zip(df['country_code'], df['year'])
]
# add the key column and use it as the index
df = df.assign(country_code_year=country_code_year).set_index(keys='country_code_year')

logger.info("country_code_year generated")
display(df)

[INFO ][2023-01-18 09:23:10,162][1071718710:0011] : assigning country_code_year: using md5 hash of country_code and year
[INFO ][2023-01-18 09:23:10,380][1071718710:0018] : country_code_year generated


Unnamed: 0_level_0,country,country_code,year,total_em,coal_em,oil_em,gas_em,cement_em,flaring_em,other_em,per capita,created_at,modified_at
country_code_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
afccb5304299f4edf1b36d8899527798,Australia,AUS,1990-01-01,278.154156,141.879819,88.842090,34.454816,3.462872,7.272496,2.242063,16.315938,2023-01-18,
af9ebf4b67c7e65474635758c1f86ba1,Australia,AUS,1991-01-01,279.528510,146.082840,88.245572,32.786243,3.183033,7.001201,2.229622,16.184767,2023-01-18,
b9e70bbedaf32b26d2ebe45a8e9325c1,Australia,AUS,1992-01-01,284.525345,150.051381,87.916828,33.970472,2.923411,7.303701,2.359551,16.293502,2023-01-18,
351c885a5a3a0e27808482fea7066983,Australia,AUS,1993-01-01,288.870537,150.098575,90.386578,35.670002,3.004698,7.136743,2.573941,16.383765,2023-01-18,
5518aad5448b09fce5e9b5ad0937a11b,Australia,AUS,1994-01-01,293.696553,151.376241,91.924087,37.032005,3.484276,6.880148,2.999795,16.494706,2023-01-18,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59495ce6483e0a2b64ce8cf5bcad6f36,Global,WLD,2017-01-01,36096.739276,14506.973805,12242.627935,7144.928128,1507.923185,391.992176,302.294047,4.749682,2023-01-18,
bd367da414c7a24180f8bd4e8ba4e7ec,Global,WLD,2018-01-01,36826.506600,14746.830688,12266.016285,7529.846784,1569.218392,412.115746,302.478706,4.792753,2023-01-18,
1d4f24b12858fbb44ed6c372a36273a2,Global,WLD,2019-01-01,37082.558969,14725.978025,12345.653374,7647.528220,1617.506786,439.253991,306.638573,4.775633,2023-01-18,
1f47d6c452c897f05dc587313b67489d,Global,WLD,2020-01-01,35264.085734,14174.564010,11191.808551,7556.290283,1637.537532,407.583673,296.301685,4.497423,2023-01-18,
