In [16]:
import os
import sys
import pandas as pd
import logging
from datetime import datetime, date
from google.cloud import bigquery
from hashlib import md5

# receipts data file name
filename = './data/Emissions_by_Country_2002-2022.csv'

# **** SETUP LOGGING ****
# setup logging and logger
logging.basicConfig(            # setting up the root logger
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)
logger: logging.Logger = logging.getLogger('root')      # alias the root logger as `logger`
logger.setLevel(logging.DEBUG)                          # programmatically reassign the logging level


# **** BIGQUERY CLIENT ****
logger.debug(f"Creating bigquery client")
client = bigquery.Client()

logger.info(f"Setup Completed")

[DEBUG][2023-01-17 15:02:53,501][724504573:0023] : Creating bigquery client
[INFO ][2023-01-17 15:02:53,791][724504573:0026] : Setup Completed


## Loading Emission_by_Country CSV File

In [45]:
# *** always perform checks first ***
# check if the file exists
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# check if the file contains any data. Header row alone is 78 bytes. size > 78
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# load into dataframe
df = pd.read_csv(
    filename, 
    header=0,
    parse_dates=['Year'], 
    infer_datetime_format=True, 
    on_bad_lines='warn',
    )
logger.info(f"loaded {len(df.index)} rows from {filename}")

# *** always perform check at the end ***
# check schema: contains all expected columns?
expected_columns = ['Country', 'ISO 3166-1 alpha-3', 'Year', 'Total', 'Coal', 'Oil', 'Gas', 'Cement', 'Flaring', 'Other', 'Per Capita']
for col in expected_columns:
    assert col in list(df.columns), f"Data file missing required column: {col}"

#rename column
df = df.rename(columns={'ISO 3166-1 alpha-3': 'country_code'})

#convert columns name to all lower case
df= df.rename(columns=str.lower)

# log data column data types
logger.debug(df.dtypes)

[INFO ][2023-01-17 16:03:53,175][3759513774:0015] : loaded 63104 rows from ./data/Emissions_by_Country_2002-2022.csv
[DEBUG][2023-01-17 16:03:53,184][3759513774:0031] : country                 object
country_code            object
year            datetime64[ns]
total                  float64
coal                   float64
oil                    float64
gas                    float64
cement                 float64
flaring                float64
other                  float64
per capita             float64
dtype: object


In [47]:
#drop rows if certain columns are Nan
df = df.dropna(subset=['country', 'country_code', 'year', 'total', 'coal', 'oil', 'gas', 'cement', 'flaring'])

#add created_at and modified_at columns
df['created_at'] = pd.Timestamp('today').strftime("%Y-%m-%d")
df['modified_at'] = 'None'

display(df.head(20))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,country,country_code,year,total,coal,oil,gas,cement,flaring,other,per capita,created_at,modified_at
199,Afghanistan,AFG,1949-01-01,0.014656,0.014656,0.0,0.0,0.0,0.0,,,2023-01-17,
200,Afghanistan,AFG,1950-01-01,0.084272,0.021068,0.063204,0.0,0.0,0.0,,0.011266,2023-01-17,
201,Afghanistan,AFG,1951-01-01,0.0916,0.025648,0.065952,0.0,0.0,0.0,,0.012098,2023-01-17,
202,Afghanistan,AFG,1952-01-01,0.0916,0.031708,0.059892,0.0,0.0,0.0,,0.011946,2023-01-17,
203,Afghanistan,AFG,1953-01-01,0.106256,0.037949,0.068307,0.0,0.0,0.0,,0.013685,2023-01-17,
204,Afghanistan,AFG,1954-01-01,0.106256,0.042502,0.063754,0.0,0.0,0.0,,0.013511,2023-01-17,
205,Afghanistan,AFG,1955-01-01,0.153888,0.062288,0.0916,0.0,0.0,0.0,,0.019304,2023-01-17,
206,Afghanistan,AFG,1956-01-01,0.1832,0.062288,0.120912,0.0,0.0,0.0,,0.022652,2023-01-17,
207,Afghanistan,AFG,1957-01-01,0.29312,0.076944,0.216176,0.0,0.0,0.0,,0.035702,2023-01-17,
208,Afghanistan,AFG,1958-01-01,0.32976,0.0916,0.23816,0.0,0.0,0.0,,0.039569,2023-01-17,


In [48]:
from hashlib import md5

cols = ['country_code', 'year']
# first define a generic function that returns the md4 hash for
# any combination of values
def get_hash(*cols) -> str:
    """return the md5 hash of all parameters"""
    value = '-'.join([str(x) for x in cols])
    return md5(value.encode(encoding='utf-16')).hexdigest()

logger.info(f"assigning country_code_year: using md5 hash of country_code and year")

# customer_id = md5 hash of customer first and last name
df['country_code_year'] = df.apply(lambda row: get_hash(row.country_code, row.year), axis=1)
# set index by customer_id
df = df.set_index(keys='country_code_year')

logger.info(f"country_code_year generated")
display(df)

[INFO ][2023-01-17 16:05:03,070][1071718710:0011] : assigning country_code_year: using md5 hash of country_code and year
[INFO ][2023-01-17 16:05:04,495][1071718710:0018] : country_code_year generated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,country,country_code,year,total,coal,oil,gas,cement,flaring,other,per capita,created_at,modified_at
country_code_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
c1ed0abf9cc4dcfecfab890d4f7fa700,Afghanistan,AFG,1949-01-01,0.014656,0.014656,0.000000,0.000000,0.000000,0.000000,,,2023-01-17,
2b3739259d014dffcbc449f9dd831531,Afghanistan,AFG,1950-01-01,0.084272,0.021068,0.063204,0.000000,0.000000,0.000000,,0.011266,2023-01-17,
2332b6677d052376305241dbfec01c35,Afghanistan,AFG,1951-01-01,0.091600,0.025648,0.065952,0.000000,0.000000,0.000000,,0.012098,2023-01-17,
9d38c408cc5871d07b35000df65b2d64,Afghanistan,AFG,1952-01-01,0.091600,0.031708,0.059892,0.000000,0.000000,0.000000,,0.011946,2023-01-17,
4b6b87ee7a7e3153c77923d6d1997814,Afghanistan,AFG,1953-01-01,0.106256,0.037949,0.068307,0.000000,0.000000,0.000000,,0.013685,2023-01-17,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59495ce6483e0a2b64ce8cf5bcad6f36,Global,WLD,2017-01-01,36096.739276,14506.973805,12242.627935,7144.928128,1507.923185,391.992176,302.294047,4.749682,2023-01-17,
bd367da414c7a24180f8bd4e8ba4e7ec,Global,WLD,2018-01-01,36826.506600,14746.830688,12266.016285,7529.846784,1569.218392,412.115746,302.478706,4.792753,2023-01-17,
1d4f24b12858fbb44ed6c372a36273a2,Global,WLD,2019-01-01,37082.558969,14725.978025,12345.653374,7647.528220,1617.506786,439.253991,306.638573,4.775633,2023-01-17,
1f47d6c452c897f05dc587313b67489d,Global,WLD,2020-01-01,35264.085734,14174.564010,11191.808551,7556.290283,1637.537532,407.583673,296.301685,4.497423,2023-01-17,


In [None]:
#Look up country_code from dim_country table