In [1]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning
import numpy as np
import pandas as pd

In [2]:
# One definition of the raw-database credentials file, so the connector and the
# extractor below are guaranteed to be configured from the same path.
RAW_DB_CREDS = 'db_creds_raw.yaml'

# NOTE(review): db_raw is not referenced by any later cell visible in this
# notebook (the table listing goes through raw_data.db_connector). It is kept
# so that anything else relying on the name keeps working — confirm and drop.
db_raw = DatabaseConnector(RAW_DB_CREDS)

# Extractor used by the following cells to read the source tables and files.
raw_data = DataExtractor(RAW_DB_CREDS)

In [3]:
# List the tables available through the extractor's own connector and echo
# them, so the table names used by the extraction cell can be checked by eye.
tables_list = (
    raw_data
    .db_connector
    .list_db_tables()
)
print(f'Tables in raw data: {tables_list}')

Tables in raw data: ['dim_card_details', 'legacy_store_details', 'legacy_users', 'orders_table']


In [4]:
# Pull every raw dataset the pipeline needs. Statement order is kept exactly as
# originally written because the helpers perform I/O whose ordering
# requirements are not visible from this notebook.

# Tables read via read_rds_table (names match the table listing printed above).
user_data = raw_data.read_rds_table('legacy_users')
card_data = raw_data.read_rds_table('dim_card_details')
store_data = raw_data.read_rds_table('legacy_store_details')

# Product catalogue, fetched from the public S3 bucket.
# NOTE(review): new_file_name presumably names a local copy — confirm in
# data_extraction.
product_data = raw_data.extract_from_s3(
    url_path='s3://data-handling-public/products.csv',
    new_file_name='product_data',
)

orders_table = raw_data.read_rds_table('orders_table')

# Date/time details, also fetched from S3 (JSON rather than CSV).
date_details = raw_data.extract_from_s3(
    url_path='s3://data-handling-public/date_details.json',
    new_file_name='date_details',
)


In [5]:
# Run each raw dataset through its dedicated DataCleaning method and bind the
# result to a separate *_cleaned name, which the upload cell consumes later.
# NOTE(review): whether the clean_* methods also mutate their input in place is
# not visible from this notebook — confirm in data_cleaning before re-running
# this cell against already-cleaned inputs.
cleaner = DataCleaning()
user_data_cleaned = cleaner.clean_user_data(user_data)
card_data_cleaned = cleaner.clean_card_data(card_data)
store_data_cleaned = cleaner.clean_store_data(store_data)
product_data_cleaned = cleaner.clean_products_data(product_data)
orders_table_cleaned = cleaner.clean_orders_data(orders_table)
date_details_cleaned = cleaner.clean_dates_details(date_details)

In [8]:
# Bare expression on the cell's last line: the notebook renders the cleaned
# date-details table as this cell's output so it can be inspected.
date_details_cleaned

Unnamed: 0,timestamp,month,year,day,time_period,date_uuid
0,22:00:06,9,2012,19,Evening,3b7ca996-37f9-433f-b6d0-ce8391b615ad
1,22:44:06,2,1997,10,Evening,adc86836-6c35-49ca-bb0d-65b6507a00fa
2,10:05:37,4,1994,15,Morning,5ff791bf-d8e0-4f86-8ceb-c7b60bef9b31
3,17:29:27,11,2001,6,Midday,1b01fcef-5ab9-404c-b0d4-1e75a0bd19d8
4,22:40:33,12,2015,31,Evening,dfa907c1-f6c5-40f0-aa0d-40ed77ac5a44
...,...,...,...,...,...,...
120156,22:56:56,11,2022,12,Evening,d6c4fb31-720d-4e94-aa6b-dcbcb85f2bb7
120157,18:25:20,5,1997,31,Evening,f7722027-1aae-49c3-8f8d-853e93f9f3e6
120158,18:21:40,9,2011,13,Evening,4a3b9851-52e1-463c-ac81-1960f141444e
120159,19:10:53,7,2013,12,Evening,64974909-0d4b-42a2-822a-73b5695e8bfb


In [9]:
# Connector for the destination database that receives the cleaned tables.
db_cleaned = DatabaseConnector('db_creds_cleaned.yaml')

# Destination table name -> cleaned dataset. Dicts preserve insertion order, so
# the uploads run in exactly the order listed here.
cleaned_tables = {
    'dim_users': user_data_cleaned,
    'dim_card_details': card_data_cleaned,
    'dim_store_details': store_data_cleaned,
    'dim_products': product_data_cleaned,
    'orders_table': orders_table_cleaned,
    'dim_date_times': date_details_cleaned,
}

# One upload call per entry instead of six hand-copied lines, so a new table is
# added in one place and the call arguments cannot drift between tables.
# NOTE(review): if_exists='replace' presumably overwrites an existing table of
# the same name — confirm against upload_to_db in database_utils.
for target_table, cleaned_frame in cleaned_tables.items():
    db_cleaned.upload_to_db(cleaned_frame, table_name=target_table, if_exists='replace')

# Bare last expression: shows the tables now present in the destination
# database as the cell output.
db_cleaned.list_db_tables()

['orders_table',
 'dim_date_times',
 'dim_users',
 'dim_card_details',
 'dim_store_details',
 'dim_products']