In [1]:
'''
Multidimensional Retail Data Centralization
-------
This script extracts each data set, cleans it and uploads it to its designated table in the sales_data database.
The extraction, cleaning and upload methods are predefined in the imported modules, so the cells below only assign the variables that drive those steps for each data set.

Contents
-------
    * Imports 
    * Data Extraction Details - assigning urls/links, endpoints and api-keys necessary to run the code
    * Classes Usage - one instance of each class is created and assigned to a variable

    These cell blocks perform the following: Data Extraction, Transformation and Loading to sales_data
    * Legacy Users
    * Card Data
    * Business Store Data
    * Products Data
    * Orders Table
    * Sales Date Times

'''

'\nMultidimensional Retail Data Centralization\n-------\nThis script extracts data, cleans and uploads it all to designated tables in sales_data database.\nMethods have been predefined so variables are set to organize the extraction, cleaning and uploading process for each set of data.\n\nContents\n-------\n    * Imports \n    * Data Extraction Details - assigning urls/links, endpoints and api-keys necessary to run the code\n    * Classes Usage - instance created and initialized and variables assigned to each class\n\n    These cell blocks perform the following: Data Extraction, Transformation and Loading to sales_data\n    * Legacy Users\n    * Card Data\n    * Business Store Data\n    * Products Data\n    * Orders Table\n    * Sales Date Times\n\n'

In [2]:
#Imports
import os

from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from data_processing import DataProcessor
from database_utils import DatabaseConnector

In [3]:
#Data Extraction Details
# Every location and credential used by the cells below is assigned here, so
# this is the only cell that has to change when the notebook runs elsewhere.

# Path of the YAML credentials file handed to DatabaseConnector.
# Set MRDC_DB_CREDS_PATH to point at your own copy; the fallback is the
# original author's location so existing setups keep working unchanged.
creds_path = os.environ.get('MRDC_DB_CREDS_PATH', '/Users/itsanya/AiCore/MRDC/db_creds.yaml')

# PDF passed to data_extractor._retrieve_pdf_data in the Card Data cell.
pdf_link = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'

# API key for the store endpoints. It is read from the environment instead of
# being written into the notebook, because a saved notebook exposes every
# literal it contains. Only the Business Store Data cell uses the key, so a
# missing value is reported here but does not stop the other sections.
b_store_api_key = os.environ.get('MRDC_STORE_API_KEY')
if not b_store_api_key:
    print("MRDC_STORE_API_KEY is not set: export it before running the Business Store Data cell.")

# Endpoint queried for the store count, and the per-store URL template.
# The {store_number} placeholder is left unformatted here; the template is
# passed as-is to data_extractor.retrieve_stores_data.
number_of_stores_endpoint = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
store_endpoint = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/{store_number}'

# S3 address of the products CSV (Products Data cell).
s3_address = 's3://data-handling-public/products.csv'

# HTTPS URL of the date details JSON (Sales Date Times cell).
s3_url = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json'

In [4]:
#Classes Usage
# One instance of each helper class is created here and reused by the cells below.
db_connector_instance = DatabaseConnector(creds_path)  # built from the credentials file path assigned in the details cell
data_extractor = DataExtractor(db_connector_instance)  # the extractor is handed the connector instance
data_cleaner = DataCleaning()
data_processor = DataProcessor()  # NOTE(review): not referenced by any other cell visible in this notebook

In [5]:
#Legacy Users
# Extract -> clean -> upload for the user data; each step feeds the next.
#Extract
# table_name is a notebook-level variable; the Orders Table cell rebinds it later.
table_name = 'legacy_users'
users_df = data_extractor.read_rds_table(table_name)

#Clean
# NOTE(review): the saved output of this cell shows a pandas SettingWithCopyWarning
# raised on a line that strips '\n' from an address column. That line is not in
# this notebook - presumably it lives in the cleaning code; confirm and fix there.
fully_cleaned_users_df = data_cleaner.clean_users_df(users_df)

#Upload to dim_users
# NOTE(review): _upload_to_db has a leading underscore yet is called from outside
# its class - confirm whether it is meant to be part of the public interface.
db_connector_instance._upload_to_db(df=fully_cleaned_users_df, table_name='dim_users')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].apply(lambda address: str(address).replace('\n', ' ') if pd.notna(address) else address)


Data uploaded to 'dim_users' table successfully.


In [6]:
#Card Data
# Extract -> clean -> upload for the card details; each step feeds the next.
# NOTE(review): the saved run of this cell ended with
# "DateParseError: day is out of range for month: 02-29, at position 0".
# The saved output does not show which call raised it - presumably date parsing
# inside clean_card_data; confirm. The cells after this one have no execution
# count, i.e. they were not run in the saved session.
#Extract
# pdf_link is the card details PDF URL assigned in the details cell.
# NOTE(review): the saved output also reports that jpype could not be imported and
# that a subprocess fallback was used; the message comes from code not visible here.
card_df = data_extractor._retrieve_pdf_data(pdf_link)

#Clean
fully_cleaned_card_df = data_cleaner.clean_card_data(card_df)

#Upload to dim_card_details
db_connector_instance._upload_to_db(df=fully_cleaned_card_df, table_name='dim_card_details')

Error importing jpype dependencies. Fallback to subprocess.
No module named 'jpype'


DateParseError: day is out of range for month: 02-29, at position 0

In [None]:
#Business Store Data
# Extract -> clean -> upload for the store details; each step feeds the next.
#Extract
# The key is registered on the extractor before either endpoint is called.
data_extractor.set_api_key(b_store_api_key)

# The store count returned here is passed on to retrieve_stores_data.
number_of_stores = data_extractor.list_number_of_stores(number_of_stores_endpoint)

# store_endpoint still contains its {store_number} placeholder at this point;
# presumably retrieve_stores_data fills it in once per store - verify in DataExtractor.
b_store_df = data_extractor.retrieve_stores_data(store_endpoint, number_of_stores)

#Clean
fully_cleaned_b_store_df = data_cleaner.clean_store_data(b_store_df)

#Upload to dim_store_details
db_connector_instance._upload_to_db(df=fully_cleaned_b_store_df, table_name='dim_store_details')

In [None]:
#Products Data
# Extract -> clean -> upload for the products; each step feeds the next.
#Extract
# s3_address is the s3:// location of products.csv assigned in the details cell.
prods_df = data_extractor.extract_from_s3(s3_address)

#Clean
# Cleaning is two calls: convert_product_weights runs first and its result is
# what clean_products_data receives.
cleaned_prod_w = data_cleaner.convert_product_weights(prods_df)
fully_cleaned_prods_df = data_cleaner.clean_products_data(cleaned_prod_w)

#Upload to dim_products
db_connector_instance._upload_to_db(df=fully_cleaned_prods_df, table_name='dim_products')

In [None]:
#Orders Table
# Extract -> clean -> upload for the orders; each step feeds the next.
#Extract
# Rebinds the notebook-level table_name that the Legacy Users cell set to 'legacy_users'.
table_name = 'orders_table'
orders_df = data_extractor.read_rds_table(table_name)

#Clean
fully_cleaned_orders_df = data_cleaner.clean_orders_df(orders_df)

#Upload to orders_table
# The upload target has the same name as the table read above.
# NOTE(review): whether source and target are the same database cannot be told
# from this notebook - confirm that the upload cannot overwrite the source table.
db_connector_instance._upload_to_db(df=fully_cleaned_orders_df, table_name='orders_table')

In [None]:
#Sales Date Times
# Extract -> clean -> upload for the date details; each step feeds the next.
#Extract
# s3_url is the HTTPS URL of date_details.json assigned in the details cell.
sdt_df = data_extractor.extract_sdt(s3_url)

#Clean
fully_cleaned_sdt_df = data_cleaner.clean_sdt_df(sdt_df)

#Upload to dim_date_times
db_connector_instance._upload_to_db(df=fully_cleaned_sdt_df, table_name='dim_date_times')