In [52]:
# installing necessary packages

#!pip install pandas azure-storage-blob python-dotenv

In [53]:
# importing necessary libraries
import pandas as pd
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
import os 

In [54]:
# Data Extraction
# Read the raw transaction export into `data`, the frame every later cell
# works on.
try:
    data = pd.read_csv('zipco_transaction.csv')
except Exception as e:
    # Report the failure, then re-raise it: if the error were swallowed here,
    # `data` would stay undefined and later cells would fail with a
    # NameError that hides the real cause.
    print(f"an error occured : {e}")
    raise
else:
    print('Data Extracted successfully!')

Data Extracted successfully!


In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
data.info()

In [56]:
# Discard rows that are exact duplicates of an earlier row (first one is kept)
data = data.drop_duplicates()

In [57]:
# Impute missing numeric values: each float64/int64 column has its NaNs
# replaced by that column's own mean.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data.fillna(
    {name: data[name].mean() for name in numeric_columns},
    inplace=True,
)
    
    

In [None]:
# Handle missing values in text columns: every object-dtype column has its
# NaNs replaced by the placeholder 'unknown'.
string_column = data.select_dtypes(include=['object']).columns
for col in string_column:
    data.fillna({col: 'unknown'}, inplace=True)

# Summarise the cleaned frame once, after all columns have been filled.
# This call sits outside the loop so the report is not repeated for every
# text column and is still shown when there are no text columns.
data.info()

In [59]:
# Parse the 'Date' column into pandas datetimes and store it back in place
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates

In [None]:
# Dimension tables
# Products: one row per distinct (ProductName, UnitPrice) pair. Product_ID is
# the row position after de-duplication and ends up as the first column.
products = (
    data[['ProductName', 'UnitPrice']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('Product_ID')
    .reset_index()
)
products.head(15)

In [None]:
# Customers: one row per distinct combination of name, address, phone number
# and email. Customer_ID is the row position after de-duplication and ends up
# as the first column.
customer_fields = ['CustomerName', 'CustomerAddress', 'Customer_PhoneNumber', 'CustomerEmail']
customers = (
    data[customer_fields]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('Customer_ID')
    .reset_index()
)
customers.head(15)

In [None]:
# Staff: one row per distinct (Staff_Name, Staff_Email) pair. Staff_ID is the
# row position after de-duplication and ends up as the first column.
Staff = (
    data[['Staff_Name', 'Staff_Email']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('Staff_ID')
    .reset_index()
)
Staff.head(15)

In [None]:
# Display the column labels of the cleaned frame
data.columns

In [64]:
# Transactions table: attach the surrogate keys from the three dimension
# tables to every row of `data` (left joins on the columns each dimension was
# built from), number the rows as Transaction_ID, then keep only the key and
# measure columns.
transaction_columns = [
    'Date', 'Transaction_ID', 'Product_ID', 'Customer_ID', 'Staff_ID',
    'Quantity', 'OrderType', 'StoreLocation', 'PaymentType',
    'PromotionApplied', 'Weather', 'Temperature', 'StaffPerformanceRating',
    'CustomerFeedback', 'DeliveryTime_min', 'DayOfWeek', 'TotalSales',
]
Transaction = (
    data
    .merge(products, how='left', on=['ProductName', 'UnitPrice'])
    .merge(
        customers,
        how='left',
        on=['CustomerName', 'CustomerAddress', 'Customer_PhoneNumber', 'CustomerEmail'],
    )
    .merge(Staff, how='left', on=['Staff_Name', 'Staff_Email'])
    .rename_axis('Transaction_ID')
    .reset_index()
)
Transaction = Transaction[transaction_columns]

In [66]:
# Write the cleaned frame and each derived table to a local CSV file,
# leaving the DataFrame index out of the output.
for frame, csv_path in (
    (data, 'clean_data.csv'),
    (Transaction, 'Transaction.csv'),
    (products, 'products.csv'),
    (customers, 'customers.csv'),
    (Staff, 'Staff.csv'),
):
    frame.to_csv(csv_path, index=False)

In [67]:
# Data loading to azure
# Load variables from a local .env file into the process environment, then
# read the two settings the upload cell needs.
load_dotenv()
connect_str = os.getenv('CONNECT_STR')
container_name = os.getenv('CONTAINER_NAME')

# Fail fast if either setting is absent or empty. os.getenv returns None for
# an unset variable, which would otherwise only show up later as an unclear
# error from the storage client. Only the variable names are reported, never
# their values, because the connection string is a secret.
missing_settings = [
    env_name
    for env_name, env_value in (
        ('CONNECT_STR', connect_str),
        ('CONTAINER_NAME', container_name),
    )
    if not env_value
]
if missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): '
        + ', '.join(missing_settings)
        + '. Define them in the environment or in a .env file.'
    )

In [None]:
# Build a service client from the connection string and get a client for the
# target container.
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client(container_name)

# (DataFrame, destination blob name) pairs. The 'rawdata/' and 'cleandata/'
# prefixes act as folders inside the container.
files = [
    (data, 'rawdata/clean_zipco_Transaction_data.csv'),
    (products, 'cleandata/products.csv'),
    (customers, 'cleandata/customers.csv'),
    (Staff, 'cleandata/Staff.csv'),
    (Transaction, 'cleandata/Transaction.csv'),
]

# Serialise each frame to CSV text in memory and upload it, replacing any
# existing blob of the same name.
for frame, blob_name in files:
    target_blob = container_client.get_blob_client(blob_name)
    csv_text = frame.to_csv(index=False)
    target_blob.upload_blob(csv_text, overwrite=True)
    print(f'{blob_name} loaded into azure storage')