# Transforming data between bronze and silver layers - Data Lake

### Configuring access to the Minio/S3 object repository

In [None]:
! pip install -r ../requirements.txt

In [17]:
from toml import load
from minio import Minio
from io import BytesIO
import pandas as pd

with open("credentials.toml", "r") as toml_file:
    credentials = load(toml_file)

access_key = credentials.get("minio_credentials").get("access_key")
secret_key = credentials.get("minio_credentials").get("secret_key")
url_minio = credentials.get("minio_credentials").get("url_minio")

minio_client = Minio(
    url_minio,
    access_key=access_key,
    secret_key=secret_key,
    secure=False,
    
) 


### Importing bronze layer data for processing - CUSTOMER

In [40]:
bucket_name = "bronze"

objects = minio_client.list_objects(bucket_name, prefix="customer/")

list_dataframes = []

# Import all files in the bucket

for obj in objects:
    file_name = obj.object_name

    data = minio_client.get_object(bucket_name, file_name)

    tmp_dataframe = pd.read_csv(BytesIO(data.read()), sep=";")

    list_dataframes.append(tmp_dataframe)

customers_dataframe = pd.concat(list_dataframes, ignore_index=True)

del list_dataframes
del tmp_dataframe

In [41]:
customers_dataframe

Unnamed: 0,customer_id,client_name,email_address,phone_number,registration_date,date_of_birth,sex,address,city,state,customer_category
0,1,Cobbie Crackett,ccrackett0@scientificamerican.com,833-662-1463,2021-03-13,1996-10-08,Male,58 Florence Way,Madīnat Ḩamad,,new
1,2,Rianon Jaquet,rjaquet1@xinhuanet.com,153-712-5740,2018-11-08,1972-11-11,Female,478 Anniversary Junction,Balakhta,,regular
2,3,Upton Driffill,udriffill2@go.com,734-247-9429,2010-05-06,2003-04-05,Male,8 Dexter Road,Shancheng,,VIP
3,4,Corenda Withringten,cwithringten3@stumbleupon.com,695-445-0470,2016-07-30,1998-10-03,Female,86 Bobwhite Hill,Little Current,Ontario,new
4,5,Brett Heinish,bheinish4@squarespace.com,755-447-4251,2020-06-13,1978-10-20,Male,4037 Melrose Park,Oxelösund,Södermanland,VIP
...,...,...,...,...,...,...,...,...,...,...,...
995,996,Doralia Strapp,dstrapprn@google.cn,638-664-1281,2019-12-22,1995-09-17,Female,200 Loftsgordon Circle,Paris 17,Île-de-France,regular
996,997,Gran Putland,gputlandro@foxnews.com,349-393-4818,2011-02-16,1969-12-08,Male,6692 Sherman Road,Zorgo,,new
997,998,Danila Issard,dissardrp@rediff.com,974-675-2181,2013-08-16,1994-08-27,Genderqueer,7641 Porter Pass,Tubod,,new
998,999,Goober Tanton,gtantonrq@discovery.com,979-634-0457,2014-10-30,1979-03-10,Male,589 Rusk Way,Daming,,new


### Importing bronze layer data for processing - SALES

In [37]:
bucket_name = "bronze"

objects = minio_client.list_objects(bucket_name, prefix="sales/")

list_dataframes = []

# Import all files in the bucket

for obj in objects:
    file_name = obj.object_name

    data = minio_client.get_object(bucket_name, file_name)

    tmp_dataframe = pd.read_csv(BytesIO(data.read()), sep=";")

    list_dataframes.append(tmp_dataframe)

sales_dataframe = pd.concat(list_dataframes, ignore_index=True)

del list_dataframes
del tmp_dataframe

In [38]:
sales_dataframe

Unnamed: 0,sale_id,customer_id,sale_date,sale_time,sale_value,product_sold,payment_method,sale_status
0,1,358,2023-06-08,16:03,754.26,product2,credit_card,completed
1,2,153,2023-04-26,11:27,2919.10,product1,cash,pending
2,3,257,2023-07-16,1:20,5483.67,product2,cash,cancelled
3,4,392,2023-05-14,7:30,9524.27,product2,credit_card,cancelled
4,5,22,2023-07-05,16:57,2452.25,product1,cash,pending
...,...,...,...,...,...,...,...,...
995,996,320,2023-01-29,1:53,2016.03,product1,paypal,cancelled
996,997,266,2023-07-16,7:05,6200.33,product1,paypal,cancelled
997,998,169,2023-08-25,11:56,6068.44,product3,cash,completed
998,999,15,2023-06-02,4:14,7731.17,product1,credit_card,pending


### Importing bronze layer data for processing - LOGS

In [35]:
bucket_name = "bronze"

objects = minio_client.list_objects(bucket_name, prefix="logs/")

list_dataframes = []

# Import all files in the bucket

for obj in objects:
    file_name = obj.object_name

    data = minio_client.get_object(bucket_name, file_name)

    tmp_dataframe = pd.read_csv(BytesIO(data.read()), sep=";")

    list_dataframes.append(tmp_dataframe)

logs_dataframe = pd.concat(list_dataframes, ignore_index=True)

del list_dataframes
del tmp_dataframe

In [39]:
logs_dataframe

Unnamed: 0,log_id,customer_id,access_date,access_time,pages_visited,device,session_duration,source_ip
0,1,406,2022-12-14,15:11,products,tablet,2601,49.249.185.98
1,2,52,2023-05-30,7:50,products,desktop,518,97.145.73.147
2,3,366,2023-06-18,20:21,home page,tablet,3367,246.108.79.188
3,4,387,2023-10-06,12:21,policies,desktop,1057,178.72.69.237
4,5,478,2023-05-25,22:08,home page,mobile,2983,100.145.156.36
...,...,...,...,...,...,...,...,...
995,996,302,2023-10-10,8:47,home page,mobile,2365,219.115.33.237
996,997,451,2023-05-04,22:26,terms of use,mobile,3005,229.118.98.150
997,998,165,2023-01-13,20:52,home page,mobile,2895,62.181.127.188
998,999,129,2023-10-22,14:01,terms of use,desktop,1644,170.189.233.93


# Preparation of the data transformation environment

In [42]:
import duckdb

connection = duckdb.connect(database=":memory:", read_only=False)

# Registering dataframes as tables in DuckDB

connection.register("tb_customers", customers_dataframe)
connection.register("tb_sales", sales_dataframe)
connection.register("tb_logs", logs_dataframe)

<duckdb.duckdb.DuckDBPyConnection at 0x7f9e45989030>

### Query execution test

In [51]:
query = """
SET memory_limit='10GB';
-- SELECT * FROM tb_customers;
-- USE memory;
--  SHOW TABLES;
"""
result_query = connection.execute(query).df()
result_query

Unnamed: 0,name
0,tb_customers
1,tb_logs
2,tb_sales


### Transformation of the sales base

In [57]:
query = (
"""
SELECT  customer_id,
        count(*) count_sales,
        min(sale_datetime) frist_sale_datetime,
        max(sale_datetime) last_sale_datetime,
        sum(sale_value) amount,
        sum(amount_sold) as amount_sold,
        sum(amount_pending) as amount_pending,
        sum(amount_cancelled) as amount_cancelled,
        string_agg(product_and_status, ', ') list_of_products_and_status
FROM (
    SELECT  customer_id,
            cast(concat(sale_date, ' ', lpad(sale_time, 5, '0'), ':00') as timestamp) sale_datetime,
            sale_value,
            case when sale_status = 'completed' then sale_value else 0.00 end as amount_sold,
            case when sale_status = 'pending' then sale_value else 0.00 end as amount_pending,
            case when sale_status = 'cancelled' then sale_value else 0.00 end as amount_cancelled,
            concat(product_sold, '-', sale_status) product_and_status
    FROM tb_sales
    --WHERE customer_id = 487
    )
GROUP BY customer_id
ORDER BY 2 DESC;
"""
)

result_transformation_sales = connection.execute(query).df()
result_transformation_sales

Unnamed: 0,customer_id,count_sales,frist_sale_datetime,last_sale_datetime,amount,amount_sold,amount_pending,amount_cancelled,list_of_products_and_status
0,229,8,2023-01-26 11:39:00,2023-12-19 12:33:00,36080.35,3631.60,26282.18,6166.57,"product1-pending, product3-pending, product1-c..."
1,261,6,2023-02-22 16:45:00,2023-12-20 06:12:00,34968.75,10179.20,15878.94,8910.61,"product2-completed, product1-pending, product1..."
2,168,6,2023-01-21 08:10:00,2023-11-04 00:02:00,18987.86,9586.09,8627.86,773.91,"product1-completed, product2-pending, product3..."
3,20,6,2023-02-05 08:24:00,2023-10-08 00:06:00,36094.49,177.53,9246.19,26670.77,"product2-pending, product1-cancelled, product2..."
4,304,6,2023-03-15 14:02:00,2023-10-21 23:48:00,37194.21,0.00,27403.99,9790.22,"product2-pending, product3-pending, product3-p..."
...,...,...,...,...,...,...,...,...,...
426,417,1,2023-11-15 00:14:00,2023-11-15 00:14:00,7027.41,0.00,0.00,7027.41,product3-cancelled
427,135,1,2023-08-24 11:31:00,2023-08-24 11:31:00,9179.96,0.00,0.00,9179.96,product3-cancelled
428,313,1,2023-10-06 07:36:00,2023-10-06 07:36:00,3159.38,0.00,0.00,3159.38,product2-cancelled
429,408,1,2023-11-22 17:59:00,2023-11-22 17:59:00,8650.60,8650.60,0.00,0.00,product3-completed


### Transformation of the logs base

In [None]:
query = (
"""
SELECT  ...
"""
)

result_transformation_logs = connection.execute(query).df()
result_transformation_logs

### Consolidation of customer bases, sales and logs into an OBT - One Big Table for silver tier

In [None]:
# Registering dataframes as new tables in DuckDB

connection.register("tb_transformation_sales", result_transformation_sales)
connection.register("tb_transformation_logs", result_transformation_logs)

In [None]:
query = (
"""
SELECT  ...
"""
)

result_consolidation_obt = connection.execute(query).df()
result_consolidation_obt