# Transforming data between bronze and silver layers - Data Lake

### Configuring access to the Minio/S3 object repository

In [None]:
! pip install -r requirements.txt

In [2]:
from toml import load
from minio import Minio
from io import BytesIO
import pandas as pd


with open("credentials-ex.toml", "r") as toml_file:
    credentials = load(toml_file)

access_key = credentials.get("minio_credentials").get("access_key")
secret_key = credentials.get("minio_credentials").get("secret_key")
url_minio = credentials.get("minio_credentials").get("url_minio")

minio_client = Minio(
    url_minio,
    access_key=access_key,
    secret_key=secret_key,
    secure=False,
    
) 


def import_objects_bucket(bucket_name: str, path: str, sep: str=";") -> object:
    """
    Import all files in the bucket
    """
    objects = minio_client.list_objects(bucket_name, prefix=f"{path}/")

    list_dataframes = []

    for obj in objects:
        file_name = obj.object_name

        data = minio_client.get_object(bucket_name, file_name)

        tmp_dataframe = pd.read_csv(BytesIO(data.read()), sep=sep)

        list_dataframes.append(tmp_dataframe)

    return pd.concat(list_dataframes, ignore_index=True)


### Importing bronze layer data for processing - CUSTOMER

In [7]:
customers_dataframe = import_objects_bucket("bronze", "customer")

In [8]:
customers_dataframe

Unnamed: 0,customer_id,client_name,email_address,phone_number,registration_date,date_of_birth,sex,address,city,state,customer_category
0,1,Cobbie Crackett,ccrackett0@scientificamerican.com,833-662-1463,2021-03-13,1996-10-08,Male,58 Florence Way,Madīnat Ḩamad,,new
1,2,Rianon Jaquet,rjaquet1@xinhuanet.com,153-712-5740,2018-11-08,1972-11-11,Female,478 Anniversary Junction,Balakhta,,regular
2,3,Upton Driffill,udriffill2@go.com,734-247-9429,2010-05-06,2003-04-05,Male,8 Dexter Road,Shancheng,,VIP
3,4,Corenda Withringten,cwithringten3@stumbleupon.com,695-445-0470,2016-07-30,1998-10-03,Female,86 Bobwhite Hill,Little Current,Ontario,new
4,5,Brett Heinish,bheinish4@squarespace.com,755-447-4251,2020-06-13,1978-10-20,Male,4037 Melrose Park,Oxelösund,Södermanland,VIP
...,...,...,...,...,...,...,...,...,...,...,...
995,996,Doralia Strapp,dstrapprn@google.cn,638-664-1281,2019-12-22,1995-09-17,Female,200 Loftsgordon Circle,Paris 17,Île-de-France,regular
996,997,Gran Putland,gputlandro@foxnews.com,349-393-4818,2011-02-16,1969-12-08,Male,6692 Sherman Road,Zorgo,,new
997,998,Danila Issard,dissardrp@rediff.com,974-675-2181,2013-08-16,1994-08-27,Genderqueer,7641 Porter Pass,Tubod,,new
998,999,Goober Tanton,gtantonrq@discovery.com,979-634-0457,2014-10-30,1979-03-10,Male,589 Rusk Way,Daming,,new


### Importing bronze layer data for processing - SALES

In [9]:
sales_dataframe = import_objects_bucket("bronze", "sales")

In [10]:
sales_dataframe

Unnamed: 0,sale_id,customer_id,sale_date,sale_time,sale_value,product_sold,payment_method,sale_status
0,1,358,2023-06-08,16:03,754.26,product2,credit_card,completed
1,2,153,2023-04-26,11:27,2919.10,product1,cash,pending
2,3,257,2023-07-16,1:20,5483.67,product2,cash,cancelled
3,4,392,2023-05-14,7:30,9524.27,product2,credit_card,cancelled
4,5,22,2023-07-05,16:57,2452.25,product1,cash,pending
...,...,...,...,...,...,...,...,...
995,996,320,2023-01-29,1:53,2016.03,product1,paypal,cancelled
996,997,266,2023-07-16,7:05,6200.33,product1,paypal,cancelled
997,998,169,2023-08-25,11:56,6068.44,product3,cash,completed
998,999,15,2023-06-02,4:14,7731.17,product1,credit_card,pending


### Importing bronze layer data for processing - LOGS

In [12]:
logs_dataframe = import_objects_bucket("bronze", "logs")

In [13]:
logs_dataframe

Unnamed: 0,log_id,customer_id,access_date,access_time,pages_visited,device,session_duration,source_ip
0,1,406,2022-12-14,15:11,products,tablet,2601,49.249.185.98
1,2,52,2023-05-30,7:50,products,desktop,518,97.145.73.147
2,3,366,2023-06-18,20:21,home page,tablet,3367,246.108.79.188
3,4,387,2023-10-06,12:21,policies,desktop,1057,178.72.69.237
4,5,478,2023-05-25,22:08,home page,mobile,2983,100.145.156.36
...,...,...,...,...,...,...,...,...
995,996,302,2023-10-10,8:47,home page,mobile,2365,219.115.33.237
996,997,451,2023-05-04,22:26,terms of use,mobile,3005,229.118.98.150
997,998,165,2023-01-13,20:52,home page,mobile,2895,62.181.127.188
998,999,129,2023-10-22,14:01,terms of use,desktop,1644,170.189.233.93


# Preparation of the data transformation environment

In [14]:
import duckdb

#connection = duckdb.connect(database=":memory:", read_only=False)
connection = duckdb.connect(database="data-lake.db", read_only=False)

# Registering dataframes as tables in DuckDB

connection.register("tb_customers", customers_dataframe)
connection.register("tb_sales", sales_dataframe)
connection.register("tb_logs", logs_dataframe)

<duckdb.duckdb.DuckDBPyConnection at 0x7f457ebb3330>

### Query execution test

In [15]:
query = """
SET memory_limit='10GB';
-- SELECT * FROM tb_customers;
-- USE memory;
--  SHOW TABLES;
"""
result_query = connection.execute(query).df()
result_query

Unnamed: 0,Success


### Transformation of the sales base

In [16]:
query = (
"""
SELECT  customer_id,
        count(*) count_sales,
        min(sale_datetime) frist_sale_datetime,
        max(sale_datetime) last_sale_datetime,
        sum(sale_value) amount,
        sum(amount_sold) as amount_sold,
        sum(amount_pending) as amount_pending,
        sum(amount_cancelled) as amount_cancelled,
        string_agg(product_and_status, ', ') list_of_products_and_status
FROM (
    SELECT  customer_id,
            cast(concat(sale_date, ' ', lpad(sale_time, 5, '0'), ':00') as timestamp) sale_datetime,
            sale_value,
            case when sale_status = 'completed' then sale_value else 0.00 end as amount_sold,
            case when sale_status = 'pending' then sale_value else 0.00 end as amount_pending,
            case when sale_status = 'cancelled' then sale_value else 0.00 end as amount_cancelled,
            concat(product_sold, '-', sale_status) product_and_status
    FROM tb_sales
    --WHERE customer_id = 487
    )
GROUP BY customer_id
ORDER BY 2 DESC;
"""
)

result_transformation_sales = connection.execute(query).df()
result_transformation_sales

Unnamed: 0,customer_id,count_sales,frist_sale_datetime,last_sale_datetime,amount,amount_sold,amount_pending,amount_cancelled,list_of_products_and_status
0,229,8,2023-01-26 11:39:00,2023-12-19 12:33:00,36080.35,3631.60,26282.18,6166.57,"product1-pending, product3-pending, product1-c..."
1,261,6,2023-02-22 16:45:00,2023-12-20 06:12:00,34968.75,10179.20,15878.94,8910.61,"product2-completed, product1-pending, product1..."
2,168,6,2023-01-21 08:10:00,2023-11-04 00:02:00,18987.86,9586.09,8627.86,773.91,"product1-completed, product2-pending, product3..."
3,20,6,2023-02-05 08:24:00,2023-10-08 00:06:00,36094.49,177.53,9246.19,26670.77,"product2-pending, product1-cancelled, product2..."
4,304,6,2023-03-15 14:02:00,2023-10-21 23:48:00,37194.21,0.00,27403.99,9790.22,"product2-pending, product3-pending, product3-p..."
...,...,...,...,...,...,...,...,...,...
426,417,1,2023-11-15 00:14:00,2023-11-15 00:14:00,7027.41,0.00,0.00,7027.41,product3-cancelled
427,135,1,2023-08-24 11:31:00,2023-08-24 11:31:00,9179.96,0.00,0.00,9179.96,product3-cancelled
428,313,1,2023-10-06 07:36:00,2023-10-06 07:36:00,3159.38,0.00,0.00,3159.38,product2-cancelled
429,408,1,2023-11-22 17:59:00,2023-11-22 17:59:00,8650.60,8650.60,0.00,0.00,product3-completed


### Transformation of the logs base

In [30]:
query = (
"""
-- SELECT * FROM logs_dataframe;

SELECT  customer_id,
        min(cast(concat(access_date, ' ', lpad(access_time, 5, '0'), ':00') as datetime)) frist_access,
        max(cast(concat(access_date, ' ', lpad(access_time, 5, '0'), ':00') as datetime)) last_access,
        string_agg(concat(replace(pages_visited, ' ', '_'), '-', device, '-', source_ip), ', ') list_of_access_origin,
        round((avg(session_duration) / 60), 0) avg_navigation_in_minutes
FROM logs_dataframe
GROUP BY customer_id;
"""
)

result_transformation_logs = connection.execute(query).df()
result_transformation_logs

Unnamed: 0,customer_id,frist_access,last_access,list_of_access_origin,avg_navigation_in_minutes
0,406,2022-12-14 15:11:00,2023-05-22 20:22:00,"products-tablet-49.249.185.98, policies-deskto...",33.0
1,52,2022-12-31 02:28:00,2023-10-28 05:12:00,"products-desktop-97.145.73.147, terms_of_use-m...",33.0
2,366,2023-01-15 04:50:00,2023-06-18 20:21:00,"home_page-tablet-246.108.79.188, terms_of_use-...",26.0
3,387,2022-11-13 18:56:00,2023-10-06 12:21:00,"policies-desktop-178.72.69.237, terms_of_use-t...",12.0
4,478,2023-05-06 22:45:00,2023-09-14 01:30:00,"home_page-mobile-100.145.156.36, terms_of_use-...",45.0
...,...,...,...,...,...
427,272,2022-11-18 21:30:00,2022-11-18 21:30:00,policies-desktop-128.182.24.44,25.0
428,45,2023-06-25 23:40:00,2023-06-25 23:40:00,home_page-desktop-165.22.148.224,21.0
429,426,2023-09-24 02:02:00,2023-09-24 02:02:00,policies-desktop-206.11.16.7,44.0
430,451,2023-05-04 22:26:00,2023-05-04 22:26:00,terms_of_use-mobile-229.118.98.150,50.0


### Consolidation of customer bases, sales and logs into an OBT - One Big Table for silver tier

In [31]:
# Registering dataframes as new tables in DuckDB

connection.register("tb_transformation_sales", result_transformation_sales)
connection.register("tb_transformation_logs", result_transformation_logs)

<duckdb.duckdb.DuckDBPyConnection at 0x7f457ebb3330>

In [38]:
query = (
"""
SELECT  t1.customer_id,
        t1.client_name,
        t1.email_address,
        t1.phone_number,
        t1.registration_date,
        t1.date_of_birth,
        t1.sex,
        t1.address,
        t1.city,
        coalesce(t1.state, 'Not Found') state,
        t1.customer_category,
        t2.count_sales,
        t2.frist_sale_datetime,
        t2.last_sale_datetime,
        t2.amount,
        t2.amount_sold,
        t2.amount_pending,
        t2.amount_cancelled,
        t2.list_of_products_and_status
FROM tb_customers t1
LEFT JOIN tb_transformation_sales t2
ON t1.customer_id = t2.customer_id
"""
)

result_consolidation_obt = connection.execute(query).df()
result_consolidation_obt

Unnamed: 0,customer_id,client_name,email_address,phone_number,registration_date,date_of_birth,sex,address,city,state,customer_category,count_sales,frist_sale_datetime,last_sale_datetime,amount,amount_sold,amount_pending,amount_cancelled,list_of_products_and_status
0,1,Cobbie Crackett,ccrackett0@scientificamerican.com,833-662-1463,2021-03-13,1996-10-08,Male,58 Florence Way,Madīnat Ḩamad,Not Found,new,3.0,2023-03-24 14:55:00,2023-12-30 06:07:00,12286.95,0.00,11483.89,803.06,"product2-pending, product3-cancelled, product3..."
1,2,Rianon Jaquet,rjaquet1@xinhuanet.com,153-712-5740,2018-11-08,1972-11-11,Female,478 Anniversary Junction,Balakhta,Not Found,regular,3.0,2023-04-23 06:22:00,2023-12-07 09:13:00,19542.87,0.00,14608.01,4934.86,"product1-pending, product3-cancelled, product2..."
2,3,Upton Driffill,udriffill2@go.com,734-247-9429,2010-05-06,2003-04-05,Male,8 Dexter Road,Shancheng,Not Found,VIP,2.0,2023-02-18 08:47:00,2023-10-23 21:51:00,7078.00,4251.60,0.00,2826.40,"product1-cancelled, product1-completed"
3,4,Corenda Withringten,cwithringten3@stumbleupon.com,695-445-0470,2016-07-30,1998-10-03,Female,86 Bobwhite Hill,Little Current,Ontario,new,2.0,2023-03-11 22:31:00,2023-05-18 09:10:00,18091.23,8156.11,9935.12,0.00,"product3-pending, product1-completed"
4,6,Towny Rive,trive5@dion.ne.jp,556-229-5063,2015-09-28,1961-03-20,Male,0 Evergreen Hill,Karangpeton,Not Found,regular,3.0,2023-02-04 07:44:00,2023-06-19 18:21:00,16663.00,0.00,7169.85,9493.15,"product2-cancelled, product1-cancelled, produc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Doralia Strapp,dstrapprn@google.cn,638-664-1281,2019-12-22,1995-09-17,Female,200 Loftsgordon Circle,Paris 17,Île-de-France,regular,,NaT,NaT,,,,,
996,997,Gran Putland,gputlandro@foxnews.com,349-393-4818,2011-02-16,1969-12-08,Male,6692 Sherman Road,Zorgo,Not Found,new,,NaT,NaT,,,,,
997,998,Danila Issard,dissardrp@rediff.com,974-675-2181,2013-08-16,1994-08-27,Genderqueer,7641 Porter Pass,Tubod,Not Found,new,,NaT,NaT,,,,,
998,999,Goober Tanton,gtantonrq@discovery.com,979-634-0457,2014-10-30,1979-03-10,Male,589 Rusk Way,Daming,Not Found,new,,NaT,NaT,,,,,
