# ETL Quality Checking
A notebook for checking the results of the ETL query process.

Scripts :
* dim_populate.sql
* fct_populate.sql

## Notebook Sanity Check

In [15]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into os.environ.
# verbose=True makes python-dotenv warn when no .env file is found.
load_dotenv(verbose=True)

# Connection settings come from the environment so that credentials are never
# committed together with the notebook. The variable names below are a
# suggested convention -- TODO confirm they match the project's .env file.
DBNAME = os.getenv("DB_NAME", "ecommerce")
HOSTNAME = os.getenv("DB_HOST", "localhost")
USER = os.getenv("DB_USER", "postgres")

# The password deliberately has no default: fail fast with a clear message
# instead of falling back to a hard-coded secret.
PASS = os.getenv("DB_PASS")
if PASS is None:
    raise RuntimeError("DB_PASS is not set; add it to your .env file or environment")

In [16]:
# Sanity check: confirm the configuration cell has run and the host is set.
print(HOSTNAME)

localhost


In [17]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

from urllib.parse import quote

# Percent-encode the credentials so that characters such as "@", ":" or "/"
# inside the user name or password cannot corrupt the connection URL.
db_url = (
    f"postgresql+psycopg2://{quote(USER, safe='')}:{quote(PASS, safe='')}"
    f"@{HOSTNAME}/{DBNAME}"
)

# Create an engine instance (pool_recycle=3600: pooled connections older than
# one hour are recycled).
alchemyEngine = create_engine(db_url, pool_recycle=3600)

# Connect to PostgreSQL server; this connection is reused by the read cells below.
conn = alchemyEngine.connect()

In [18]:
# Showing Missing Data
def show_missing_data(df):
    """Print the shape of ``df`` followed by its per-column null counts.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The report is only written to stdout.
    """
    # print() joins its arguments with a single space, which reproduces the
    # "label : value" layout.
    print("Shape :", df.shape)
    print("Missing Data :", df.isnull().sum())

In [19]:
# Global variable: database schema passed to every read_sql_table call below.
schema = "warehouse"

## Date dimension

In [20]:
# Load the date dimension and summarise its columns, dtypes and non-null counts.
date = pd.read_sql_table(table_name="dim_date", con=conn, schema=schema)
date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3651 entries, 0 to 3650
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date_id                 3651 non-null   object        
 1   date                    3651 non-null   datetime64[ns]
 2   day_name                3651 non-null   object        
 3   day_of_week             3651 non-null   int64         
 4   day_of_month            3651 non-null   int64         
 5   day_of_quarter          3651 non-null   int64         
 6   day_of_year             3651 non-null   float64       
 7   week_of_month           3651 non-null   int64         
 8   week_of_year            3651 non-null   float64       
 9   month_actual            3651 non-null   float64       
 10  month_name              3651 non-null   object        
 11  month_name_abbreviated  3651 non-null   object        
 12  quarter                 3651 non-null   float64 

In [21]:
# Report the shape and per-column null counts of the date dimension.
show_missing_data(date)

Shape : (3651, 15)
Missing Data : date_id                   0
date                      0
day_name                  0
day_of_week               0
day_of_month              0
day_of_quarter            0
day_of_year               0
week_of_month             0
week_of_year              0
month_actual              0
month_name                0
month_name_abbreviated    0
quarter                   0
year                      0
isWeekend                 0
dtype: int64


In [22]:
# Preview the first five rows of the date dimension.
date.head(5)

Unnamed: 0,date_id,date,day_name,day_of_week,day_of_month,day_of_quarter,day_of_year,week_of_month,week_of_year,month_actual,month_name,month_name_abbreviated,quarter,year,isWeekend
0,20150101,2015-01-01,Thursday,4,1,1,1.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
1,20150102,2015-01-02,Friday,5,2,2,2.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
2,20150103,2015-01-03,Saturday,6,3,3,3.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
3,20150104,2015-01-04,Sunday,7,4,4,4.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
4,20150105,2015-01-05,Monday,1,5,5,5.0,1,2.0,1.0,January,Jan,1.0,2015.0,False


In [23]:
# Preview the last five rows of the date dimension.
# NOTE(review): the last row shown is 2024-12-29 rather than 2024-12-31 --
# confirm the generated date range is intended.
date.tail(5)

Unnamed: 0,date_id,date,day_name,day_of_week,day_of_month,day_of_quarter,day_of_year,week_of_month,week_of_year,month_actual,month_name,month_name_abbreviated,quarter,year,isWeekend
3646,20241225,2024-12-25,Wednesday,3,25,86,360.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3647,20241226,2024-12-26,Thursday,4,26,87,361.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3648,20241227,2024-12-27,Friday,5,27,88,362.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3649,20241228,2024-12-28,Saturday,6,28,89,363.0,4,52.0,12.0,December,Dec,4.0,2024.0,True
3650,20241229,2024-12-29,Sunday,7,29,90,364.0,5,52.0,12.0,December,Dec,4.0,2024.0,True


## User dimension

In [24]:
# Load the user dimension and summarise its columns, dtypes and non-null counts.
user = pd.read_sql_table(table_name="dim_user", con=conn, schema=schema)
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96096 entries, 0 to 96095
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             96096 non-null  int64  
 1   user_name           96096 non-null  object 
 2   total_order         96096 non-null  float64
 3   total_spending      96096 non-null  float64
 4   is_current_version  96096 non-null  bool   
dtypes: bool(1), float64(2), int64(1), object(1)
memory usage: 3.0+ MB


In [25]:
show_missing_data(user)

# Note: a few users (1113 to be exact) did not make any transaction during the period covered by the collected data.

Shape : (96096, 5)
Missing Data : user_id               0
user_name             0
total_order           0
total_spending        0
is_current_version    0
dtype: int64


In [26]:
# Preview the first rows of the user dimension.
user.head()

Unnamed: 0,user_id,user_name,total_order,total_spending,is_current_version
0,1,00053a61a98854899e70ed204dd4bafe,1.0,419180.0,True
1,2,0005e1862207bf6ccc02e4228effd9a0,1.0,150120.0,True
2,3,00090324bbad0e9342388303bb71ba0a,1.0,63660.0,True
3,4,000bfa1d2f1a41876493be685390d6d3,1.0,46850.0,True
4,5,000c8bdb58a29e7115cfc257230fb21b,1.0,29000.0,True


In [27]:
# Preview the last rows of the user dimension.
# NOTE(review): these rows show total_order = 1.0 together with
# total_spending = 0.0 -- confirm how users without transactions are populated.
user.tail()

Unnamed: 0,user_id,user_name,total_order,total_spending,is_current_version
96091,96092,1b32669eb9662ee904419de883e59a58,1.0,0.0,True
96092,96093,5942dde582a33e31ea4471bc5363b0f3,1.0,0.0,True
96093,96094,daba2e7a00c149161c68cbb18db656a9,1.0,0.0,True
96094,96095,7d373e92dd3086b4c37e9868fc8999c1,1.0,0.0,True
96095,96096,c1653f4d5fcfb808bee93bb3e1aa4744,1.0,0.0,True


## Product dimension

In [28]:
# Load the product dimension and summarise its columns, dtypes and non-null counts.
product = pd.read_sql_table(table_name="dim_product", con=conn, schema=schema)
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id_surr             32951 non-null  int64  
 1   product_id                  32951 non-null  object 
 2   product_category            32328 non-null  object 
 3   product_name_length         32341 non-null  float64
 4   product_description_length  32341 non-null  float64
 5   product_photos_qty          32341 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_weight_g            32949 non-null  float64
 8   product_height_cm           32949 non-null  float64
 9   product_width_cm            32949 non-null  float64
 10  is_current_version          32951 non-null  bool   
dtypes: bool(1), float64(7), int64(1), object(2)
memory usage: 2.5+ MB


In [29]:
# Several product columns contain missing values; report the per-column counts.
show_missing_data(product)

Shape : (32951, 11)
Missing Data : product_id_surr                 0
product_id                      0
product_category              623
product_name_length           610
product_description_length    610
product_photos_qty            610
product_length_cm               2
product_weight_g                2
product_height_cm               2
product_width_cm                2
is_current_version              0
dtype: int64


In [30]:
# Note: product_name_length, product_description_length and product_photos_qty
# have the same number of missing values; select the rows where all three are
# missing at once to check that they are missing together.
product.loc[
    product[["product_name_length", "product_description_length", "product_photos_qty"]]
    .isna()
    .all(axis=1)
]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
105,106,a41e356c76fab66334f36de622ecbd3a,,,,,17.0,650.0,14.0,12.0,True
128,129,d8dee61c2034d6d075997acef1870e9b,,,,,16.0,300.0,7.0,20.0,True
145,146,56139431d72cd51f19eb9f7dae4d1617,,,,,20.0,200.0,20.0,20.0,True
154,155,46b48281eb6d663ced748f324108c733,,,,,41.0,18500.0,30.0,41.0,True
197,198,5fb61f482620cb672f5e586bb132eae9,,,,,35.0,300.0,7.0,12.0,True
...,...,...,...,...,...,...,...,...,...,...,...
32515,32516,b0a0c5dd78e644373b199380612c350a,,,,,30.0,1800.0,20.0,70.0,True
32589,32590,10dbe0fbaa2c505123c17fdc34a63c56,,,,,30.0,800.0,10.0,23.0,True
32616,32617,bd2ada37b58ae94cc838b9c0569fecd8,,,,,21.0,200.0,8.0,16.0,True
32772,32773,fa51e914046aab32764c41356b9d4ea4,,,,,45.0,1300.0,16.0,45.0,True


In [31]:
# Note: select the rows where product_length_cm, product_weight_g and
# product_width_cm are all missing, to check that they are missing together.
# (product_height_cm has the same missing count but is not part of this filter.)
product.loc[
    product[["product_length_cm", "product_weight_g", "product_width_cm"]]
    .isna()
    .all(axis=1)
]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
8595,8585,09ff539a621711667c43eba6a3bd8466,baby,60.0,865.0,3.0,,,,,True
18857,18855,5eb564652db742ff8f28759cd8d2652a,,,,,,,,,True


In [32]:
# Note: rows with a missing category are compared with the rows missing the
# three detail columns found earlier -> a flag column might help when framing
# the business problem.
# NOTE(review): this filter is a union (OR), and the null counts differ
# (623 for product_category vs 610 for the detail columns), so the two sets of
# rows cannot be identical -- verify which set contains the other.
product.loc[
    product["product_category"].isna()
    | product[["product_name_length", "product_description_length", "product_photos_qty"]]
    .isna()
    .all(axis=1)
]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
105,106,a41e356c76fab66334f36de622ecbd3a,,,,,17.0,650.0,14.0,12.0,True
128,129,d8dee61c2034d6d075997acef1870e9b,,,,,16.0,300.0,7.0,20.0,True
145,146,56139431d72cd51f19eb9f7dae4d1617,,,,,20.0,200.0,20.0,20.0,True
154,155,46b48281eb6d663ced748f324108c733,,,,,41.0,18500.0,30.0,41.0,True
197,198,5fb61f482620cb672f5e586bb132eae9,,,,,35.0,300.0,7.0,12.0,True
...,...,...,...,...,...,...,...,...,...,...,...
32515,32516,b0a0c5dd78e644373b199380612c350a,,,,,30.0,1800.0,20.0,70.0,True
32589,32590,10dbe0fbaa2c505123c17fdc34a63c56,,,,,30.0,800.0,10.0,23.0,True
32616,32617,bd2ada37b58ae94cc838b9c0569fecd8,,,,,21.0,200.0,8.0,16.0,True
32772,32773,fa51e914046aab32764c41356b9d4ea4,,,,,45.0,1300.0,16.0,45.0,True


## Seller dimension

In [33]:
# Load the seller dimension and summarise its columns, dtypes and non-null counts.
seller = pd.read_sql_table(table_name="dim_seller", con=conn, schema=schema)
seller.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   seller_id_surr      3095 non-null   int64 
 1   seller_id           3095 non-null   object
 2   seller_zip_code     3095 non-null   int64 
 3   seller_city         3095 non-null   object
 4   seller_state        3095 non-null   object
 5   is_current_version  3095 non-null   bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 124.0+ KB


In [34]:
# Report the shape and per-column null counts of the seller dimension.
show_missing_data(seller)

Shape : (3095, 6)
Missing Data : seller_id_surr        0
seller_id             0
seller_zip_code       0
seller_city           0
seller_state          0
is_current_version    0
dtype: int64


In [35]:
# TODO: add num_product_order and total_revenue to the seller data mart by combining this dimension with fct_order_items.
seller.head(5)

Unnamed: 0,seller_id_surr,seller_id,seller_zip_code,seller_city,seller_state,is_current_version
0,1,3442f8959a84dea7ee197c632cb2df15,13023,KOTA JAKARTA TIMUR,DKI JAKARTA,True
1,2,d1b65fc7debc3361ea86b5f14c68d2e2,13844,KOTA PADANG PANJANG,SUMATERA BARAT,True
2,3,ce3ad9de960102d0677a81f5d0bb7b2d,20031,KOTA JAKARTA BARAT,DKI JAKARTA,True
3,4,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,KOTA TANGERANG,BANTEN,True
4,5,51a04a8a6bdcb23deccc82b0b80742cf,12914,KABUPATEN LAMONGAN,JAWA TIMUR,True


## Feedback dimension

In [36]:
# Load the feedback dimension and summarise its columns, dtypes and non-null counts.
feedback = pd.read_sql_table(table_name="dim_feedback", con=conn, schema=schema)
feedback.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   feedback_id_surr         99441 non-null  int64  
 1   order_id                 99441 non-null  object 
 2   feedback_avg_score       99441 non-null  float64
 3   feedback_form_sent_date  99441 non-null  object 
 4   feedback_answer_date     99441 non-null  object 
 5   is_current_version       99441 non-null  bool   
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 3.9+ MB


In [37]:
# Note: feedback in this dimension is measured per order, under the assumption that an order should have a single feedback; multiple feedback rows mean the form was sent several times for one order, so they are aggregated.

show_missing_data(feedback)

Shape : (99441, 6)
Missing Data : feedback_id_surr           0
order_id                   0
feedback_avg_score         0
feedback_form_sent_date    0
feedback_answer_date       0
is_current_version         0
dtype: int64


In [38]:
# Preview the first rows of the feedback dimension.
feedback.head()

Unnamed: 0,feedback_id_surr,order_id,feedback_avg_score,feedback_form_sent_date,feedback_answer_date,is_current_version
0,1,00010242fe8c5a6d1ba2dd792cb16214,5.0,20170921,20170922,True
1,2,00018f77f2f0320c557190d7a144bdd3,4.0,20170513,20170515,True
2,3,000229ec398224ef6ca0657da4fc703e,5.0,20180123,20180123,True
3,4,00024acbcdf0a6daa1e931b038114c75,4.0,20180815,20180815,True
4,5,00042b26cf59d7ce69dfabb4e55b4fd9,5.0,20170302,20170303,True


## Payment Facts
---
Granularity: one row per payment (payment installment, *cicilan* in Indonesian) that a user makes after a transaction.

In [39]:
# Load the payment fact table and summarise its columns, dtypes and non-null counts.
fct_payment = pd.read_sql_table(table_name="fct_payment", con=conn, schema=schema)
fct_payment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    103886 non-null  int64  
 1   feedback_id_surr      103886 non-null  int64  
 2   user_id               103886 non-null  int64  
 3   payment_sequential    103886 non-null  int64  
 4   payment_type          103886 non-null  object 
 5   payment_installments  103886 non-null  int64  
 6   payment_value         103886 non-null  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 5.5+ MB


In [40]:
# Report the shape and per-column null counts of the payment fact table.
show_missing_data(fct_payment)

Shape : (103886, 7)
Missing Data : id                      0
feedback_id_surr        0
user_id                 0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64


### Check Reference

In [41]:
# Check the reference from the payment facts to the feedback dimension.
# An inner join silently drops fact rows without a matching key (and a
# duplicated dimension key would multiply them), so the row count is asserted
# explicitly instead of being compared by eye in the .info() output.
merged = pd.merge(fct_payment, feedback, how="inner", on="feedback_id_surr")
assert len(merged) == len(fct_payment), (
    f"fct_payment -> dim_feedback: expected {len(fct_payment)} rows after the join, "
    f"got {len(merged)}"
)
merged.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 103886 entries, 0 to 103885
Data columns (total 12 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       103886 non-null  int64  
 1   feedback_id_surr         103886 non-null  int64  
 2   user_id                  103886 non-null  int64  
 3   payment_sequential       103886 non-null  int64  
 4   payment_type             103886 non-null  object 
 5   payment_installments     103886 non-null  int64  
 6   payment_value            103886 non-null  float64
 7   order_id                 103886 non-null  object 
 8   feedback_avg_score       103886 non-null  float64
 9   feedback_form_sent_date  103886 non-null  object 
 10  feedback_answer_date     103886 non-null  object 
 11  is_current_version       103886 non-null  bool   
dtypes: bool(1), float64(2), int64(5), object(4)
memory usage: 9.6+ MB


In [42]:
# Report the shape and per-column null counts of the joined frame.
show_missing_data(merged)

Shape : (103886, 12)
Missing Data : id                         0
feedback_id_surr           0
user_id                    0
payment_sequential         0
payment_type               0
payment_installments       0
payment_value              0
order_id                   0
feedback_avg_score         0
feedback_form_sent_date    0
feedback_answer_date       0
is_current_version         0
dtype: int64


In [43]:
# Check the reference from the payment facts to the user dimension.
# Both dimensions carry is_current_version, so pandas suffixes that column
# with _x / _y in the result.
merged = pd.merge(merged, user, how="inner", on="user_id")
# The inner join must keep exactly one row per payment: fewer rows mean
# orphaned user_id values, more rows mean duplicated keys in the dimension.
assert len(merged) == len(fct_payment), (
    f"fct_payment -> dim_user: expected {len(fct_payment)} rows after the join, "
    f"got {len(merged)}"
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103886 entries, 0 to 103885
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       103886 non-null  int64  
 1   feedback_id_surr         103886 non-null  int64  
 2   user_id                  103886 non-null  int64  
 3   payment_sequential       103886 non-null  int64  
 4   payment_type             103886 non-null  object 
 5   payment_installments     103886 non-null  int64  
 6   payment_value            103886 non-null  float64
 7   order_id                 103886 non-null  object 
 8   feedback_avg_score       103886 non-null  float64
 9   feedback_form_sent_date  103886 non-null  object 
 10  feedback_answer_date     103886 non-null  object 
 11  is_current_version_x     103886 non-null  bool   
 12  user_name                103886 non-null  object 
 13  total_order              103886 non-null  float64
 14  tota

In [44]:
show_missing_data(merged)
# Note: total_spending was expected to have missing values because of a known problem.
# NOTE(review): the problem is not described here and the output shows 0 missing values -- confirm whether this note is still valid.

Shape : (103886, 16)
Missing Data : id                         0
feedback_id_surr           0
user_id                    0
payment_sequential         0
payment_type               0
payment_installments       0
payment_value              0
order_id                   0
feedback_avg_score         0
feedback_form_sent_date    0
feedback_answer_date       0
is_current_version_x       0
user_name                  0
total_order                0
total_spending             0
is_current_version_y       0
dtype: int64


## Order items (Transaction) Facts

In [45]:
# Load the order-items fact table and summarise its columns, dtypes and non-null counts.
fct_order_item = pd.read_sql_table(table_name="fct_order_items", con=conn, schema=schema)
fct_order_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   order_date               112650 non-null  object 
 6   order_approved_date      112635 non-null  object 
 7   pickup_date              111456 non-null  object 
 8   delivered_date           110196 non-null  object 
 9   estimated_time_delivery  112650 non-null  object 
 10  pickup_limit_date        112650 non-null  object 
 11  order_id                 112650 non-null  object 
 12  item_number              112650 non-null  int64  
 13  order_item_status        112650 non-null  object 
 14  pric

In [46]:
# Report the shape and per-column null counts of the order-items fact table.
show_missing_data(fct_order_item)

Shape : (112650, 16)
Missing Data : id                            0
user_id                       0
product_id_surr               0
seller_id_surr                0
feedback_id_surr              0
order_date                    0
order_approved_date          15
pickup_date                1194
delivered_date             2454
estimated_time_delivery       0
pickup_limit_date             0
order_id                      0
item_number                   0
order_item_status             0
price                         0
shipping_cost                 0
dtype: int64


In [47]:
# Why are there rows with a missing order_approved_date? One guess: the order
# had not been approved by the user yet when the data was recorded. The
# affected order_date values are mostly from 2017-02 -- what happened then?
fct_order_item[fct_order_item["order_approved_date"].isna()]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
20851,20852,44954,14173,1532,43549,20170218,,20170222,20170301,20170317,20170222,7013bcfc1c97fe719a7b5e05e61c12db,1,delivered,49990.0,15530.0
36129,36130,28250,18257,2108,83474,20170219,,20170223,20170302,20170327,20170226,d69e5d356402adc8cf17e08b5033acfb,1,delivered,149800.0,13630.0
52156,52157,90377,23444,2695,83803,20170218,,20170223,20170302,20170322,20170225,d77031d6a3c8a52f019764e68f211c69,1,delivered,28990.0,10960.0
60427,60428,65865,8786,339,23313,20170217,,20170222,20170303,20170323,20170222,3c0b8706b065f9919d0505d3b3343881,1,delivered,133990.0,23200.0
62385,62386,82545,12616,2313,75275,20170119,,20170125,20170130,20170301,20170123,c1d4211b3dae76144deccd6c74144a88,1,delivered,39990.0,14520.0
65554,65555,85019,20767,84,87256,20170218,,20170223,20170301,20170317,20170222,e04abd8149ef81b95221e88f6ed9ab6a,1,delivered,309900.0,39110.0
72888,72889,66601,6240,2208,7281,20170217,,20170222,20170302,20170320,20170221,12a95a3c06dbaec84bcfb0e2da5d228a,1,delivered,79990.0,15770.0
72903,72904,22976,6240,2208,16817,20170218,,20170222,20170303,20170331,20170222,2babbb4b15e6d2dfe95e2de765c97bce,1,delivered,79990.0,26820.0
72983,72984,59737,6240,2208,36016,20170218,,20170222,20170309,20170331,20170222,5cf925b116421afa85ee25e99b4c34fb,1,delivered,79990.0,26820.0
76852,76853,61471,15359,1999,53652,20170218,,20170223,20170302,20170321,20170222,8a9adc69528e1001fc68dd0aaebbb54a,1,delivered,379000.0,17860.0


In [48]:
# Show the five rows with the highest order_date values (descending sort).
fct_order_item.sort_values("order_date", ascending=False).head(5)

# Conclusion: the rows with a missing order_approved_date are not the most
# recently recorded orders, so recency does not explain them -> needs more
# exploration.

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
79314,79315,71731,21745,2630,32700,20180903,20180903,20180904,,20180906,20180905,54282e97f61c23b78330c15b154c867d,1,shipped,145000.0,21460.0
104761,104762,75554,15041,282,20789,20180829,20180829,20180829,20180830.0,20180905,20180831,35a972d7f8436f405b56e36add1a7140,1,delivered,84990.0,8760.0
64413,64414,40849,18685,798,97583,20180829,20180829,20180829,20180830.0,20180904,20180831,fb393211459aac00af932cd7ab4fa2cc,1,delivered,99000.0,7950.0
33759,33760,88481,11637,3076,74107,20180829,20180829,20180829,20180830.0,20180911,20180906,bee12e8653a04e76786e8891cfb6330a,2,delivered,91550.0,7900.0
33762,33763,88481,11637,3076,74107,20180829,20180829,20180829,20180830.0,20180911,20180906,bee12e8653a04e76786e8891cfb6330a,5,delivered,91550.0,7900.0


In [49]:
# Why is pickup_date missing? Is it related to the order status?
fct_order_item[fct_order_item["pickup_date"].isna()].head()

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
148,149,94056,292,1020,22290,20171212,20171212,,,20180110,20171221,3971a8d658593643c85790cb67105832,1,processing,190000.0,15080.0
165,166,40400,342,1522,32888,20180806,20180807,,,20180823,20180813,54a33eb07a24da26f790d99fa2adfd7c,1,invoiced,60030.0,15520.0
311,312,24754,710,386,82423,20161005,20161006,,,20161209,20161021,d3c8851a6651eeff2f73b0e011ac45d0,1,processing,267000.0,32840.0
662,663,7587,884,2890,95470,20170219,20170219,,,20170321,20170223,f5cf5716413185387030a378bdd46ebe,1,processing,99000.0,15900.0
1020,1021,47973,1495,1536,14697,20170711,20170712,,,20170814,20170726,25fd9ab410ed941187c41224aaecd049,1,canceled,240000.0,39520.0


In [50]:
# Rows that have a delivered_date but no pickup_date.
fct_order_item[
    fct_order_item["pickup_date"].isna() & fct_order_item["delivered_date"].notna()
]

# Odd: how can a product be delivered without ever being picked up? Normally
# the delivery service picks the product up first and then delivers it.

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
49463,49464,39119,15909,2375,16426,20170929,20170929,,20171120,20171114,20171018,2aa91108853cecb43c84a5dc5b277475,1,delivered,179000.0,14980.0


In [51]:
# Rows that were picked up but have no delivered_date.
fct_order_item[
    fct_order_item["pickup_date"].notna() & fct_order_item["delivered_date"].isna()
]
# There are many cases where the product was picked up but not delivered -- why?

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
246,247,54857,605,1912,68967,20180326,20180326,20180403,,20180418,20180330,b1f6abbaec3261269b9ecc0f895f8c2d,1,shipped,39990.0,23850.0
313,314,6038,727,2596,17793,20170121,20170121,20170209,,20170314,20170125,2e22dc2fce65e5b9d73a11d717f41724,1,shipped,54990.0,26640.0
399,400,92489,795,243,61996,20180225,20180227,20180227,,20180322,20180305,a05fc37f43df506f00a32f1507e4d360,1,shipped,128890.0,15650.0
442,443,8078,795,243,11021,20180425,20180425,20180430,,20180601,20180502,1c57efb710fa0df4433b44a5c39e4b6e,1,shipped,99900.0,0.0
666,667,46507,912,280,85130,20180709,20180710,20180711,,20180725,20180713,daf3d34986ea6ba8b098570a1951677f,1,shipped,154000.0,16180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112261,112262,89020,31851,333,54325,20180406,20180407,20180409,,20180502,20180412,8c6eb35c6bb5e1a9dbf00531ab9404e3,1,shipped,12300.0,19040.0
112280,112281,71134,31912,1137,95841,20180305,20180305,20180308,,20180402,20180309,f6cc5ac0be9196f2fb3db04bace5cd8b,1,shipped,199700.0,38950.0
112286,112287,58073,31962,2148,95175,20180716,20180717,20180718,,20180726,20180723,f50b9192b9705d537f9593b595da132f,1,shipped,49000.0,7600.0
112365,112366,70599,32082,1814,14289,20170612,20170612,20170613,,20170626,20170616,24da4d9782584b90078fc30af8df32ad,1,shipped,18900.0,7780.0


In [52]:
# Picked up, not delivered, and still in status "shipped".
fct_order_item[
    fct_order_item["pickup_date"].notna()
    & fct_order_item["delivered_date"].isna()
    & fct_order_item["order_item_status"].eq("shipped")
]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
246,247,54857,605,1912,68967,20180326,20180326,20180403,,20180418,20180330,b1f6abbaec3261269b9ecc0f895f8c2d,1,shipped,39990.0,23850.0
313,314,6038,727,2596,17793,20170121,20170121,20170209,,20170314,20170125,2e22dc2fce65e5b9d73a11d717f41724,1,shipped,54990.0,26640.0
399,400,92489,795,243,61996,20180225,20180227,20180227,,20180322,20180305,a05fc37f43df506f00a32f1507e4d360,1,shipped,128890.0,15650.0
442,443,8078,795,243,11021,20180425,20180425,20180430,,20180601,20180502,1c57efb710fa0df4433b44a5c39e4b6e,1,shipped,99900.0,0.0
666,667,46507,912,280,85130,20180709,20180710,20180711,,20180725,20180713,daf3d34986ea6ba8b098570a1951677f,1,shipped,154000.0,16180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112261,112262,89020,31851,333,54325,20180406,20180407,20180409,,20180502,20180412,8c6eb35c6bb5e1a9dbf00531ab9404e3,1,shipped,12300.0,19040.0
112280,112281,71134,31912,1137,95841,20180305,20180305,20180308,,20180402,20180309,f6cc5ac0be9196f2fb3db04bace5cd8b,1,shipped,199700.0,38950.0
112286,112287,58073,31962,2148,95175,20180716,20180717,20180718,,20180726,20180723,f50b9192b9705d537f9593b595da132f,1,shipped,49000.0,7600.0
112365,112366,70599,32082,1814,14289,20170612,20170612,20170613,,20170626,20170616,24da4d9782584b90078fc30af8df32ad,1,shipped,18900.0,7780.0


In [53]:
# Picked up, not delivered, and canceled.
fct_order_item[
    fct_order_item["pickup_date"].notna()
    & fct_order_item["delivered_date"].isna()
    & fct_order_item["order_item_status"].eq("canceled")
]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
1643,1644,71861,2930,2108,45617,20180207,20180207,20180210,,20180308,20180213,7582b8c10039cceafcfe7b13e41a91e2,1,canceled,24900.0,15100.0
1752,1753,23835,3156,1772,27564,20180129,20180129,20180131,,20180223,20180204,46eb75885eca03736c4188871c0a9ba3,1,canceled,170000.0,18440.0
5423,5424,71927,11502,1685,61352,20180220,20180220,20180227,,20180312,20180226,9ec3685b1f21c7dfff4f656392b186dd,1,canceled,89580.0,17340.0
6793,6794,47940,15244,2464,94702,20180126,20180126,20180126,,20180219,20180201,f3c5d914eec90079a826c2b5ea7fe6bc,1,canceled,59900.0,12760.0
7729,7730,71816,17524,1324,41687,20180130,20180130,20180131,,20180222,20180205,6b5ccf5385890198c54fa97afe4812bc,1,canceled,349490.0,28770.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107773,107774,23814,22847,1780,65263,20180207,20180208,20180209,,20180309,20180214,a8a3eaaf155bc111acb10e0536a9e753,1,canceled,69900.0,16930.0
110950,110951,95846,29466,1254,78438,20180205,20180205,20180206,,20180301,20180209,c9f06c1229ec7cd4ec9aed55d434b499,1,canceled,150000.0,12550.0
111019,111020,72016,29466,1254,47484,20180128,20180128,20180129,,20180302,20180201,7a56ded9b696cca5e892f56f49b1921a,1,canceled,150000.0,17490.0
111524,111525,23887,30589,2956,54419,20180214,20180214,20180217,,20180307,20180220,8caf289ab5d7e3ec7b4522aa22dabbec,1,canceled,49990.0,14100.0


In [54]:
# Picked up, not delivered, and neither canceled nor shipped.
fct_order_item[
    fct_order_item["pickup_date"].notna()
    & fct_order_item["delivered_date"].isna()
    & ~fct_order_item["order_item_status"].isin(["canceled", "shipped"])
]

# These remaining 7 records have status "delivered" but no delivered_date was
# recorded -> estimated_time_delivery might be used as a substitute.

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
13438,13439,50133,30579,1809,12775,20180627,20180627,20180703,,20180719,20180703,20edc82cf5400ce95e1afacc25798b31,1,delivered,45900.0,9070.0
15324,15325,43057,1772,139,66407,20180608,20180608,20180612,,20180626,20180618,ab7c89dc1bf4a1ead9d6ec1ec8968a84,1,delivered,110990.0,9130.0
37295,37296,4381,20236,2383,95491,20180620,20180620,20180625,,20180716,20180626,f5dd62b788049ad9fc0526e3ad11a097,1,delivered,329000.0,25240.0
41362,41363,25806,29655,1685,17372,20171128,20171128,20171130,,20171218,20171204,2d1e2d5bf4dc7227b3bfebb81328c15f,1,delivered,117300.0,17530.0
43452,43453,53587,2966,2961,89705,20180701,20180701,20180703,,20180730,20180705,e69f75a717d64fc5ecdfae42b2e8e086,1,delivered,139000.0,19070.0
43455,43456,26617,2966,2961,18042,20180701,20180701,20180703,,20180730,20180705,2ebdfc4f15f23b91474edf87475f108e,1,delivered,139000.0,19070.0
80622,80623,21874,25466,2082,5169,20180701,20180701,20180703,,20180724,20180705,0d3268bad9b086af767785e3f0fc0133,1,delivered,188990.0,15630.0


### Check reference

In [55]:
# Check the reference from the order-item facts to the user dimension.
# An inner join silently drops fact rows without a matching key (and a
# duplicated dimension key would multiply them), so the row count is asserted
# explicitly instead of being compared by eye in the .info() output.
merged = pd.merge(fct_order_item, user, how="inner", on="user_id")
assert len(merged) == len(fct_order_item), (
    f"fct_order_items -> dim_user: expected {len(fct_order_item)} rows after the join, "
    f"got {len(merged)}"
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   order_date               112650 non-null  object 
 6   order_approved_date      112635 non-null  object 
 7   pickup_date              111456 non-null  object 
 8   delivered_date           110196 non-null  object 
 9   estimated_time_delivery  112650 non-null  object 
 10  pickup_limit_date        112650 non-null  object 
 11  order_id                 112650 non-null  object 
 12  item_number              112650 non-null  int64  
 13  order_item_status        112650 non-null  object 
 14  pric

In [56]:
# Check the reference from the order-item facts to the feedback dimension.
merged = pd.merge(merged, feedback, how="inner", on="feedback_id_surr")
# The inner join must keep exactly one row per order item: fewer rows mean
# orphaned feedback_id_surr values, more rows mean duplicated dimension keys.
assert len(merged) == len(fct_order_item), (
    f"fct_order_items -> dim_feedback: expected {len(fct_order_item)} rows after the join, "
    f"got {len(merged)}"
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   order_date               112650 non-null  object 
 6   order_approved_date      112635 non-null  object 
 7   pickup_date              111456 non-null  object 
 8   delivered_date           110196 non-null  object 
 9   estimated_time_delivery  112650 non-null  object 
 10  pickup_limit_date        112650 non-null  object 
 11  order_id_x               112650 non-null  object 
 12  item_number              112650 non-null  int64  
 13  order_item_status        112650 non-null  object 
 14  pric

In [57]:
# Check the reference from the order-item facts to the product dimension.
merged = pd.merge(merged, product, how="inner", on="product_id_surr")
# The inner join must keep exactly one row per order item: fewer rows mean
# orphaned product_id_surr values, more rows mean duplicated dimension keys.
assert len(merged) == len(fct_order_item), (
    f"fct_order_items -> dim_product: expected {len(fct_order_item)} rows after the join, "
    f"got {len(merged)}"
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 35 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          112650 non-null  int64  
 1   user_id                     112650 non-null  int64  
 2   product_id_surr             112650 non-null  int64  
 3   seller_id_surr              112650 non-null  int64  
 4   feedback_id_surr            112650 non-null  int64  
 5   order_date                  112650 non-null  object 
 6   order_approved_date         112635 non-null  object 
 7   pickup_date                 111456 non-null  object 
 8   delivered_date              110196 non-null  object 
 9   estimated_time_delivery     112650 non-null  object 
 10  pickup_limit_date           112650 non-null  object 
 11  order_id_x                  112650 non-null  object 
 12  item_number                 112650 non-null  int64  
 13  order_item_sta

In [58]:
# Check the reference from the order-item facts to the seller dimension.
merged = pd.merge(merged, seller, how="inner", on="seller_id_surr")
# The inner join must keep exactly one row per order item: fewer rows mean
# orphaned seller_id_surr values, more rows mean duplicated dimension keys.
assert len(merged) == len(fct_order_item), (
    f"fct_order_items -> dim_seller: expected {len(fct_order_item)} rows after the join, "
    f"got {len(merged)}"
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 40 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          112650 non-null  int64  
 1   user_id                     112650 non-null  int64  
 2   product_id_surr             112650 non-null  int64  
 3   seller_id_surr              112650 non-null  int64  
 4   feedback_id_surr            112650 non-null  int64  
 5   order_date                  112650 non-null  object 
 6   order_approved_date         112635 non-null  object 
 7   pickup_date                 111456 non-null  object 
 8   delivered_date              110196 non-null  object 
 9   estimated_time_delivery     112650 non-null  object 
 10  pickup_limit_date           112650 non-null  object 
 11  order_id_x                  112650 non-null  object 
 12  item_number                 112650 non-null  int64  
 13  order_item_sta