# ETL Quality Checking
A notebook for checking the results of the ETL query process.

Revision History : 
- Updated quality analysis for updated query (after hotfix for payment dimension)

Scripts :
* dim_populate.sql
* fct_populate.sql

## Notebook Sanity Check

In [55]:
import os
from urllib.parse import quote_plus

from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment; verbose=True makes python-dotenv report a missing file.
load_dotenv(verbose=True)

# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Expected variables: DB_NAME, DB_HOST, DB_USER
# and DB_PASS. The non-secret settings fall back to the local defaults.
DBNAME = os.getenv("DB_NAME", "ecommerce")
HOSTNAME = os.getenv("DB_HOST", "localhost")
USER = os.getenv("DB_USER", "postgres")
PASS = os.getenv("DB_PASS")

# Fail early with a clear message instead of a confusing authentication error.
if PASS is None:
    raise RuntimeError(
        "DB_PASS is not set; define it in the environment or in the .env file"
    )

In [56]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# Build the connection URL. The user name and password are percent-encoded so
# that characters such as "@", ":" or "/" inside them cannot corrupt the URL.
db_url = (
    f"postgresql+psycopg2://{quote_plus(USER)}:{quote_plus(PASS)}"
    f"@{HOSTNAME}/{DBNAME}"
)

# Create an engine instance (pool_recycle is passed through unchanged)
alchemyEngine = create_engine(db_url, pool_recycle=3600)

# Connect to PostgreSQL server; `conn` is reused by every read below
conn = alchemyEngine.connect()

In [57]:
# Showing Missing Data
def show_missing_data(df):
    """Print the shape of ``df`` followed by its null count per column.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None. The report is written to standard output only.
    """
    null_counts = df.isna().sum()
    print("Shape : {}".format(df.shape))
    print("Missing Data : {}".format(null_counts))

In [58]:
# Global var: database schema passed to every pd.read_sql_table call below.
schema = "warehouse"

## Date dimension

In [59]:
# Load the date dimension and summarise its columns, dtypes and non-null counts.
date = pd.read_sql_table(table_name="dim_date", con=conn, schema=schema)
date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3651 entries, 0 to 3650
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date_id                 3651 non-null   object        
 1   date                    3651 non-null   datetime64[ns]
 2   day_name                3651 non-null   object        
 3   day_of_week             3651 non-null   int64         
 4   day_of_month            3651 non-null   int64         
 5   day_of_quarter          3651 non-null   int64         
 6   day_of_year             3651 non-null   float64       
 7   week_of_month           3651 non-null   int64         
 8   week_of_year            3651 non-null   float64       
 9   month_actual            3651 non-null   float64       
 10  month_name              3651 non-null   object        
 11  month_name_abbreviated  3651 non-null   object        
 12  quarter                 3651 non-null   float64 

In [60]:
# Report the shape and the per-column null counts of the date dimension.
show_missing_data(date)

Shape : (3651, 15)
Missing Data : date_id                   0
date                      0
day_name                  0
day_of_week               0
day_of_month              0
day_of_quarter            0
day_of_year               0
week_of_month             0
week_of_year              0
month_actual              0
month_name                0
month_name_abbreviated    0
quarter                   0
year                      0
isWeekend                 0
dtype: int64


In [61]:
# Preview the first five rows of the date dimension.
date.head(5)

Unnamed: 0,date_id,date,day_name,day_of_week,day_of_month,day_of_quarter,day_of_year,week_of_month,week_of_year,month_actual,month_name,month_name_abbreviated,quarter,year,isWeekend
0,20150101,2015-01-01,Thursday,4,1,1,1.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
1,20150102,2015-01-02,Friday,5,2,2,2.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
2,20150103,2015-01-03,Saturday,6,3,3,3.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
3,20150104,2015-01-04,Sunday,7,4,4,4.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
4,20150105,2015-01-05,Monday,1,5,5,5.0,1,2.0,1.0,January,Jan,1.0,2015.0,False


In [62]:
# Preview the last five rows of the date dimension.
# NOTE(review): the last row shown is 2024-12-29, so 2024-12-30 and 2024-12-31
# appear to be absent from dim_date -- confirm the range in dim_populate.sql.
date.tail(5)

Unnamed: 0,date_id,date,day_name,day_of_week,day_of_month,day_of_quarter,day_of_year,week_of_month,week_of_year,month_actual,month_name,month_name_abbreviated,quarter,year,isWeekend
3646,20241225,2024-12-25,Wednesday,3,25,86,360.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3647,20241226,2024-12-26,Thursday,4,26,87,361.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3648,20241227,2024-12-27,Friday,5,27,88,362.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3649,20241228,2024-12-28,Saturday,6,28,89,363.0,4,52.0,12.0,December,Dec,4.0,2024.0,True
3650,20241229,2024-12-29,Sunday,7,29,90,364.0,5,52.0,12.0,December,Dec,4.0,2024.0,True


## User dimension

In [63]:
# Load the user dimension and summarise its columns, dtypes and non-null counts.
user = pd.read_sql_table(table_name="dim_user", con=conn, schema=schema)
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96096 entries, 0 to 96095
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             96096 non-null  int64  
 1   user_name           96096 non-null  object 
 2   total_order         96096 non-null  float64
 3   total_spending      96096 non-null  float64
 4   is_current_version  96096 non-null  bool   
dtypes: bool(1), float64(2), int64(1), object(1)
memory usage: 3.0+ MB


In [64]:
# Report the shape and the per-column null counts of the user dimension.
show_missing_data(user)

Shape : (96096, 5)
Missing Data : user_id               0
user_name             0
total_order           0
total_spending        0
is_current_version    0
dtype: int64


In [65]:
# Preview the first rows of the user dimension.
user.head()

Unnamed: 0,user_id,user_name,total_order,total_spending,is_current_version
0,1,00053a61a98854899e70ed204dd4bafe,1.0,419180.0,True
1,2,0005e1862207bf6ccc02e4228effd9a0,1.0,150120.0,True
2,3,00090324bbad0e9342388303bb71ba0a,1.0,63660.0,True
3,4,000bfa1d2f1a41876493be685390d6d3,1.0,46850.0,True
4,5,000c8bdb58a29e7115cfc257230fb21b,1.0,29000.0,True


In [66]:
# Preview the last rows of the user dimension.
# NOTE(review): these rows show total_spending == 0.0 together with
# total_order == 1.0 -- confirm whether total_order should be 0 for users
# without any transaction.
user.tail()

Unnamed: 0,user_id,user_name,total_order,total_spending,is_current_version
96091,96092,1b32669eb9662ee904419de883e59a58,1.0,0.0,True
96092,96093,5942dde582a33e31ea4471bc5363b0f3,1.0,0.0,True
96093,96094,daba2e7a00c149161c68cbb18db656a9,1.0,0.0,True
96094,96095,7d373e92dd3086b4c37e9868fc8999c1,1.0,0.0,True
96095,96096,c1653f4d5fcfb808bee93bb3e1aa4744,1.0,0.0,True


Important Notes : 
* A few users (1113 to be exact) did not make any transaction during the period covered by the data -> they are represented with total_spending == 0.0

## Product dimension

In [67]:
# Load the product dimension and summarise its columns, dtypes and non-null counts.
product = pd.read_sql_table(table_name="dim_product", con=conn, schema=schema)
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id_surr             32951 non-null  int64  
 1   product_id                  32951 non-null  object 
 2   product_category            32328 non-null  object 
 3   product_name_length         32341 non-null  float64
 4   product_description_length  32341 non-null  float64
 5   product_photos_qty          32341 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_weight_g            32949 non-null  float64
 8   product_height_cm           32949 non-null  float64
 9   product_width_cm            32949 non-null  float64
 10  is_current_version          32951 non-null  bool   
dtypes: bool(1), float64(7), int64(1), object(2)
memory usage: 2.5+ MB


In [68]:
# Report the shape and the per-column null counts of the product dimension.
show_missing_data(product)

Shape : (32951, 11)
Missing Data : product_id_surr                 0
product_id                      0
product_category              623
product_name_length           610
product_description_length    610
product_photos_qty            610
product_length_cm               2
product_weight_g                2
product_height_cm               2
product_width_cm                2
is_current_version              0
dtype: int64


In [69]:
# Products whose name length, description length and photo count are all null.
product[
    product[["product_name_length", "product_description_length", "product_photos_qty"]]
    .isna()
    .all(axis=1)
]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
105,106,a41e356c76fab66334f36de622ecbd3a,,,,,17.0,650.0,14.0,12.0,True
128,129,d8dee61c2034d6d075997acef1870e9b,,,,,16.0,300.0,7.0,20.0,True
145,146,56139431d72cd51f19eb9f7dae4d1617,,,,,20.0,200.0,20.0,20.0,True
154,155,46b48281eb6d663ced748f324108c733,,,,,41.0,18500.0,30.0,41.0,True
197,198,5fb61f482620cb672f5e586bb132eae9,,,,,35.0,300.0,7.0,12.0,True
...,...,...,...,...,...,...,...,...,...,...,...
32515,32516,b0a0c5dd78e644373b199380612c350a,,,,,30.0,1800.0,20.0,70.0,True
32589,32590,10dbe0fbaa2c505123c17fdc34a63c56,,,,,30.0,800.0,10.0,23.0,True
32616,32617,bd2ada37b58ae94cc838b9c0569fecd8,,,,,21.0,200.0,8.0,16.0,True
32772,32773,fa51e914046aab32764c41356b9d4ea4,,,,,45.0,1300.0,16.0,45.0,True


In [70]:
# Products whose length, weight and width are all null.
product[
    product[["product_length_cm", "product_weight_g", "product_width_cm"]]
    .isna()
    .all(axis=1)
]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
8595,8585,09ff539a621711667c43eba6a3bd8466,baby,60.0,865.0,3.0,,,,,True
18857,18855,5eb564652db742ff8f28759cd8d2652a,,,,,,,,,True


In [71]:
# Products missing the category, or missing all three listing fields at once.
product[
    product["product_category"].isna()
    | product[["product_name_length", "product_description_length", "product_photos_qty"]]
    .isna()
    .all(axis=1)
]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
105,106,a41e356c76fab66334f36de622ecbd3a,,,,,17.0,650.0,14.0,12.0,True
128,129,d8dee61c2034d6d075997acef1870e9b,,,,,16.0,300.0,7.0,20.0,True
145,146,56139431d72cd51f19eb9f7dae4d1617,,,,,20.0,200.0,20.0,20.0,True
154,155,46b48281eb6d663ced748f324108c733,,,,,41.0,18500.0,30.0,41.0,True
197,198,5fb61f482620cb672f5e586bb132eae9,,,,,35.0,300.0,7.0,12.0,True
...,...,...,...,...,...,...,...,...,...,...,...
32515,32516,b0a0c5dd78e644373b199380612c350a,,,,,30.0,1800.0,20.0,70.0,True
32589,32590,10dbe0fbaa2c505123c17fdc34a63c56,,,,,30.0,800.0,10.0,23.0,True
32616,32617,bd2ada37b58ae94cc838b9c0569fecd8,,,,,21.0,200.0,8.0,16.0,True
32772,32773,fa51e914046aab32764c41356b9d4ea4,,,,,45.0,1300.0,16.0,45.0,True


Important Notes : 
* The 610 rows with missing values in product_name_length, product_description_length and product_photos_qty are the same rows for all three columns.
* Rows missing the category are also missing name, description and photos (623 products are missing the category; since only 610 rows are missing the other three columns, at least 13 category-less rows still have them — TODO verify) -> check whether any of these 623 products were ever purchased in the data.
* The missing values in product_length_cm, product_weight_g and product_width_cm fall on the same 2 rows -> these rows are not the same set as the rows missing the product category.


## Seller dimension

In [72]:
# Load the seller dimension and summarise its columns, dtypes and non-null counts.
seller = pd.read_sql_table(table_name="dim_seller", con=conn, schema=schema)
seller.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   seller_id_surr      3095 non-null   int64 
 1   seller_id           3095 non-null   object
 2   seller_zip_code     3095 non-null   int64 
 3   seller_city         3095 non-null   object
 4   seller_state        3095 non-null   object
 5   is_current_version  3095 non-null   bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 124.0+ KB


In [73]:
# Report the shape and the per-column null counts of the seller dimension.
show_missing_data(seller)

Shape : (3095, 6)
Missing Data : seller_id_surr        0
seller_id             0
seller_zip_code       0
seller_city           0
seller_state          0
is_current_version    0
dtype: int64


In [74]:
# Preview the first rows of the seller dimension.
seller.head()


Unnamed: 0,seller_id_surr,seller_id,seller_zip_code,seller_city,seller_state,is_current_version
0,1,3442f8959a84dea7ee197c632cb2df15,13023,KOTA JAKARTA TIMUR,DKI JAKARTA,True
1,2,d1b65fc7debc3361ea86b5f14c68d2e2,13844,KOTA PADANG PANJANG,SUMATERA BARAT,True
2,3,ce3ad9de960102d0677a81f5d0bb7b2d,20031,KOTA JAKARTA BARAT,DKI JAKARTA,True
3,4,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,KOTA TANGERANG,BANTEN,True
4,5,51a04a8a6bdcb23deccc82b0b80742cf,12914,KABUPATEN LAMONGAN,JAWA TIMUR,True


## Feedback dimension

In [75]:
# Load the feedback dimension and summarise its columns, dtypes and non-null counts.
feedback = pd.read_sql_table(table_name="dim_feedback", con=conn, schema=schema)
feedback.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   feedback_id_surr         99441 non-null  int64  
 1   order_id                 99441 non-null  object 
 2   feedback_avg_score       99441 non-null  float64
 3   feedback_form_sent_date  99441 non-null  object 
 4   feedback_answer_date     99441 non-null  object 
 5   is_current_version       99441 non-null  bool   
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 3.9+ MB


In [76]:
# Report the shape and the per-column null counts of the feedback dimension.
show_missing_data(feedback)

Shape : (99441, 6)
Missing Data : feedback_id_surr           0
order_id                   0
feedback_avg_score         0
feedback_form_sent_date    0
feedback_answer_date       0
is_current_version         0
dtype: int64


In [77]:
# Preview the first rows of the feedback dimension.
feedback.head()

Unnamed: 0,feedback_id_surr,order_id,feedback_avg_score,feedback_form_sent_date,feedback_answer_date,is_current_version
0,1,00010242fe8c5a6d1ba2dd792cb16214,5.0,20170921,20170922,True
1,2,00018f77f2f0320c557190d7a144bdd3,4.0,20170513,20170515,True
2,3,000229ec398224ef6ca0657da4fc703e,5.0,20180123,20180123,True
3,4,00024acbcdf0a6daa1e931b038114c75,4.0,20180815,20180815,True
4,5,00042b26cf59d7ce69dfabb4e55b4fd9,5.0,20170302,20170303,True


Important Notes : 
* Multiple feedback entries on the same order are aggregated per order (under the assumption that an order should have a single feedback, and that multiple feedback entries mean the form was sent several times for a single order).

## Payment Dimension 

In [78]:
# Load the payment dimension and summarise its columns, dtypes and non-null counts.
payment = pd.read_sql_table(table_name="dim_payment", con=conn, schema=schema)
payment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99440 entries, 0 to 99439
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   payment_id_surr            99440 non-null  int64  
 1   order_id                   99440 non-null  object 
 2   num_payment                99440 non-null  int64  
 3   total_payment_value        99440 non-null  float64
 4   total_payment_installment  99440 non-null  int64  
 5   num_credit_card            99440 non-null  int64  
 6   total_payment_credit_card  99440 non-null  float64
 7   num_blipay                 99440 non-null  int64  
 8   total_payment_blipay       99440 non-null  float64
 9   num_voucher                99440 non-null  int64  
 10  total_payment_voucher      99440 non-null  float64
 11  num_debit                  99440 non-null  int64  
 12  total_payment_debit        99440 non-null  float64
 13  num_unknown                99440 non-null  int

In [79]:
# Report the shape and the per-column null counts of the payment dimension.
show_missing_data(payment)

Shape : (99440, 16)
Missing Data : payment_id_surr              0
order_id                     0
num_payment                  0
total_payment_value          0
total_payment_installment    0
num_credit_card              0
total_payment_credit_card    0
num_blipay                   0
total_payment_blipay         0
num_voucher                  0
total_payment_voucher        0
num_debit                    0
total_payment_debit          0
num_unknown                  0
total_payment_unknown        0
is_current_version           0
dtype: int64


In [80]:
# Sample of orders that were settled with more than five separate payments.
payment[payment["num_payment"].gt(5)].head()

Unnamed: 0,payment_id_surr,order_id,num_payment,total_payment_value,total_payment_installment,num_credit_card,total_payment_credit_card,num_blipay,total_payment_blipay,num_voucher,total_payment_voucher,num_debit,total_payment_debit,num_unknown,total_payment_unknown,is_current_version
215,216,009ac365164f8e06f59d18a08045f6c4,6,32000.0,6,1,880.0,0,0.0,5,31120.0,0,0.0,0,0.0,True
282,283,00c405bd71187154a7846862f585a9d4,7,46690.0,7,1,6030.0,0,0.0,6,40660.0,0,0.0,0,0.0,True
2952,2953,077d6b93c4e88d06b124cf24fb67a28e,6,104370.0,6,0,0.0,0,0.0,6,104370.0,0,0.0,0,0.0,True
4393,4394,0b398dbb3e7c81005f3c69b01488c4c9,9,107840.0,9,1,2320.0,0,0.0,8,105520.0,0,0.0,0,0.0,True
4418,4419,0b47f5e9432bd433f8c5cf64e60e6e5f,7,223540.0,9,1,103540.0,0,0.0,6,120000.0,0,0.0,0,0.0,True


## Order items (Transaction) Facts

In [81]:
# Load the order-items fact table and summarise its columns, dtypes and non-null counts.
fct_order_item = pd.read_sql_table(table_name="fct_order_items", con=conn, schema=schema)
fct_order_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   payment_id_surr          112647 non-null  float64
 6   order_date               112650 non-null  object 
 7   order_approved_date      112635 non-null  object 
 8   pickup_date              111456 non-null  object 
 9   delivered_date           110196 non-null  object 
 10  estimated_time_delivery  112650 non-null  object 
 11  pickup_limit_date        112650 non-null  object 
 12  order_id                 112650 non-null  object 
 13  item_number              112650 non-null  int64  
 14  orde

In [82]:
# Report the shape and the per-column null counts of the order-items fact table.
show_missing_data(fct_order_item)

Shape : (112650, 17)
Missing Data : id                            0
user_id                       0
product_id_surr               0
seller_id_surr                0
feedback_id_surr              0
payment_id_surr               3
order_date                    0
order_approved_date          15
pickup_date                1194
delivered_date             2454
estimated_time_delivery       0
pickup_limit_date             0
order_id                      0
item_number                   0
order_item_status             0
price                         0
shipping_cost                 0
dtype: int64


In [83]:
# Why are some order_approved_date values missing? Possibly the order had not
# been approved yet when the data was recorded. The affected orders are mostly
# dated 2017-02 -- open question: what happened at that time?
fct_order_item[fct_order_item["order_approved_date"].isna()]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
2393,2394,22976,6240,2208,16817,16817.0,20170218,,20170222,20170303,20170331,20170222,2babbb4b15e6d2dfe95e2de765c97bce,1,delivered,79990.0,26820.0
3365,3366,65865,8786,339,23313,23313.0,20170217,,20170222,20170303,20170323,20170222,3c0b8706b065f9919d0505d3b3343881,1,delivered,133990.0,23200.0
38674,38675,82545,12616,2313,75275,75274.0,20170119,,20170125,20170130,20170301,20170123,c1d4211b3dae76144deccd6c74144a88,1,delivered,39990.0,14520.0
39892,39893,28250,18257,2108,83474,83473.0,20170219,,20170223,20170302,20170327,20170226,d69e5d356402adc8cf17e08b5033acfb,1,delivered,149800.0,13630.0
48537,48538,44954,14173,1532,43549,43549.0,20170218,,20170222,20170301,20170317,20170222,7013bcfc1c97fe719a7b5e05e61c12db,1,delivered,49990.0,15530.0
54169,54170,90377,23444,2695,83803,83802.0,20170218,,20170223,20170302,20170322,20170225,d77031d6a3c8a52f019764e68f211c69,1,delivered,28990.0,10960.0
60828,60829,83440,31416,2108,31861,31861.0,20170218,,20170223,20170307,20170329,20170228,51eb2eebd5d76a24625b31c33dd41449,1,delivered,59900.0,17160.0
71390,71391,66601,6240,2208,7281,7281.0,20170217,,20170222,20170302,20170320,20170221,12a95a3c06dbaec84bcfb0e2da5d228a,1,delivered,79990.0,15770.0
87116,87117,8411,16839,200,18116,18116.0,20170217,,20170222,20170303,20170320,20170221,2eecb0d85f281280f79fa00f9cec1a95,1,delivered,135000.0,19230.0
90767,90768,91869,15256,2108,43526,43526.0,20170119,,20170127,20170206,20170316,20170129,7002a78c79c519ac54022d4f8a65e6e8,1,delivered,45900.0,14520.0


In [84]:
# Most recent orders first. Conclusion so far: the missing approval dates are
# not explained by those rows being the latest recorded data; the cause is
# still unknown and needs more exploration.
fct_order_item.sort_values("order_date", ascending=False).head(5)

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
4659,4660,71731,21745,2630,32700,32700.0,20180903,20180903,20180904,,20180906,20180905,54282e97f61c23b78330c15b154c867d,1,shipped,145000.0,21460.0
85786,85787,35815,31814,1043,8712,8712.0,20180829,20180829,20180829,20180830.0,20180911,20180831,168626408cb32af0ffaf76711caae1dc,1,delivered,45900.0,15390.0
96368,96369,3073,7159,2808,83631,83630.0,20180829,20180829,20180829,20180830.0,20180904,20180903,d70442bc5e3cb7438da497cc6a210f80,1,delivered,6900.0,7390.0
67754,67755,363,13525,645,80970,80969.0,20180829,20180829,20180829,20180830.0,20180904,20180831,d03ca98f59480e7e76c71fa83ecd8fb6,1,delivered,109900.0,9520.0
11038,11039,46961,4859,2880,77829,77828.0,20180829,20180829,20180829,20180830.0,20180903,20180831,c84d88553f9878bf2c7ecda2eb211ece,1,delivered,65000.0,9210.0


In [85]:
# Why is pickup_date missing? Check whether it is related to the order status.
fct_order_item[fct_order_item["pickup_date"].isna()].head(5)

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
50,51,17221,17863,2778,381,381.0,20170410,20170410,,,20170504,20170414,00ff0cf5583758e6964723e42f111bf4,1,canceled,154900.0,15250.0
73,74,31058,28650,817,523,523.0,20170131,20170202,,,20170309,20170204,015fb6b5f739788434fa690540f90f19,1,invoiced,143000.0,94980.0
74,75,31058,28650,817,523,523.0,20170131,20170202,,,20170309,20170204,015fb6b5f739788434fa690540f90f19,2,invoiced,143000.0,94980.0
315,316,71840,30632,39,2174,2174.0,20180218,20180220,,,20180306,20180226,0584a31bb6abbd3dcc27e2d464ac863d,1,canceled,12900.0,8720.0
344,345,72510,5795,216,2335,2335.0,20170628,20170628,,,20170720,20170704,05eb212cd5b01c76bdb5215a49df19b6,1,processing,37500.0,16110.0


In [86]:
# Odd case: rows with a delivered date but no pickup date. A product normally
# has to be picked up by the delivery service before it can be delivered.
fct_order_item[
    fct_order_item["pickup_date"].isna() & fct_order_item["delivered_date"].notna()
]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
2323,2324,39119,15909,2375,16426,16426.0,20170929,20170929,,20171120,20171114,20171018,2aa91108853cecb43c84a5dc5b277475,1,delivered,179000.0,14980.0


In [87]:
# Rows that were picked up but have no delivered date. There are many such
# cases -- the reason is investigated by order status in the following cells.
fct_order_item[
    fct_order_item["pickup_date"].notna() & fct_order_item["delivered_date"].isna()
]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
27,28,8976,17696,2662,237,237.0,20180817,20180817,20180818,,20180918,20180821,00a99c50fdff7e36262caba33821875a,1,shipped,52990.0,22300.0
153,154,34404,19492,2760,1119,1119.0,20170323,20170323,20170328,,20170418,20170329,02ea547b6d2ee25305588fd50df58b46,1,shipped,23990.0,15650.0
156,157,43402,7849,1685,1133,1133.0,20171127,20171128,20171129,,20171211,20171204,02f30be57375dc610c40ca989d2385b7,1,shipped,69000.0,11730.0
161,162,64895,26949,2809,1178,1178.0,20180615,20180616,20180618,,20180713,20180620,03138c298bfe1ee7855eefc9442346a8,1,shipped,40000.0,19470.0
354,355,16059,6415,1717,2439,2439.0,20161007,20161007,20161030,,20161201,20161021,063b573b88fc80e516aba87df524f809,1,shipped,69900.0,17630.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112196,112197,51063,2993,614,96408,96407.0,20180626,20180629,20180704,,20180723,20180705,f8385fa26ee1a262ff80d5a3181a63e5,1,shipped,34000.0,23000.0
112215,112216,95820,27751,1066,96517,96516.0,20180609,20180609,20180611,,20180626,20180615,f8812e9ef9494484aad51ddcd52ed382,1,shipped,49980.0,7390.0
112456,112457,87966,26097,1497,98113,98112.0,20180313,20180313,20180313,,20180323,20180319,fc947e5f82e5354dd3d971c9dbfabf93,1,shipped,69900.0,8880.0
112480,112481,78300,8297,2747,98254,98253.0,20180329,20180329,20180402,,20180507,20180404,fceed72ef6fde5b43c8bcefbdb223edc,1,shipped,139000.0,54720.0


In [88]:
# Picked up but not delivered, restricted to items whose status is "shipped".
fct_order_item[
    fct_order_item["pickup_date"].notna()
    & fct_order_item["delivered_date"].isna()
    & fct_order_item["order_item_status"].eq("shipped")
]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
27,28,8976,17696,2662,237,237.0,20180817,20180817,20180818,,20180918,20180821,00a99c50fdff7e36262caba33821875a,1,shipped,52990.0,22300.0
153,154,34404,19492,2760,1119,1119.0,20170323,20170323,20170328,,20170418,20170329,02ea547b6d2ee25305588fd50df58b46,1,shipped,23990.0,15650.0
156,157,43402,7849,1685,1133,1133.0,20171127,20171128,20171129,,20171211,20171204,02f30be57375dc610c40ca989d2385b7,1,shipped,69000.0,11730.0
161,162,64895,26949,2809,1178,1178.0,20180615,20180616,20180618,,20180713,20180620,03138c298bfe1ee7855eefc9442346a8,1,shipped,40000.0,19470.0
354,355,16059,6415,1717,2439,2439.0,20161007,20161007,20161030,,20161201,20161021,063b573b88fc80e516aba87df524f809,1,shipped,69900.0,17630.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112196,112197,51063,2993,614,96408,96407.0,20180626,20180629,20180704,,20180723,20180705,f8385fa26ee1a262ff80d5a3181a63e5,1,shipped,34000.0,23000.0
112215,112216,95820,27751,1066,96517,96516.0,20180609,20180609,20180611,,20180626,20180615,f8812e9ef9494484aad51ddcd52ed382,1,shipped,49980.0,7390.0
112456,112457,87966,26097,1497,98113,98112.0,20180313,20180313,20180313,,20180323,20180319,fc947e5f82e5354dd3d971c9dbfabf93,1,shipped,69900.0,8880.0
112480,112481,78300,8297,2747,98254,98253.0,20180329,20180329,20180402,,20180507,20180404,fceed72ef6fde5b43c8bcefbdb223edc,1,shipped,139000.0,54720.0


In [89]:
# Picked up but not delivered, restricted to items whose status is "canceled".
fct_order_item[
    fct_order_item["pickup_date"].notna()
    & fct_order_item["delivered_date"].isna()
    & fct_order_item["order_item_status"].eq("canceled")
]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
1666,1667,71812,9453,2379,11633,11633.0,20171221,20171223,20180130,,20180220,20180129,1def7af24cb7182497acfd953f6dd88b,1,canceled,244690.0,54310.0
2472,2473,48019,4511,1133,17325,17325.0,20180219,20180219,20180220,,20180315,20180223,2cfc79d9582e9135c0a9b61fa60e6b21,1,canceled,59900.0,19660.0
2607,2608,47872,8297,2747,18306,18306.0,20180120,20180120,20180122,,20180220,20180125,2f6492ebb2badd06db511b63ae7d7f55,1,canceled,149000.0,40370.0
4003,4004,96050,1659,2130,27959,27959.0,20180121,20180122,20180201,,20180219,20180126,47e96b5cb6f0592d336b6bf8a817d1c3,1,canceled,120900.0,16610.0
5007,5008,47829,1514,1043,35182,35182.0,20180128,20180128,20180201,,20180220,20180201,5abf37ab57e23f69a9c8ab98c4800273,1,canceled,158900.0,15870.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104227,104228,47828,31022,2346,39929,39929.0,20180205,20180205,20180207,,20180312,20180214,66dbde751178d7674bd09f18b6d25006,1,canceled,79000.0,38100.0
104468,104469,95994,31738,473,41477,41477.0,20180125,20180125,20180126,,20180223,20180131,6ad336dba0613cde0e5717f50dc8c669,1,canceled,69900.0,16930.0
109101,109102,72027,27887,1549,74150,74150.0,20180123,20180123,20180124,,20180222,20180129,bef875448d5a4f703555d2dd5f8382b5,1,canceled,139900.0,19220.0
109818,109819,23815,1494,1043,78999,78998.0,20180130,20180130,20180201,,20180222,20180205,cb599c234fc71be7c137179b6d473a30,1,canceled,35900.0,15100.0


In [90]:
# Picked up but not delivered, excluding the "canceled" and "shipped" statuses.
# The remaining 7 records are marked delivered but have no delivered date;
# estimated_time_delivery might be used as a substitute for them.
fct_order_item[
    fct_order_item["pickup_date"].notna()
    & fct_order_item["delivered_date"].isna()
    & ~fct_order_item["order_item_status"].isin(["canceled", "shipped"])
]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
9442,9443,43057,1772,139,66407,66407.0,20180608,20180608,20180612,,20180626,20180618,ab7c89dc1bf4a1ead9d6ec1ec8968a84,1,delivered,110990.0,9130.0
12701,12702,53587,2966,2961,89705,89704.0,20180701,20180701,20180703,,20180730,20180705,e69f75a717d64fc5ecdfae42b2e8e086,1,delivered,139000.0,19070.0
16585,16586,26617,2966,2961,18042,18042.0,20180701,20180701,20180703,,20180730,20180705,2ebdfc4f15f23b91474edf87475f108e,1,delivered,139000.0,19070.0
27419,27420,4381,20236,2383,95491,95490.0,20180620,20180620,20180625,,20180716,20180626,f5dd62b788049ad9fc0526e3ad11a097,1,delivered,329000.0,25240.0
28725,28726,21874,25466,2082,5169,5169.0,20180701,20180701,20180703,,20180724,20180705,0d3268bad9b086af767785e3f0fc0133,1,delivered,188990.0,15630.0
72831,72832,25806,29655,1685,17372,17372.0,20171128,20171128,20171130,,20171218,20171204,2d1e2d5bf4dc7227b3bfebb81328c15f,1,delivered,117300.0,17530.0
100462,100463,50133,30579,1809,12775,12775.0,20180627,20180627,20180703,,20180719,20180703,20edc82cf5400ce95e1afacc25798b31,1,delivered,45900.0,9070.0


In [91]:
# Why is the payment reference missing? All affected rows share one order_id,
# so the payment for that order was possibly never recorded.
fct_order_item[fct_order_item["payment_id_surr"].isna()]


Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,payment_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
27996,27997,36127,9840,1717,74449,,20160915,20160915,20161107,20161109,20161004,20160919,bfbd0f9bdef84302105ad712db648a6c,1,delivered,44990.0,2830.0
27997,27998,36127,9840,1717,74449,,20160915,20160915,20161107,20161109,20161004,20160919,bfbd0f9bdef84302105ad712db648a6c,2,delivered,44990.0,2830.0
27998,27999,36127,9840,1717,74449,,20160915,20160915,20161107,20161109,20161004,20160919,bfbd0f9bdef84302105ad712db648a6c,3,delivered,44990.0,2830.0


### Check reference

In [92]:
# Reference check: attach user attributes to every fact row through user_id.
# With an inner join, fact rows without a matching user would be dropped.
merged = fct_order_item.merge(user, how="inner", on="user_id")
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   payment_id_surr          112647 non-null  float64
 6   order_date               112650 non-null  object 
 7   order_approved_date      112635 non-null  object 
 8   pickup_date              111456 non-null  object 
 9   delivered_date           110196 non-null  object 
 10  estimated_time_delivery  112650 non-null  object 
 11  pickup_limit_date        112650 non-null  object 
 12  order_id                 112650 non-null  object 
 13  item_number              112650 non-null  int64  
 14  orde

In [93]:
# Reference check: attach feedback attributes through feedback_id_surr.
# Both frames carry order_id, so the result holds order_id_x and order_id_y.
merged = merged.merge(feedback, how="inner", on="feedback_id_surr")
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   payment_id_surr          112647 non-null  float64
 6   order_date               112650 non-null  object 
 7   order_approved_date      112635 non-null  object 
 8   pickup_date              111456 non-null  object 
 9   delivered_date           110196 non-null  object 
 10  estimated_time_delivery  112650 non-null  object 
 11  pickup_limit_date        112650 non-null  object 
 12  order_id_x               112650 non-null  object 
 13  item_number              112650 non-null  int64  
 14  orde

In [94]:
# Reference check: attach product attributes through product_id_surr.
merged = merged.merge(product, how="inner", on="product_id_surr")
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 36 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          112650 non-null  int64  
 1   user_id                     112650 non-null  int64  
 2   product_id_surr             112650 non-null  int64  
 3   seller_id_surr              112650 non-null  int64  
 4   feedback_id_surr            112650 non-null  int64  
 5   payment_id_surr             112647 non-null  float64
 6   order_date                  112650 non-null  object 
 7   order_approved_date         112635 non-null  object 
 8   pickup_date                 111456 non-null  object 
 9   delivered_date              110196 non-null  object 
 10  estimated_time_delivery     112650 non-null  object 
 11  pickup_limit_date           112650 non-null  object 
 12  order_id_x                  112650 non-null  object 
 13  item_number   

In [95]:
# Reference check: attach seller attributes through seller_id_surr.
# Both sides carry an `is_current_version` column here. With the default
# `_x`/`_y` suffixes it would be renamed onto the `_x`/`_y` columns already
# created by the feedback merge, producing duplicate column names (recent
# pandas versions raise MergeError for that). Explicit suffixes keep the
# left-hand name unchanged and tag the seller column unambiguously.
merged = pd.merge(
    merged,
    seller,
    how="inner",
    on="seller_id_surr",
    suffixes=("", "_seller"),
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 41 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          112650 non-null  int64  
 1   user_id                     112650 non-null  int64  
 2   product_id_surr             112650 non-null  int64  
 3   seller_id_surr              112650 non-null  int64  
 4   feedback_id_surr            112650 non-null  int64  
 5   payment_id_surr             112647 non-null  float64
 6   order_date                  112650 non-null  object 
 7   order_approved_date         112635 non-null  object 
 8   pickup_date                 111456 non-null  object 
 9   delivered_date              110196 non-null  object 
 10  estimated_time_delivery     112650 non-null  object 
 11  pickup_limit_date           112650 non-null  object 
 12  order_id_x                  112650 non-null  object 
 13  item_number   

In [96]:
# Reference check: attach payment attributes through payment_id_surr. A left
# join keeps the fact rows whose payment_id_surr is null.
merged = merged.merge(payment, how="left", on="payment_id_surr")
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 56 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          112650 non-null  int64  
 1   user_id                     112650 non-null  int64  
 2   product_id_surr             112650 non-null  int64  
 3   seller_id_surr              112650 non-null  int64  
 4   feedback_id_surr            112650 non-null  int64  
 5   payment_id_surr             112647 non-null  float64
 6   order_date                  112650 non-null  object 
 7   order_approved_date         112635 non-null  object 
 8   pickup_date                 111456 non-null  object 
 9   delivered_date              110196 non-null  object 
 10  estimated_time_delivery     112650 non-null  object 
 11  pickup_limit_date           112650 non-null  object 
 12  order_id_x                  112650 non-null  object 
 13  item_number   