# ETL Quality Checking
A notebook for checking the results of the ETL query process.

Scripts :
* dim_populate.sql
* fct_populate.sql

## Notebook Sanity Check

In [1]:
from dotenv import load_dotenv
import os

# Load the variables from the local .env file once. verbose=True asks
# python-dotenv to emit a warning when the .env file is missing.
load_dotenv(verbose=True)

# NOTE(review): USER and HOSTNAME are names the shell/OS commonly sets already,
# and load_dotenv() does not override existing environment variables by
# default — confirm these resolve to the database values, not the OS ones.
DBNAME= os.getenv("DBNAME")
HOSTNAME= os.getenv("HOSTNAME")
USER= os.getenv("USER")
PASS= os.getenv("PASS")
STGDBNAME = os.getenv("STAGING_DBNAME")

# Fail early with a readable message instead of letting a literal "None" end up
# inside the connection URL built from these values.
_missing = [
    name
    for name, value in {"DBNAME": DBNAME, "HOSTNAME": HOSTNAME, "USER": USER, "PASS": PASS}.items()
    if value is None
]
if _missing:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(_missing)}")

In [2]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus

# Build the connection URL. The user name and password are URL-escaped so that
# special characters such as "@", ":" or "/" in the credentials cannot be
# misread as URL delimiters.
db_url = f'postgresql+psycopg2://{quote_plus(USER)}:{quote_plus(PASS)}@{HOSTNAME}/{DBNAME}'

# Create an engine instance; pooled connections are recycled after one hour.
alchemyEngine = create_engine(db_url, pool_recycle=3600)

# Connect to PostgreSQL server
conn = alchemyEngine.connect()

In [3]:
# Showing Missing Data
def show_missing_data(df):
    """Print the shape of ``df`` followed by the number of nulls in each column.

    Nothing is returned; the report is written to standard output only.
    """
    dims = df.shape
    null_counts = df.isna().sum()
    print(f"Shape : {dims}")
    print(f"Missing Data : {null_counts}")

In [4]:
# Global var: database schema passed to the read_sql_table() calls below
schema = "staging"

## Date dimension

In [5]:
date = pd.read_sql_table("dim_date", conn, schema=schema)
date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3651 entries, 0 to 3650
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date_id                 3651 non-null   object        
 1   date                    3651 non-null   datetime64[ns]
 2   day_name                3651 non-null   object        
 3   day_of_week             3651 non-null   int64         
 4   day_of_month            3651 non-null   int64         
 5   day_of_quarter          3651 non-null   int64         
 6   day_of_year             3651 non-null   float64       
 7   week_of_month           3651 non-null   int64         
 8   week_of_year            3651 non-null   float64       
 9   month_actual            3651 non-null   float64       
 10  month_name              3651 non-null   object        
 11  month_name_abbreviated  3651 non-null   object        
 12  quarter                 3651 non-null   float64 

In [6]:
show_missing_data(date)

Shape : (3651, 15)
Missing Data : date_id                   0
date                      0
day_name                  0
day_of_week               0
day_of_month              0
day_of_quarter            0
day_of_year               0
week_of_month             0
week_of_year              0
month_actual              0
month_name                0
month_name_abbreviated    0
quarter                   0
year                      0
isWeekend                 0
dtype: int64


In [7]:
date.head(5)

Unnamed: 0,date_id,date,day_name,day_of_week,day_of_month,day_of_quarter,day_of_year,week_of_month,week_of_year,month_actual,month_name,month_name_abbreviated,quarter,year,isWeekend
0,20150101,2015-01-01,Thursday,4,1,1,1.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
1,20150102,2015-01-02,Friday,5,2,2,2.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
2,20150103,2015-01-03,Saturday,6,3,3,3.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
3,20150104,2015-01-04,Sunday,7,4,4,4.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
4,20150105,2015-01-05,Monday,1,5,5,5.0,1,2.0,1.0,January,Jan,1.0,2015.0,False


In [8]:
date.tail(5)

Unnamed: 0,date_id,date,day_name,day_of_week,day_of_month,day_of_quarter,day_of_year,week_of_month,week_of_year,month_actual,month_name,month_name_abbreviated,quarter,year,isWeekend
3646,20241225,2024-12-25,Wednesday,3,25,86,360.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3647,20241226,2024-12-26,Thursday,4,26,87,361.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3648,20241227,2024-12-27,Friday,5,27,88,362.0,4,52.0,12.0,December,Dec,4.0,2024.0,False
3649,20241228,2024-12-28,Saturday,6,28,89,363.0,4,52.0,12.0,December,Dec,4.0,2024.0,True
3650,20241229,2024-12-29,Sunday,7,29,90,364.0,5,52.0,12.0,December,Dec,4.0,2024.0,True


## User dimension

In [9]:
user = pd.read_sql_table("dim_user", conn, schema=schema)
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96096 entries, 0 to 96095
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             96096 non-null  int64  
 1   user_name           96096 non-null  object 
 2   total_order         96096 non-null  float64
 3   total_spending      96096 non-null  float64
 4   is_current_version  96096 non-null  bool   
dtypes: bool(1), float64(2), int64(1), object(1)
memory usage: 3.0+ MB


In [10]:
show_missing_data(user)

# Note: a few users (1113 to be exact) made no transaction during the period covered by the collected data.

Shape : (96096, 5)
Missing Data : user_id               0
user_name             0
total_order           0
total_spending        0
is_current_version    0
dtype: int64


In [11]:
user.head()

Unnamed: 0,user_id,user_name,total_order,total_spending,is_current_version
0,96097,00053a61a98854899e70ed204dd4bafe,1.0,419180.0,True
1,96098,0005e1862207bf6ccc02e4228effd9a0,1.0,150120.0,True
2,96099,00090324bbad0e9342388303bb71ba0a,1.0,63660.0,True
3,96100,000bfa1d2f1a41876493be685390d6d3,1.0,46850.0,True
4,96101,000c8bdb58a29e7115cfc257230fb21b,1.0,29000.0,True


In [12]:
user.tail()

Unnamed: 0,user_id,user_name,total_order,total_spending,is_current_version
96091,192188,1b32669eb9662ee904419de883e59a58,1.0,0.0,True
96092,192189,5942dde582a33e31ea4471bc5363b0f3,1.0,0.0,True
96093,192190,daba2e7a00c149161c68cbb18db656a9,1.0,0.0,True
96094,192191,7d373e92dd3086b4c37e9868fc8999c1,1.0,0.0,True
96095,192192,c1653f4d5fcfb808bee93bb3e1aa4744,1.0,0.0,True


## Product dimension

In [13]:
product = pd.read_sql_table("dim_product", conn, schema=schema)
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id_surr             32951 non-null  int64  
 1   product_id                  32951 non-null  object 
 2   product_category            32328 non-null  object 
 3   product_name_length         32341 non-null  float64
 4   product_description_length  32341 non-null  float64
 5   product_photos_qty          32341 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_weight_g            32949 non-null  float64
 8   product_height_cm           32949 non-null  float64
 9   product_width_cm            32949 non-null  float64
 10  is_current_version          32951 non-null  bool   
dtypes: bool(1), float64(7), int64(1), object(2)
memory usage: 2.5+ MB


In [14]:
# There are some missing values from the data
show_missing_data(product)

Shape : (32951, 11)
Missing Data : product_id_surr                 0
product_id                      0
product_category              623
product_name_length           610
product_description_length    610
product_photos_qty            610
product_length_cm               2
product_weight_g                2
product_height_cm               2
product_width_cm                2
is_current_version              0
dtype: int64


In [15]:
# Note: product_name_length, product_description_length and product_photos_qty each have the same number of nulls;
# this selects the rows where all three are null at the same time.
product.loc[product.product_name_length.isna() & product.product_description_length.isna() & product.product_photos_qty.isna()]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
105,33057,a41e356c76fab66334f36de622ecbd3a,,,,,17.0,650.0,14.0,12.0,True
128,33080,d8dee61c2034d6d075997acef1870e9b,,,,,16.0,300.0,7.0,20.0,True
145,33097,56139431d72cd51f19eb9f7dae4d1617,,,,,20.0,200.0,20.0,20.0,True
154,33106,46b48281eb6d663ced748f324108c733,,,,,41.0,18500.0,30.0,41.0,True
197,33149,5fb61f482620cb672f5e586bb132eae9,,,,,35.0,300.0,7.0,12.0,True
...,...,...,...,...,...,...,...,...,...,...,...
32515,65467,b0a0c5dd78e644373b199380612c350a,,,,,30.0,1800.0,20.0,70.0,True
32589,65541,10dbe0fbaa2c505123c17fdc34a63c56,,,,,30.0,800.0,10.0,23.0,True
32616,65568,bd2ada37b58ae94cc838b9c0569fecd8,,,,,21.0,200.0,8.0,16.0,True
32772,65724,fa51e914046aab32764c41356b9d4ea4,,,,,45.0,1300.0,16.0,45.0,True


In [16]:
# Note: selects the rows where product_length_cm, product_weight_g and product_width_cm are all null,
# to check that these measurement columns are missing on the same rows.
# NOTE(review): product_height_cm has the same null count but is not part of this filter — confirm whether it should be.
product.loc[product.product_length_cm.isna() & product.product_weight_g.isna() & product.product_width_cm.isna()]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
8595,41536,09ff539a621711667c43eba6a3bd8466,baby,60.0,865.0,3.0,,,,,True
18857,51806,5eb564652db742ff8f28759cd8d2652a,,,,,,,,,True


In [17]:
# Note: intended to show that rows with a missing product_category are the same rows that miss the three
# columns detected earlier (name length, description length, photos qty); a derived column might be
# added later to help understand the business problem.
# NOTE(review): the filter uses OR, so it returns rows matching either condition and does not by itself
# prove the overlap; the null counts also differ (623 for product_category vs 610 for the other three) — verify.
product.loc[product.product_category.isna() | (product.product_name_length.isna() & product.product_description_length.isna() & product.product_photos_qty.isna())]

Unnamed: 0,product_id_surr,product_id,product_category,product_name_length,product_description_length,product_photos_qty,product_length_cm,product_weight_g,product_height_cm,product_width_cm,is_current_version
105,33057,a41e356c76fab66334f36de622ecbd3a,,,,,17.0,650.0,14.0,12.0,True
128,33080,d8dee61c2034d6d075997acef1870e9b,,,,,16.0,300.0,7.0,20.0,True
145,33097,56139431d72cd51f19eb9f7dae4d1617,,,,,20.0,200.0,20.0,20.0,True
154,33106,46b48281eb6d663ced748f324108c733,,,,,41.0,18500.0,30.0,41.0,True
197,33149,5fb61f482620cb672f5e586bb132eae9,,,,,35.0,300.0,7.0,12.0,True
...,...,...,...,...,...,...,...,...,...,...,...
32515,65467,b0a0c5dd78e644373b199380612c350a,,,,,30.0,1800.0,20.0,70.0,True
32589,65541,10dbe0fbaa2c505123c17fdc34a63c56,,,,,30.0,800.0,10.0,23.0,True
32616,65568,bd2ada37b58ae94cc838b9c0569fecd8,,,,,21.0,200.0,8.0,16.0,True
32772,65724,fa51e914046aab32764c41356b9d4ea4,,,,,45.0,1300.0,16.0,45.0,True


## Seller dimension

In [18]:
seller = pd.read_sql_table("dim_seller", conn, schema=schema)
seller.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   seller_id_surr      3095 non-null   int64 
 1   seller_id           3095 non-null   object
 2   seller_zip_code     3095 non-null   int64 
 3   seller_city         3095 non-null   object
 4   seller_state        3095 non-null   object
 5   is_current_version  3095 non-null   bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 124.0+ KB


In [19]:
show_missing_data(seller)

Shape : (3095, 6)
Missing Data : seller_id_surr        0
seller_id             0
seller_zip_code       0
seller_city           0
seller_state          0
is_current_version    0
dtype: int64


In [20]:
# TODO : Add num_product_order & total_revenue in data mart for seller subject -> combine with fct_order_items
seller.head(5)

Unnamed: 0,seller_id_surr,seller_id,seller_zip_code,seller_city,seller_state,is_current_version
0,3096,3442f8959a84dea7ee197c632cb2df15,13023,KOTA JAKARTA TIMUR,DKI JAKARTA,True
1,3097,d1b65fc7debc3361ea86b5f14c68d2e2,13844,KOTA PADANG PANJANG,SUMATERA BARAT,True
2,3098,ce3ad9de960102d0677a81f5d0bb7b2d,20031,KOTA JAKARTA BARAT,DKI JAKARTA,True
3,3099,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,KOTA TANGERANG,BANTEN,True
4,3100,51a04a8a6bdcb23deccc82b0b80742cf,12914,KABUPATEN LAMONGAN,JAWA TIMUR,True


## Feedback dimension

In [21]:
feedback = pd.read_sql_table("dim_feedback", conn, schema=schema)
feedback.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   feedback_id_surr         99441 non-null  int64  
 1   order_id                 99441 non-null  object 
 2   feedback_avg_score       99441 non-null  float64
 3   feedback_form_sent_date  99441 non-null  object 
 4   feedback_answer_date     99441 non-null  object 
 5   is_current_version       99441 non-null  bool   
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 3.9+ MB


In [22]:
# Note: feedback in this dimension is measured per order, under the assumption that an order should have a
# single feedback; multiple feedback records for one order are treated as multiple forms sent for that
# order and are aggregated.

show_missing_data(feedback)

Shape : (99441, 6)
Missing Data : feedback_id_surr           0
order_id                   0
feedback_avg_score         0
feedback_form_sent_date    0
feedback_answer_date       0
is_current_version         0
dtype: int64


In [23]:
feedback.head()

Unnamed: 0,feedback_id_surr,order_id,feedback_avg_score,feedback_form_sent_date,feedback_answer_date,is_current_version
0,198883,00010242fe8c5a6d1ba2dd792cb16214,5.0,20170921,20170922,True
1,198884,00018f77f2f0320c557190d7a144bdd3,4.0,20170513,20170515,True
2,198885,000229ec398224ef6ca0657da4fc703e,5.0,20180123,20180123,True
3,198886,00024acbcdf0a6daa1e931b038114c75,4.0,20180815,20180815,True
4,198887,00042b26cf59d7ce69dfabb4e55b4fd9,5.0,20170302,20170303,True


## Payment Facts
---
Granularity: one row per payment (payment installment / *cicilan*) that a user makes after a transaction.

In [24]:
fct_payment = pd.read_sql_table("fct_payment", conn, schema=schema)
fct_payment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    103886 non-null  int64  
 1   feedback_id_surr      103886 non-null  int64  
 2   user_id               103886 non-null  int64  
 3   payment_sequential    103886 non-null  int64  
 4   payment_type          103886 non-null  object 
 5   payment_installments  103886 non-null  int64  
 6   payment_value         103886 non-null  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 5.5+ MB


In [25]:
show_missing_data(fct_payment)

Shape : (103886, 7)
Missing Data : id                      0
feedback_id_surr        0
user_id                 0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64


### Check Reference

In [26]:
# Check the reference from fct_payment to the feedback dimension.
# validate="many_to_one" makes pandas raise if feedback_id_surr is duplicated in
# the dimension, and the assertion fails if the inner join dropped any payment
# row (i.e. a payment points at a feedback_id_surr that does not exist).
merged = pd.merge(fct_payment, feedback, how="inner", on="feedback_id_surr", validate="many_to_one")
assert len(merged) == len(fct_payment), "fct_payment has rows without a matching dim_feedback record"
merged.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 103886 entries, 0 to 103885
Data columns (total 12 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       103886 non-null  int64  
 1   feedback_id_surr         103886 non-null  int64  
 2   user_id                  103886 non-null  int64  
 3   payment_sequential       103886 non-null  int64  
 4   payment_type             103886 non-null  object 
 5   payment_installments     103886 non-null  int64  
 6   payment_value            103886 non-null  float64
 7   order_id                 103886 non-null  object 
 8   feedback_avg_score       103886 non-null  float64
 9   feedback_form_sent_date  103886 non-null  object 
 10  feedback_answer_date     103886 non-null  object 
 11  is_current_version       103886 non-null  bool   
dtypes: bool(1), float64(2), int64(5), object(4)
memory usage: 9.6+ MB


In [27]:
show_missing_data(merged)

Shape : (103886, 12)
Missing Data : id                         0
feedback_id_surr           0
user_id                    0
payment_sequential         0
payment_type               0
payment_installments       0
payment_value              0
order_id                   0
feedback_avg_score         0
feedback_form_sent_date    0
feedback_answer_date       0
is_current_version         0
dtype: int64


In [28]:
# Extend the reference check to the user dimension. Duplicate user_id values in
# the dimension raise via validate=, and rows dropped by the inner join (payments
# whose user_id is not in dim_user) trip the assertion.
# NOTE: this cell overwrites `merged`; re-running it on its own merges `user` a
# second time, so re-run from the cell that first builds `merged`.
rows_before = len(merged)
merged = pd.merge(merged, user, how="inner", on="user_id", validate="many_to_one")
assert len(merged) == rows_before, "fct_payment has rows without a matching dim_user record"
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103886 entries, 0 to 103885
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       103886 non-null  int64  
 1   feedback_id_surr         103886 non-null  int64  
 2   user_id                  103886 non-null  int64  
 3   payment_sequential       103886 non-null  int64  
 4   payment_type             103886 non-null  object 
 5   payment_installments     103886 non-null  int64  
 6   payment_value            103886 non-null  float64
 7   order_id                 103886 non-null  object 
 8   feedback_avg_score       103886 non-null  float64
 9   feedback_form_sent_date  103886 non-null  object 
 10  feedback_answer_date     103886 non-null  object 
 11  is_current_version_x     103886 non-null  bool   
 12  user_name                103886 non-null  object 
 13  total_order              103886 non-null  float64
 14  tota

In [29]:
show_missing_data(merged)
# NOTE(review): the original note here read "missing total_spending due to the problem" without naming the
# problem, while the recorded output reports 0 nulls for total_spending — the note looks stale; confirm or remove.

Shape : (103886, 16)
Missing Data : id                         0
feedback_id_surr           0
user_id                    0
payment_sequential         0
payment_type               0
payment_installments       0
payment_value              0
order_id                   0
feedback_avg_score         0
feedback_form_sent_date    0
feedback_answer_date       0
is_current_version_x       0
user_name                  0
total_order                0
total_spending             0
is_current_version_y       0
dtype: int64


## Order items (Transaction) Facts

In [31]:
fct_order_item = pd.read_sql_table("fct_order_items", conn, schema=schema)
fct_order_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   order_date               112650 non-null  object 
 6   order_approved_date      112635 non-null  object 
 7   pickup_date              111456 non-null  object 
 8   delivered_date           110196 non-null  object 
 9   estimated_time_delivery  112650 non-null  object 
 10  pickup_limit_date        112650 non-null  object 
 11  order_id                 112650 non-null  object 
 12  item_number              112650 non-null  int64  
 13  order_item_status        112650 non-null  object 
 14  pric

In [33]:
show_missing_data(fct_order_item)

Shape : (112650, 16)
Missing Data : id                            0
user_id                       0
product_id_surr               0
seller_id_surr                0
feedback_id_surr              0
order_date                    0
order_approved_date          15
pickup_date                1194
delivered_date             2454
estimated_time_delivery       0
pickup_limit_date             0
order_id                      0
item_number                   0
order_item_status             0
price                         0
shipping_cost                 0
dtype: int64


In [34]:
# Why is order_approved_date missing? Possibly the order had not been approved yet when the data was recorded,
# but the affected orders are mostly dated 2017-02 — what happened there?
fct_order_item.loc[fct_order_item.order_approved_date.isna()]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
4806,4807,119072,39191,5303,215699,20170218,,20170222,20170303,20170331,20170222,2babbb4b15e6d2dfe95e2de765c97bce,1,delivered,79990.0,26820.0
6670,6671,161961,41737,3434,222195,20170217,,20170222,20170303,20170323,20170222,3c0b8706b065f9919d0505d3b3343881,1,delivered,133990.0,23200.0
9079,9080,179536,64367,5203,230743,20170218,,20170223,20170307,20170329,20170228,51eb2eebd5d76a24625b31c33dd41449,1,delivered,59900.0,17160.0
30223,30224,162697,39191,5303,206163,20170217,,20170222,20170302,20170320,20170221,12a95a3c06dbaec84bcfb0e2da5d228a,1,delivered,79990.0,15770.0
61406,61407,104507,49790,3295,216998,20170217,,20170222,20170303,20170320,20170221,2eecb0d85f281280f79fa00f9cec1a95,1,delivered,135000.0,19230.0
68583,68584,187965,48207,5203,242408,20170119,,20170127,20170206,20170316,20170129,7002a78c79c519ac54022d4f8a65e6e8,1,delivered,45900.0,14520.0
71458,71459,157567,48310,5094,252534,20170218,,20170223,20170302,20170321,20170222,8a9adc69528e1001fc68dd0aaebbb54a,1,delivered,379000.0,17860.0
77618,77619,178641,45567,5408,274157,20170119,,20170125,20170130,20170301,20170123,c1d4211b3dae76144deccd6c74144a88,1,delivered,39990.0,14520.0
79951,79952,124346,51208,5203,282356,20170219,,20170223,20170302,20170327,20170226,d69e5d356402adc8cf17e08b5033acfb,1,delivered,149800.0,13630.0
81006,81007,181115,53718,3179,286138,20170218,,20170223,20170301,20170317,20170222,e04abd8149ef81b95221e88f6ed9ab6a,1,delivered,309900.0,39110.0


In [37]:
fct_order_item.sort_values(by="order_date", ascending=False).head()

# Conclusion: the rows with missing data are not simply the most recently recorded orders. Root cause still unknown — needs more exploration.

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
9312,9313,167827,54696,5725,231582,20180903,20180903,20180904,,20180906,20180905,54282e97f61c23b78330c15b154c867d,1,shipped,145000.0,21460.0
79996,79997,99169,40110,5903,282513,20180829,20180829,20180829,20180830.0,20180904,20180903,d70442bc5e3cb7438da497cc6a210f80,1,delivered,6900.0,7390.0
426,427,137147,43512,3507,200416,20180829,20180829,20180829,20180830.0,20180903,20180831,03ef5dedbe7492bdae72eec50764c43f,1,delivered,24900.0,8330.0
55734,55735,136945,51636,3893,296465,20180829,20180829,20180829,20180830.0,20180904,20180831,fb393211459aac00af932cd7ab4fa2cc,1,delivered,99000.0,7950.0
22017,22018,143057,37810,5975,276711,20180829,20180829,20180829,20180830.0,20180903,20180831,c84d88553f9878bf2c7ecda2eb211ece,1,delivered,65000.0,9210.0


In [38]:
# Why is pickup_date missing? Is it because of the order status?
fct_order_item.loc[fct_order_item.pickup_date.isna()].head()

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
104,105,113317,50814,5873,199263,20170410,20170410,,,20170504,20170414,00ff0cf5583758e6964723e42f111bf4,1,canceled,154900.0,15250.0
143,144,127154,61601,3912,199405,20170131,20170202,,,20170309,20170204,015fb6b5f739788434fa690540f90f19,1,invoiced,143000.0,94980.0
144,145,127154,61601,3912,199405,20170131,20170202,,,20170309,20170204,015fb6b5f739788434fa690540f90f19,2,invoiced,143000.0,94980.0
278,279,100074,41471,5981,199894,20161009,20161010,,,20161212,20161014,02a0eb7c22b0616c767a45954a2a28f6,1,invoiced,119500.0,25040.0
360,361,156381,40615,4625,200186,20180325,20180325,,,20180406,20180329,035b790fa740b68de2d6f1a74f9b2098,1,invoiced,63990.0,8290.0


In [41]:
fct_order_item.loc[fct_order_item.pickup_date.isna() & (~fct_order_item.delivered_date.isna())]

# Odd: how can a product be delivered without ever being picked up? Normally the delivery service picks the product up first and then delivers it.

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
4684,4685,135215,48860,5470,215308,20170929,20170929,,20171120,20171114,20171018,2aa91108853cecb43c84a5dc5b277475,1,delivered,179000.0,14980.0


In [43]:
fct_order_item.loc[~fct_order_item.pickup_date.isna() & (fct_order_item.delivered_date.isna())]
# There are many cases where the product was picked up but has no delivered date — why?

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
62,63,105072,50647,5757,199119,20180817,20180817,20180818,,20180918,20180821,00a99c50fdff7e36262caba33821875a,1,shipped,52990.0,22300.0
303,304,130500,52443,5855,200001,20170323,20170323,20170328,,20170418,20170329,02ea547b6d2ee25305588fd50df58b46,1,shipped,23990.0,15650.0
305,306,128038,41776,4841,200006,20170428,20170428,20170511,,20170524,20170505,02ec4da9d03014f06d711d60eb37cc22,1,shipped,85000.0,20080.0
307,308,139498,40800,4780,200015,20171127,20171128,20171129,,20171211,20171204,02f30be57375dc610c40ca989d2385b7,1,shipped,69000.0,11730.0
323,324,160991,59900,5904,200060,20180615,20180616,20180618,,20180713,20180620,03138c298bfe1ee7855eefc9442346a8,1,shipped,40000.0,19470.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112328,112329,174396,41248,5842,297136,20180329,20180329,20180402,,20180507,20180404,fceed72ef6fde5b43c8bcefbdb223edc,1,shipped,139000.0,54720.0
112459,112460,175147,38776,4509,297679,20171126,20171126,20171130,,20171219,20171204,fe4979a6778e2fa4ae731cbe532d30e0,1,shipped,89990.0,15390.0
112474,112475,155953,38108,4533,297741,20180328,20180328,20180328,,20180419,20180403,fe68d8f20f2d6e9d702a74d3bfa9c4fa,1,shipped,199000.0,19270.0
112531,112532,127630,54972,5000,297935,20170925,20170925,20170927,,20171024,20170929,feebdfdb4759bd4600d52547c81b4fa8,1,shipped,24900.0,21150.0


In [45]:
fct_order_item.loc[~fct_order_item.pickup_date.isna() & (fct_order_item.delivered_date.isna()) & (fct_order_item.order_item_status == "shipped")]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
62,63,105072,50647,5757,199119,20180817,20180817,20180818,,20180918,20180821,00a99c50fdff7e36262caba33821875a,1,shipped,52990.0,22300.0
303,304,130500,52443,5855,200001,20170323,20170323,20170328,,20170418,20170329,02ea547b6d2ee25305588fd50df58b46,1,shipped,23990.0,15650.0
305,306,128038,41776,4841,200006,20170428,20170428,20170511,,20170524,20170505,02ec4da9d03014f06d711d60eb37cc22,1,shipped,85000.0,20080.0
307,308,139498,40800,4780,200015,20171127,20171128,20171129,,20171211,20171204,02f30be57375dc610c40ca989d2385b7,1,shipped,69000.0,11730.0
323,324,160991,59900,5904,200060,20180615,20180616,20180618,,20180713,20180620,03138c298bfe1ee7855eefc9442346a8,1,shipped,40000.0,19470.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112328,112329,174396,41248,5842,297136,20180329,20180329,20180402,,20180507,20180404,fceed72ef6fde5b43c8bcefbdb223edc,1,shipped,139000.0,54720.0
112459,112460,175147,38776,4509,297679,20171126,20171126,20171130,,20171219,20171204,fe4979a6778e2fa4ae731cbe532d30e0,1,shipped,89990.0,15390.0
112474,112475,155953,38108,4533,297741,20180328,20180328,20180328,,20180419,20180403,fe68d8f20f2d6e9d702a74d3bfa9c4fa,1,shipped,199000.0,19270.0
112531,112532,127630,54972,5000,297935,20170925,20170925,20170927,,20171024,20170929,feebdfdb4759bd4600d52547c81b4fa8,1,shipped,24900.0,21150.0


In [47]:
fct_order_item.loc[~fct_order_item.pickup_date.isna() & (fct_order_item.delivered_date.isna()) & (fct_order_item.order_item_status == "canceled")]

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
2284,2285,192162,36208,3893,206866,20180212,20180212,20180214,,20180313,20180216,14904245b311483b6886be3c2e717e76,1,canceled,55000.0,17670.0
3327,3328,167908,42404,5474,210515,20171221,20171223,20180130,,20180220,20180129,1def7af24cb7182497acfd953f6dd88b,1,canceled,244690.0,54310.0
4848,4849,168130,56446,6006,215814,20180206,20180206,20180216,,20180312,20180212,2c0032906c603dcc21cd954828fa77d0,1,canceled,213750.0,36820.0
4945,4946,144115,37462,4228,216207,20180219,20180219,20180220,,20180315,20180223,2cfc79d9582e9135c0a9b61fa60e6b21,1,canceled,59900.0,19660.0
5031,5032,168110,36597,3486,216512,20180202,20180202,20180206,,20180220,20180208,2db84a082bda455eb1da10727e7ca621,1,canceled,12000.0,7780.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105631,105632,168123,60838,4644,273032,20180123,20180123,20180124,,20180222,20180129,bef875448d5a4f703555d2dd5f8382b5,1,canceled,139900.0,19220.0
107032,107033,119911,34445,4138,277881,20180130,20180130,20180201,,20180222,20180205,cb599c234fc71be7c137179b6d473a30,1,canceled,35900.0,15100.0
107973,107974,175112,57880,5655,281277,20180205,20180205,20180206,,20180221,20180211,d3afe40f7060b3287d948a6bdea42687,1,canceled,26000.0,7780.0
108785,108786,192145,34192,3568,284303,20180124,20180124,20180126,,20180305,20180130,dba741d70126a95feb85435175370edc,1,canceled,69900.0,16930.0


In [48]:
fct_order_item.loc[~fct_order_item.pickup_date.isna() & (fct_order_item.delivered_date.isna()) & (fct_order_item.order_item_status != "canceled") & (fct_order_item.order_item_status != "shipped")]

# Picked up, no delivered_date, and status neither "canceled" nor "shipped" (the remaining 7 records):
# delivered but the delivered date was never recorded -> estimated_time_delivery might be used instead.

Unnamed: 0,id,user_id,product_id_surr,seller_id_surr,feedback_id_surr,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,pickup_limit_date,order_id,item_number,order_item_status,price,shipping_cost
18774,18775,139153,34723,3234,265289,20180608,20180608,20180612,,20180626,20180618,ab7c89dc1bf4a1ead9d6ec1ec8968a84,1,delivered,110990.0,9130.0
25376,25377,149683,35917,6056,288587,20180701,20180701,20180703,,20180730,20180705,e69f75a717d64fc5ecdfae42b2e8e086,1,delivered,139000.0,19070.0
33036,33037,121902,62606,4780,216254,20171128,20171128,20171130,,20171218,20171204,2d1e2d5bf4dc7227b3bfebb81328c15f,1,delivered,117300.0,17530.0
33222,33223,122713,35917,6056,216924,20180701,20180701,20180703,,20180730,20180705,2ebdfc4f15f23b91474edf87475f108e,1,delivered,139000.0,19070.0
55147,55148,100477,53187,5478,294373,20180620,20180620,20180625,,20180716,20180626,f5dd62b788049ad9fc0526e3ad11a097,1,delivered,329000.0,25240.0
57735,57736,117970,58417,5177,204051,20180701,20180701,20180703,,20180724,20180705,0d3268bad9b086af767785e3f0fc0133,1,delivered,188990.0,15630.0
88187,88188,146229,63530,4904,211657,20180627,20180627,20180703,,20180719,20180703,20edc82cf5400ce95e1afacc25798b31,1,delivered,45900.0,9070.0


### Check reference

In [49]:
# Check the reference from fct_order_items to the user dimension.
# validate="many_to_one" raises if user_id is duplicated in the dimension, and the
# assertion fails if the inner join dropped any order item (orphan user_id).
merged = pd.merge(fct_order_item, user, how="inner", on="user_id", validate="many_to_one")
assert len(merged) == len(fct_order_item), "fct_order_items has rows without a matching dim_user record"
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   order_date               112650 non-null  object 
 6   order_approved_date      112635 non-null  object 
 7   pickup_date              111456 non-null  object 
 8   delivered_date           110196 non-null  object 
 9   estimated_time_delivery  112650 non-null  object 
 10  pickup_limit_date        112650 non-null  object 
 11  order_id                 112650 non-null  object 
 12  item_number              112650 non-null  int64  
 13  order_item_status        112650 non-null  object 
 14  pric

In [50]:
# Extend the reference check to the feedback dimension. Duplicate feedback_id_surr
# values raise via validate=, and rows dropped by the inner join trip the assertion.
# NOTE: this cell overwrites `merged`; re-run from the cell that first builds it.
rows_before = len(merged)
merged = pd.merge(merged, feedback, how="inner", on="feedback_id_surr", validate="many_to_one")
assert len(merged) == rows_before, "fct_order_items has rows without a matching dim_feedback record"
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       112650 non-null  int64  
 1   user_id                  112650 non-null  int64  
 2   product_id_surr          112650 non-null  int64  
 3   seller_id_surr           112650 non-null  int64  
 4   feedback_id_surr         112650 non-null  int64  
 5   order_date               112650 non-null  object 
 6   order_approved_date      112635 non-null  object 
 7   pickup_date              111456 non-null  object 
 8   delivered_date           110196 non-null  object 
 9   estimated_time_delivery  112650 non-null  object 
 10  pickup_limit_date        112650 non-null  object 
 11  order_id_x               112650 non-null  object 
 12  item_number              112650 non-null  int64  
 13  order_item_status        112650 non-null  object 
 14  pric

In [51]:
# Extend the reference check to the product dimension. Duplicate product_id_surr
# values raise via validate=, and rows dropped by the inner join trip the assertion.
# NOTE: this cell overwrites `merged`; re-run from the cell that first builds it.
rows_before = len(merged)
merged = pd.merge(merged, product, how="inner", on="product_id_surr", validate="many_to_one")
assert len(merged) == rows_before, "fct_order_items has rows without a matching dim_product record"
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 35 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          112650 non-null  int64  
 1   user_id                     112650 non-null  int64  
 2   product_id_surr             112650 non-null  int64  
 3   seller_id_surr              112650 non-null  int64  
 4   feedback_id_surr            112650 non-null  int64  
 5   order_date                  112650 non-null  object 
 6   order_approved_date         112635 non-null  object 
 7   pickup_date                 111456 non-null  object 
 8   delivered_date              110196 non-null  object 
 9   estimated_time_delivery     112650 non-null  object 
 10  pickup_limit_date           112650 non-null  object 
 11  order_id_x                  112650 non-null  object 
 12  item_number                 112650 non-null  int64  
 13  order_item_sta

In [53]:
# Extend the reference check to the seller dimension. Duplicate seller_id_surr
# values raise via validate=, and rows dropped by the inner join trip the assertion.
# NOTE: this cell overwrites `merged`; re-run from the cell that first builds it.
# NOTE(review): the recorded output shows 50 columns, but the previous merge shows
# 35 and dim_seller adds only 5 non-key columns — the output likely comes from an
# out-of-order re-run; re-execute from a fresh kernel.
# NOTE(review): seller.is_current_version overlaps with the column of the same name
# brought in by the product merge, so the default _x/_y suffixes would repeat the
# is_current_version_x/_y names that appear after the earlier merges — consider
# passing explicit suffixes.
rows_before = len(merged)
merged = pd.merge(merged, seller, how="inner", on="seller_id_surr", validate="many_to_one")
assert len(merged) == rows_before, "fct_order_items has rows without a matching dim_seller record"
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 50 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            112650 non-null  int64  
 1   user_id                       112650 non-null  int64  
 2   product_id_surr               112650 non-null  int64  
 3   seller_id_surr                112650 non-null  int64  
 4   feedback_id_surr              112650 non-null  int64  
 5   order_date                    112650 non-null  object 
 6   order_approved_date           112635 non-null  object 
 7   pickup_date                   111456 non-null  object 
 8   delivered_date                110196 non-null  object 
 9   estimated_time_delivery       112650 non-null  object 
 10  pickup_limit_date             112650 non-null  object 
 11  order_id_x                    112650 non-null  object 
 12  item_number                   112650 non-nul