# DBT Quality Checking
A notebook that checks the output of the dbt transform layer.


## Notebook Sanity Check

In [1]:
import numpy as np

In [2]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment.
# One verbose call is enough: a second load only repeats the same work, and
# verbose=True additionally warns when no .env file can be found.
load_dotenv(verbose=True)

# NOTE(review): by default load_dotenv() does not override variables that are
# already set in the environment, and USER / HOSTNAME are commonly pre-set by
# the shell. If the connection ends up using the OS login name instead of the
# database user, pass override=True above or rename these keys in .env.
DBNAME = os.getenv("DBNAME")
HOSTNAME = os.getenv("HOSTNAME")
USER = os.getenv("USER")
PASS = os.getenv("PASS")
STGDBNAME = os.getenv("STAGING_DBNAME")

In [3]:
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Percent-encode the credentials before placing them in the URL: characters
# such as "@", ":" or "/" in a password would otherwise be parsed as URL
# delimiters. str() keeps the previous f-string behaviour for unset values.
db_user = quote_plus(str(USER))
db_pass = quote_plus(str(PASS))

# Create an engine instance; pooled connections are recycled after 3600 seconds.
alchemyEngine = create_engine(
    f'postgresql+psycopg2://{db_user}:{db_pass}@{HOSTNAME}/{DBNAME}',
    pool_recycle=3600,
)

# Connect to PostgreSQL server. This connection is reused by every
# read_sql_table call in the notebook.
conn = alchemyEngine.connect()

## Date Dimension

In [4]:
# Read the whole warehouse.dim_date table into a DataFrame and preview it.
date = pd.read_sql_table(table_name="dim_date", con=conn, schema="warehouse")
date.head(n=5)

Unnamed: 0,date_id,date,day_name,day_of_week,day_of_month,day_of_quarter,day_of_year,week_of_month,week_of_year,month_actual,month_name,month_name_abbreviated,quarter,year,isweekend
0,20150101,2015-01-01,Thursday,4.0,1.0,1,1.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
1,20150102,2015-01-02,Friday,5.0,2.0,2,2.0,1,1.0,1.0,January,Jan,1.0,2015.0,False
2,20150103,2015-01-03,Saturday,6.0,3.0,3,3.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
3,20150104,2015-01-04,Sunday,7.0,4.0,4,4.0,1,1.0,1.0,January,Jan,1.0,2015.0,True
4,20150105,2015-01-05,Monday,1.0,5.0,5,5.0,1,2.0,1.0,January,Jan,1.0,2015.0,False


In [5]:
# Primary-key check: every date_id must appear exactly once.
date["date_id"].is_unique

True

In [6]:
# Dimensions of the date table as (rows, columns).
date.shape

(1801, 15)

In [7]:
# Checking date range.
# Use min()/max() of the column instead of the first and last rows: the table
# is read without an explicit ordering, so positional lookups are only correct
# when the database happens to return the rows sorted by date.
print("First date: ", date["date"].min())
print("Last date: ", date["date"].max())

First date:  2015-01-01 00:00:00
Last date:  2019-12-06 00:00:00


## Geo Dimension

In [8]:
# Read the whole warehouse.dim_geo table into a DataFrame and preview it.
geo = pd.read_sql_table(table_name="dim_geo", con=conn, schema="warehouse")
geo.head(n=5)

Unnamed: 0,zip_code,city,state
0,99990,KABUPATEN MINAHASA UTARA,SULAWESI UTARA
1,99980,KOTA LUBUKLINGGAU,SUMATERA SELATAN
2,99970,KABUPATEN BOJONEGORO,JAWA TIMUR
3,99965,KABUPATEN ACEH BESAR,ACEH
4,99960,KOTA PEKANBARU,RIAU


In [9]:
# Number of zip codes recorded for each (state, city) pair.
zip_by_state_city = geo.groupby(by=["state", "city"])["zip_code"]
groupby_state = zip_by_state_city.count()
groupby_state

state           city                     
ACEH            KABUPATEN ACEH BARAT         12
                KABUPATEN ACEH BARAT DAYA    12
                KABUPATEN ACEH BESAR         10
                KABUPATEN ACEH JAYA          10
                KABUPATEN ACEH SELATAN        7
                                             ..
SUMATERA UTARA  KOTA PADANGSIDIMPUAN          7
                KOTA PEMATANG SIANTAR        31
                KOTA SIBOLGA                 20
                KOTA TANJUNG BALAI            8
                KOTA TEBING TINGGI           22
Name: zip_code, Length: 514, dtype: int64

In [10]:
# Primary-key check: every zip_code must appear exactly once.
geo["zip_code"].is_unique

True

In [11]:
# Concern : Some zip code only consist of 4 digit
# NOTE(review): presumably a leading zero was dropped upstream - TODO confirm.
# .str.len() is the vectorised pandas accessor; unlike np.vectorize(len) it
# does not raise on missing values (their length is reported as NaN, which is
# never counted as "< 5").
arr = geo["zip_code"].str.len()
print("Number of zip code with less than 5 digit :", int((arr < 5).sum()))
print("Number of all address:", len(arr))

Number of zip code with less than 5 digit : 3906
Number of all address: 15070


## User Dimension

In [12]:
# Read the whole warehouse.dim_user table into a DataFrame and preview it.
user = pd.read_sql_table(table_name="dim_user", con=conn, schema="warehouse")
user.head(n=5)

Unnamed: 0,user_name,num_order,total_spending,version,is_current_version
0,0000366f3b9a7992bf8c76cfdf3221e2,1,141900.0,1,True
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1,27190.0,1,True
2,0000f46a3911fa3c0805444483337064,1,86220.0,1,True
3,0000f6ccb0745a6a4b88665a16c9f078,1,43620.0,1,True
4,0004aac84e0df4da2b147fca70cf8255,1,196890.0,1,True


In [13]:
# Only one load has been run so far, so no row should carry a version above 1.
# Collect the user names that do, then show every row belonging to them
# (an empty frame means the check passed).
username = user.loc[user["version"] > 1, "user_name"]
user[user["user_name"].isin(username)]

Unnamed: 0,user_name,num_order,total_spending,version,is_current_version


In [14]:
# Check uniqueness of the (user_name, version) pair.
# duplicated() compares the two columns directly. This avoids the slow
# row-wise apply and the ambiguity of concatenating the values without a
# separator (e.g. "ab" + "12" and "ab1" + "2" both give "ab12").
not user.duplicated(subset=["user_name", "version"]).any()

True

## Product Dimension

In [15]:
# Read the whole warehouse.dim_product table into a DataFrame and preview it.
product = pd.read_sql_table(table_name="dim_product", con=conn, schema="warehouse")
product.head(n=5)

Unnamed: 0,product_id,product_name_length,product_description_length,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,avg_product_feedback,version,is_current_version
0,00066f42aeeb9f3007548bb9d3f33c38,53.0,596.0,6.0,300.0,20.0,16.0,16.0,5.0,1,True
1,00088930e925c41fd95ebfe695fd2655,56.0,752.0,4.0,1225.0,55.0,10.0,26.0,4.0,1,True
2,0009406fd7479715e4bef61dd91f2462,50.0,266.0,2.0,300.0,45.0,15.0,35.0,1.0,1,True
3,000b8f95fcb9e0096488278317764d19,25.0,364.0,3.0,550.0,19.0,24.0,12.0,5.0,1,True
4,000d9be29b5207b54e86aa1b1ac54872,48.0,613.0,4.0,250.0,22.0,11.0,15.0,5.0,1,True


In [16]:
# Version check: collect the product ids that have any row with version > 1,
# then show every row belonging to them (an empty frame means none exist).
products_id = product.loc[product["version"] > 1, "product_id"]
product[product["product_id"].isin(products_id)]

Unnamed: 0,product_id,product_name_length,product_description_length,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,avg_product_feedback,version,is_current_version


In [17]:
# Check product shape: prints the (rows, columns) tuple of the product dimension.
print("Shape : ", product.shape)

Shape :  (32729, 11)


In [18]:
# Primary-key check: every product_id must appear exactly once.
product["product_id"].is_unique

True

## Seller Dimension

In [19]:
# Read the whole warehouse.dim_seller table into a DataFrame and preview it.
seller = pd.read_sql_table(table_name="dim_seller", con=conn, schema="warehouse")
seller.head(n=5)

Unnamed: 0,seller_id,seller_zip_code,num_order,total_revenue,version,is_current_version
0,0015a82c2db000af6aaaf3ae2ecb0532,9080,3,2685000.0,1,True
1,001cca7ae9ae17fb1caed9dfb1094831,29156,200,25080030.0,1,True
2,002100f778ceb8431b7a1020ff7ab48f,14405,51,1234500.0,1,True
3,003554e2dce176b5555353e4f3555ac8,74565,1,120000.0,1,True
4,004c9cd9d87a3c30c522c48c4fc07416,14940,158,19712710.0,1,True


In [20]:
# Version check: collect the seller ids that have any row with version > 1,
# then show every row belonging to them (an empty frame means none exist).
sellers_id = seller.loc[seller["version"] > 1, "seller_id"]
seller[seller["seller_id"].isin(sellers_id)]

Unnamed: 0,seller_id,seller_zip_code,num_order,total_revenue,version,is_current_version


In [21]:
# Check seller shape: prints the (rows, columns) tuple of the seller dimension.
print("Shape : ", seller.shape)

Shape :  (3053, 6)


In [22]:
# Primary-key check: every seller_id must appear exactly once.
seller["seller_id"].is_unique

True

## Payment Dimension

In [23]:
# Read the whole warehouse.dim_payment table into a DataFrame and preview it.
payment = pd.read_sql_table(table_name="dim_payment", con=conn, schema="warehouse")
payment.head(n=5)

Unnamed: 0,payment_id,payment_type,num_payment,total_payment_value,is_current_version
0,00010242fe8c5a6d1ba2dd792cb16214,credit_card,1,72190.0,True
1,00018f77f2f0320c557190d7a144bdd3,credit_card,1,259830.0,True
2,000229ec398224ef6ca0657da4fc703e,credit_card,1,216870.0,True
3,00024acbcdf0a6daa1e931b038114c75,credit_card,1,25780.0,True
4,00042b26cf59d7ce69dfabb4e55b4fd9,credit_card,1,218040.0,True


In [24]:
# Version check: with a single load, every row should be the current version.
# Collect the payment ids flagged as not current, then show every row that
# belongs to them (an empty frame means the check passed).
payment_error = payment.loc[payment["is_current_version"].eq(False), "payment_id"]
payment[payment["payment_id"].isin(payment_error)]

Unnamed: 0,payment_id,payment_type,num_payment,total_payment_value,is_current_version


In [25]:
# All payment_id == order_id (+ different method of payment), means the
# (payment_id, payment_type) pair should be unique.
# duplicated() compares the two columns directly instead of building a string
# per row with apply, which is much slower and raises on a null payment_id.
not payment.duplicated(subset=["payment_id", "payment_type"]).any()

True

## Feedback Dimension

In [26]:
# Read the whole warehouse.dim_feedback table into a DataFrame and preview it.
feedback = pd.read_sql_table(table_name="dim_feedback", con=conn, schema="warehouse")
feedback.head(n=5)

Unnamed: 0,feedback_id,avg_feedback_score,feedback_form_sent_date,feedback_answer_date,version,is_current_version
0,00010242fe8c5a6d1ba2dd792cb16214,5.0,20170921,20170922,1,True
1,00018f77f2f0320c557190d7a144bdd3,4.0,20170513,20170515,1,True
2,000229ec398224ef6ca0657da4fc703e,5.0,20180123,20180123,1,True
3,00024acbcdf0a6daa1e931b038114c75,4.0,20180815,20180815,1,True
4,00042b26cf59d7ce69dfabb4e55b4fd9,5.0,20170302,20170303,1,True


In [27]:
# Version check: with a single load, every row should be the current version.
# Collect the feedback ids flagged as not current, then show every row that
# belongs to them (an empty frame means the check passed).
feedback_error = feedback.loc[feedback["is_current_version"].eq(False), "feedback_id"]
feedback[feedback["feedback_id"].isin(feedback_error)]

Unnamed: 0,feedback_id,avg_feedback_score,feedback_form_sent_date,feedback_answer_date,version,is_current_version


In [28]:
# Primary-key check: every feedback_id must appear exactly once.
feedback["feedback_id"].is_unique

True

## Order Items Fact Table

In [29]:
# Read the whole warehouse.fct_order_item table into a DataFrame and preview it.
fct = pd.read_sql_table(table_name="fct_order_item", con=conn, schema="warehouse")
fct.head(n=5)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,feedback_id,payment_id,user_name,order_item_status,pickup_limit_date,order_date,order_approved_date,pickup_date,delivered_date,estimated_time_delivery,price,shipping_cost
0,00061f2a7bc09da83e415a52dc8a4af1,1,d63c1011f49d98b976c352955b1c4bea,cc419e0650a3c5ba77189a1882b7556a,00061f2a7bc09da83e415a52dc8a4af1,00061f2a7bc09da83e415a52dc8a4af1,107e6259485efac66428a56f10801f4f,delivered,20180329,20180324,20180324.0,20180327.0,20180329.0,20180409,59990.0,8880.0
1,0009c9a17f916a706d71784483a5d643,1,3f27ac8e699df3d300ec4a5d8c5cf0b2,fcb5ace8bcc92f75707dc0f01a27d269,0009c9a17f916a706d71784483a5d643,0009c9a17f916a706d71784483a5d643,6062db572f3ef38b7a8ff4307abbfad3,delivered,20180502,20180425,20180425.0,20180427.0,20180430.0,20180509,639000.0,11340.0
2,000e906b789b55f64edcb1f84030f90d,1,57d79905de06d8897872c551bfd09358,ea8482cd71df3c1969d7b9473ff13abc,000e906b789b55f64edcb1f84030f90d,000e906b789b55f64edcb1f84030f90d,3588484a539617d91500764822230fb6,delivered,20171127,20171121,20171121.0,20171122.0,20171209.0,20171207,21990.0,11850.0
3,000f25f4d72195062c040b12dce9a18a,1,1c05e0964302b6cf68ca0d15f326c6ba,7c67e1448b00f6e969d365cea6b010ab,000f25f4d72195062c040b12dce9a18a,000f25f4d72195062c040b12dce9a18a,1a6cbc34ea404cb0af7ed74df0999354,delivered,20180321,20180307,20180307.0,20180316.0,20180322.0,20180411,119990.0,44400.0
4,0014ae671de39511f7575066200733b7,1,23365beed316535b4105bd800c46670e,92eb0f42c21942b6552362b9b114707d,0014ae671de39511f7575066200733b7,0014ae671de39511f7575066200733b7,8e5a8d9363eb6296154b65750c8702ca,delivered,20170529,20170522,20170523.0,20170529.0,20170607.0,20170613,16500.0,14100.0


In [30]:
# Number of rows in the fact table.
len(fct)

112650

In [31]:
# Check that every product_id used in the fact table exists among the
# current-version rows of the product dimension.

all_products_id = fct.product_id.unique()
dim_id = product.loc[product.is_current_version == True].product_id.unique()
# One vectorised isin() pass replaces scanning dim_id once per fact key.
check = pd.Series(all_products_id).isin(dim_id).tolist()
if all(check):
    print("All product have reference to the product dimension")
else:
    print("Not all product have reference to the product dimension")
    # Number of distinct fact-table product ids with no match in the dimension.
    count = check.count(False)
    print("Missing :", count)

Not all product have reference to the product dimension
Missing : 222


In [32]:
# Check that every seller_id used in the fact table exists among the
# current-version rows of the seller dimension.

all_sellers_id = fct.seller_id.unique()
dim_id = seller.loc[seller.is_current_version == True].seller_id.unique()
# One vectorised isin() pass replaces scanning dim_id once per fact key.
check = pd.Series(all_sellers_id).isin(dim_id).tolist()
if all(check):
    # The messages previously named the product dimension (copy-paste slip).
    print("All seller have reference to the seller dimension")
else:
    print("Not all seller have reference to the seller dimension")
    # Number of distinct fact-table seller ids with no match in the dimension.
    count = check.count(False)
    print("Missing :", count)

Not all seller have reference to the product dimension
Missing : 42


In [None]:
# Check that every user_name used in the fact table (the user who bought the
# item) exists among the current-version rows of the user dimension.

all_username = fct.user_name.unique()
dim_id = user.loc[user.is_current_version == True].user_name.unique()
# One vectorised isin() pass replaces scanning dim_id once per fact key.
check = pd.Series(all_username).isin(dim_id).tolist()
if all(check):
    # The messages previously named the product dimension (copy-paste slip).
    print("All user have reference to the user dimension")
else:
    print("Not all user have reference to the user dimension")
    # Number of distinct fact-table user names with no match in the dimension.
    # The count is a plain int and is printed directly; the earlier
    # sum(count) raised TypeError because an int is not iterable.
    count = check.count(False)
    print("Missing :", count)

In [None]:
# Check that every order_id in the fact table has a matching payment_id among
# the current-version rows of the payment dimension.
all_order = fct.order_id.unique()
dim_id = payment.loc[payment.is_current_version == True].payment_id.unique()
# One vectorised isin() pass replaces scanning dim_id once per fact key.
check = pd.Series(all_order).isin(dim_id).tolist()
if all(check):
    # The messages previously named the product dimension (copy-paste slip).
    print("All payment order id have reference to the payment dimension")
else:
    print("Not all payment order id have reference to the payment dimension")
    # Number of distinct order ids with no match in the payment dimension.
    # The count is a plain int and is printed directly; the earlier
    # sum(count) raised TypeError because an int is not iterable.
    count = check.count(False)
    print("Missing :", count)

In [None]:
# Check that every order_id in the fact table has a matching feedback_id among
# the current-version rows of the feedback dimension.
all_order = fct.order_id.unique()
dim_id = feedback.loc[feedback.is_current_version == True].feedback_id.unique()
# One vectorised isin() pass replaces scanning dim_id once per fact key.
check = pd.Series(all_order).isin(dim_id).tolist()
if all(check):
    # The messages previously named the product dimension (copy-paste slip).
    print("All feedback order id have reference to the feedback dimension")
else:
    print("Not all feedback order id have reference to the feedback dimension")
    # Number of distinct order ids with no match in the feedback dimension.
    # The count is a plain int and is printed directly; the earlier
    # sum(count) raised TypeError because an int is not iterable.
    count = check.count(False)
    print("Missing :", count)

In [None]:
# Making sure the primary key of the fact table, the
# (order_id, order_item_id) pair, is unique.
# duplicated() compares the two columns directly instead of building a string
# per row with apply, which is much slower and raises on a null order_id.
not fct.duplicated(subset=["order_id", "order_item_id"]).any()