## Table List in Dataset

<img src="images/field_list_sample_snowflake.png">

## Import Library

In [1]:
# Import Library
import pandas as pd
import os

pd.options.mode.chained_assignment = None
pd.options.display.float_format = '{:.2f}'.format

## Read Data

In [2]:
# Read raw data from csv
data_dir = "data"# Add path to folder contained dataset file
customer_df = pd.read_csv(f"{data_dir}/CUSTOMER.csv") 
lineitem_df = pd.read_csv(f"{data_dir}/LINEITEM.csv")
nation_df = pd.read_csv(f"{data_dir}/NATION.csv")
order_df = pd.read_csv(f"{data_dir}/ORDER.csv")
part_df = pd.read_csv(f"{data_dir}/PART.csv")
partsupp_df = pd.read_csv(f"{data_dir}/PARTSUPP.csv")
region_df = pd.read_csv(f"{data_dir}/REGION.csv")
supplier_df = pd.read_csv(f"{data_dir}/SUPPLIER.csv")

## Data Exploration

#### Lineitem

`DataFrame.info()`
> A method to prints a summary of the DataFrame including:
> - Indexes
> - Columns
> - Non-null values
> - Data types 
> - Memory usage

In [3]:
# Display a summary info of Lineitem.
lineitem_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1101003 entries, 0 to 1101002
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   L_ORDERKEY       1101003 non-null  int64  
 1   L_PARTKEY        1101003 non-null  int64  
 2   L_SUPPKEY        1101003 non-null  int64  
 3   L_LINENUMBER     1040966 non-null  float64
 4   L_QUANTITY       1041041 non-null  float64
 5   L_EXTENDEDPRICE  1101003 non-null  float64
 6   L_DISCOUNT       1101003 non-null  float64
 7   L_TAX            1101003 non-null  float64
 8   L_RETURNFLAG     1101003 non-null  object 
 9   L_LINESTATUS     1040971 non-null  object 
 10  L_SHIPDATE       1101003 non-null  object 
 11  L_COMMITDATE     1101003 non-null  object 
 12  L_RECEIPTDATE    1101003 non-null  object 
 13  L_SHIPINSTRUCT   1101003 non-null  object 
 14  L_SHIPMODE       1041013 non-null  object 
 15  L_COMMENT        0 non-null        float64
dtypes: float64(6), int

`DataFrame.describe()`
> A method to display descriptive statistics (such as `count`, `mean`, `std`, `min`, `max`, `percentiles` for numeric data) of the DataFrame.

`DataFrame.apply(func, axis=0)` 
> A method to apply a function along an axis (either `axis=0`: DataFrame’s index (default) or `axis=1`: the DataFrame’s columns) of the DataFrame.

In [4]:
# Display the descriptive statistics of Lineitem
lineitem_df.describe()

Unnamed: 0,L_ORDERKEY,L_PARTKEY,L_SUPPKEY,L_LINENUMBER,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT,L_TAX,L_COMMENT
count,1101003.0,1101003.0,1101003.0,1040966.0,1041041.0,1101003.0,1101003.0,1101003.0,0.0
mean,2409702.13,100026.85,5004.23,2.85,24.24,38274.48,0.05,0.04,
std,1750970.18,57750.0,2888.58,1.92,16.03,23292.35,0.03,0.03,
min,600001.0,1.0,1.0,-7.0,-50.0,905.0,0.0,0.0,
25%,862338.0,50024.5,2503.0,1.0,12.0,18785.25,0.02,0.02,
50%,1124898.0,99939.0,5006.0,3.0,25.0,36718.08,0.05,0.04,
75%,4386467.0,150109.0,7508.0,4.0,38.0,55201.0,0.08,0.06,
max,4648738.0,200000.0,10000.0,7.7,54.92,104899.5,0.1,0.08,


`DataFrame.head(n=5)`
> A method to return the first n rows (default=5) for the object based on position.

In [5]:
# Display sample of Lineitem.
lineitem_df.head()

Unnamed: 0,L_ORDERKEY,L_PARTKEY,L_SUPPKEY,L_LINENUMBER,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT,L_TAX,L_RETURNFLAG,L_LINESTATUS,L_SHIPDATE,L_COMMITDATE,L_RECEIPTDATE,L_SHIPINSTRUCT,L_SHIPMODE,L_COMMENT
0,600001,112520,7543,1.0,27.0,41378.04,0.1,0.0,N,O,1995-11-28,1995-12-06,1995-12-06,TAKE BACK RETURN,AIR,
1,600001,114063,1597,2.0,8.0,8616.48,0.06,0.0,N,O,1996-02-10,1995-12-13,1996-02-29,NONE,FOB,
2,600001,46739,4252,3.0,20.0,33714.6,0.02,0.05,N,O,1995-12-09,1996-01-10,1995-12-16,TAKE BACK RETURN,REG AIR,
3,600002,79540,7062,1.0,30.0,45586.2,0.09,0.01,R,F,1992-09-20,1992-09-29,1992-10-15,NONE,AIR,
4,600002,24776,4777,2.0,8.0,13606.16,0.01,0.05,A,F,1992-10-01,1992-09-01,1992-10-05,COLLECT COD,AIR,


#### Order

In [6]:
# Display a summary info of Order.
order_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1101003 entries, 0 to 1101002
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   O_ORDERKEY       1101003 non-null  int64  
 1   O_CUSTKEY        1101003 non-null  int64  
 2   O_ORDERSTATUS    1041071 non-null  object 
 3   O_TOTALPRICE     1101003 non-null  float64
 4   O_ORDERDATE      1040885 non-null  object 
 5   O_ORDERPRIORITY  1040964 non-null  object 
 6   O_CLERK          1101003 non-null  object 
 7   O_SHIPPRIORITY   1101003 non-null  int64  
 8   O_COMMENT        1101003 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 75.6+ MB


In [7]:
# Display the descriptive statistics of Order
order_df.describe()

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_TOTALPRICE,O_SHIPPRIORITY
count,1101003.0,1101003.0,1101003.0,1101003.0
mean,3386487.56,75061.23,151246.98,0.0
std,1350322.8,43310.85,88633.1,0.0
min,1200001.0,1.0,857.71,0.0
25%,2248385.5,37558.0,77939.32,0.0
50%,3302247.0,75088.0,144423.4,0.0
75%,4350945.5,112609.0,215546.05,0.0
max,6000000.0,149999.0,555285.16,0.0


In [8]:
# Display sample of Order
order_df.head()

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
0,4200001,13726,F,99406.41,1994-02-21,,Clerk#000000128,0,comment
1,4200002,129376,O,256838.41,1997-04-14,4-NOT SPECIFIED,Clerk#000000281,0,comment
2,4200003,141613,O,150849.49,1997-11-24,4-NOT SPECIFIED,Clerk#000000585,0,comment
3,4200004,23515,O,178688.27,1996-12-09,,Clerk#000000632,0,comment
4,4200005,97687,O,261742.31,,2-HIGH,Clerk#000000562,0,comment


#### Customer & Supplier

In [9]:
# Explore data for other table
print("Customer")
customer_df.info()

print("\nSupplier")
supplier_df.info()

Customer
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   C_CUSTKEY     150000 non-null  int64  
 1   C_NAME        150000 non-null  object 
 2   C_ADDRESS     150000 non-null  object 
 3   C_NATIONKEY   150000 non-null  int64  
 4   C_PHONE       150000 non-null  object 
 5   C_ACCTBAL     150000 non-null  float64
 6   C_MKTSEGMENT  150000 non-null  object 
 7   C_COMMENT     0 non-null       float64
dtypes: float64(2), int64(2), object(4)
memory usage: 9.2+ MB

Supplier
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   S_SUPPKEY    10000 non-null  int64  
 1   S_NAME       10000 non-null  object 
 2   S_ADDRESS    10000 non-null  object 
 3   S_NATIONKEY  10000 non-null  int64  
 4   S_PHO

In [10]:
customer_df.describe()

Unnamed: 0,C_CUSTKEY,C_NATIONKEY,C_ACCTBAL,C_COMMENT
count,150000.0,150000.0,150000.0,0.0
mean,75000.5,12.01,4495.51,
std,43301.41,7.21,3174.32,
min,1.0,0.0,-999.99,
25%,37500.75,6.0,1757.62,
50%,75000.5,12.0,4477.3,
75%,112500.25,18.0,7246.32,
max,150000.0,24.0,9999.99,


In [11]:
supplier_df.describe()

Unnamed: 0,S_SUPPKEY,S_NATIONKEY,S_ACCTBAL
count,10000.0,10000.0,10000.0
mean,5000.5,11.94,4510.35
std,2886.9,7.24,3168.08
min,1.0,0.0,-998.22
25%,2500.75,6.0,1770.72
50%,5000.5,12.0,4541.07
75%,7500.25,18.0,7270.64
max,10000.0,24.0,9999.72


In [12]:
customer_df.head()

Unnamed: 0,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
0,60001,Customer#000060001,9Ii4zQn9cX,14,24-678-784-9652,9957.56,HOUSEHOLD,
1,60002,Customer#000060002,ThGBMjDwKzkoOxhz,15,25-782-500-8435,742.46,BUILDING,
2,60003,Customer#000060003,Ed hbPtTXMTAsgGhCr4HuTzKMd2,16,26-859-847-7640,2526.92,BUILDING,
3,60004,Customer#000060004,NivCT2RVaavlyUnKwBjDyMvB42WayXCnky,10,20-573-674-7999,7975.22,AUTOMOBILE,
4,60005,Customer#000060005,1F3KM3ccEXEtI B22XmCMOWJMl,12,22-741-208-1316,2504.74,MACHINERY,


In [13]:
supplier_df.head()

Unnamed: 0,S_SUPPKEY,S_NAME,S_ADDRESS,S_NATIONKEY,S_PHONE,S_ACCTBAL,S_COMMENT
0,1,Supplier#000000001,"N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ",17,27-918-335-1736,5755.94,each slyly above the careful
1,2,Supplier#000000002,"89eJ5ksX3ImxJQBvxObC,",5,15-679-861-2259,4032.68,slyly bold instructions. idle dependen
2,3,Supplier#000000003,"q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3",1,11-383-516-1199,4192.4,blithely silent requests after the express dep...
3,4,Supplier#000000004,Bk7ah4CK8SYQTepEmvMkkgMwg,15,25-843-787-7479,4641.08,riously even requests above the exp
4,5,Supplier#000000005,Gcdm2rJRzl5qlTVzc,11,21-151-690-3663,-283.84,. slyly regular pinto bea


#### Entity Relationship

<img src="images/dataset_entity_relationship.png">

## Data Cleansing

#### Business logic for LineItem table's field
- Shouldn't be any duplicate row (reference by value in all column) >> Move duplicate column cleansing activity to top
- L_QUANTITY should have value more than 0, not be decimal number, not be negative value and should not contain NaN value
- L_LINENUMBER should have value more than 0, not be decimal number, not be negative value and not contain NaN value
- L_LINESTATUS should contain only "F", "O" status and should not contain NaN value >> *To be check, didn't cleaned in this lab*
- L_SHIPMODE should not contain NaN value and only value list below allowed >> *To be check, didn't cleaned in this lab*
    - TRUCK
    - AIR
    - MAIL
    - SHIP
    - RAIL
    - FOB
    - REG AIR

#### Lineitem

In [14]:
# Show duplicate record
lineitem_df[lineitem_df.duplicated(keep=False)].sort_values(lineitem_df.columns.tolist())

Unnamed: 0,L_ORDERKEY,L_PARTKEY,L_SUPPKEY,L_LINENUMBER,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT,L_TAX,L_RETURNFLAG,L_LINESTATUS,L_SHIPDATE,L_COMMITDATE,L_RECEIPTDATE,L_SHIPINSTRUCT,L_SHIPMODE,L_COMMENT
69,600065,20667,3170,,27.00,42866.82,0.01,0.02,R,F,1993-12-06,1993-10-24,1993-12-25,DELIVER IN PERSON,TRUCK,
70,600065,20667,3170,,27.00,42866.82,0.01,0.02,R,F,1993-12-06,1993-10-24,1993-12-25,DELIVER IN PERSON,TRUCK,
86,600071,158959,3990,2.00,37.00,74664.15,0.06,0.07,R,F,1992-09-19,1992-08-29,1992-10-16,TAKE BACK RETURN,FOFOB,
87,600071,158959,3990,2.00,37.00,74664.15,0.06,0.07,R,F,1992-09-19,1992-08-29,1992-10-16,TAKE BACK RETURN,FOFOB,
129,600128,149878,4907,2.00,-39.00,75186.93,0.09,0.07,N,O,1996-03-27,1996-02-03,1996-04-25,DELIVER IN PERSON,RAIL,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1100952,4648677,48705,3714,7.00,37.00,61186.90,0.08,0.05,A,F,1992-09-18,1992-10-03,1992-09-24,NONE,FOB,
1100996,4648737,8444,5945,3.00,10.00,13524.40,0.05,0.02,R,F,1993-01-25,1993-01-11,1993-02-13,TAKE BACK RETURN,,
1100997,4648737,8444,5945,3.00,10.00,13524.40,0.05,0.02,R,F,1993-01-25,1993-01-11,1993-02-13,TAKE BACK RETURN,,
1100993,4648737,9990,7491,1.00,43.00,81699.57,0.07,0.04,A,,1993-01-17,1993-01-23,1993-01-24,TAKE BACK RETURN,TRUCK,


In [15]:
# Remove duplicate records
lineitem_df = lineitem_df.drop_duplicates()

In [16]:
# Drop records that contains nan
lineitem_df = lineitem_df.dropna(subset=['L_LINENUMBER','L_QUANTITY','L_LINESTATUS','L_SHIPMODE'])

# There are some other method to deal with na by filling the records with other value
#lineitem_df['L_LINENUMBER'] = lineitem_df['L_LINENUMBER'].ffill()
#lineitem_df['L_QUANTITY'] = lineitem_df['L_QUANTITY'].bfill()
#lineitem_df['L_LINESTATUS'] = lineitem_df['L_LINESTATUS'].fillna(lineitem_df['L_LINESTATUS'].mode())
#lineitem_df['L_SHIPMODE'] = lineitem_df['L_SHIPMODE'].fillna(lineitem_df['L_SHIPMODE'].mode())

In [17]:
# Convert decimal to integer (cut decimal) and inverse negative
lineitem_df['L_LINENUMBER'] = lineitem_df['L_LINENUMBER'].astype(int) 
lineitem_df['L_LINENUMBER'] = lineitem_df['L_LINENUMBER'].abs() # For more clarification ask business for clarification - abs, outlier, drop

In [18]:
# LineNumber should not contain 0
lineitem_df = lineitem_df[lineitem_df['L_LINENUMBER']!=0].copy()

In [19]:
# Convert decimal to integer (cut decimal) and inverse negative
lineitem_df['L_QUANTITY'] = lineitem_df['L_QUANTITY'].astype(int)
lineitem_df['L_QUANTITY'] = lineitem_df['L_QUANTITY'].abs() # For more clarification ask business for clarification - abs, outlier, drop

In [20]:
# Quantity should have value more than 0
lineitem_df = lineitem_df[lineitem_df['L_QUANTITY']!=0].copy()

In [21]:
# Show number of record group by each value
lineitem_df['L_LINESTATUS'].value_counts()

L_LINESTATUS
O     398144
F     397370
QF       238
PO       228
FO       225
       ...  
iF       175
LO       174
VO       174
YF       170
tF       167
Name: count, Length: 106, dtype: int64

In [22]:
# Replace out-of-scope row with correct value for LINESTATUS column
lineitem_df.loc[(lineitem_df['L_LINESTATUS'].str.contains('F')) & ~(lineitem_df['L_LINESTATUS'].str.contains('O')), 'L_LINESTATUS'] = 'F'
lineitem_df.loc[(lineitem_df['L_LINESTATUS'].str.contains('O')) & ~(lineitem_df['L_LINESTATUS'].str.contains('F')), 'L_LINESTATUS'] = 'O'

In [23]:
# Remove any leftover out-of-scope value for LINESTATUS column
lineitem_df = lineitem_df[lineitem_df['L_LINESTATUS'].isin(['F','O'])]

In [24]:
# Show number of record group by each value
lineitem_df['L_SHIPMODE'].value_counts()

L_SHIPMODE
AIR         111416
RAIL        111035
TRUCK       110915
SHIP        110593
REG AIR     110549
             ...  
REG AIVR         3
RvEG AIR         3
REG nAIR         2
RyEG AIR         2
NREG AIR         2
Name: count, Length: 1572, dtype: int64

In [25]:
# Remove out-of-scope row for LINESTATUS column
lineitem_df = lineitem_df[lineitem_df['L_SHIPMODE'].isin(['TRUCK', 'AIR', 'MAIL', 'SHIP', 'RAIL', 'FOB', 'REG AIR'])]

#### Business logic for Order table's field
- Shouldn't be any duplicate row (reference by value in all column)
- O_ORDERSTATUS should contain only "F", "O" and "P" status and not contain NaN value
- O_ORDERDATE valid order year should in between 1992-1998 and should not contain NaN value
- O_ORDERPRIORITY should not contain NaN value and only value list below allowed
    
    - 1-URGENT
    - 2-HIGH
    - 3-MEDIUM
    - 4-LOW
    - 5-NOT SPECIFIED 


#### Order

In [74]:
order_df = pd.read_csv(f"{data_dir}/ORDER.csv")

In [75]:
# Show duplicate records
order_df[order_df.duplicated(keep=False)].sort_values(order_df.columns.tolist())

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
157599,1200069,127766,O,310546.13,1996-08-01,3-MEDIUM,Clerk#000000756,0,comment
157600,1200069,127766,O,310546.13,1996-08-01,3-MEDIUM,Clerk#000000756,0,comment
157616,1200133,69032,F,9702.25,1992-01-11,3-MEDIUM,Clerk#000000265,0,comment
157617,1200133,69032,F,9702.25,1992-01-11,3-MEDIUM,Clerk#000000265,0,comment
157618,1200134,113818,F,118312.16,1992-09-06,4-NOT SPECIFIED,Clerk#000000933,0,comment
...,...,...,...,...,...,...,...,...,...
787382,5999936,60281,F,3696.95,1992-09-28,5-LOW,Clerk#000000999,0,comment
787384,5999938,42247,O,232300.89,1997-01-27,3-MEDIUM,Clerk#000000100,0,comment
787385,5999938,42247,O,232300.89,1997-01-27,3-MEDIUM,Clerk#000000100,0,comment
787388,5999941,32512,,102961.29,1998-03-26,5-LOW,Clerk#000000927,0,comment


In [76]:
# Remove duplicate records
order_df = order_df.drop_duplicates()

In [77]:
order_df

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
0,4200001,13726,F,99406.41,1994-02-21,,Clerk#000000128,0,comment
1,4200002,129376,O,256838.41,1997-04-14,4-NOT SPECIFIED,Clerk#000000281,0,comment
2,4200003,141613,O,150849.49,1997-11-24,4-NOT SPECIFIED,Clerk#000000585,0,comment
3,4200004,23515,O,178688.27,1996-12-09,,Clerk#000000632,0,comment
4,4200005,97687,O,261742.31,,2-HIGH,Clerk#000000562,0,comment
...,...,...,...,...,...,...,...,...,...
1100998,2394275,109685,O,265359.33,1997-03-21,4-NOTM SPECIFIED,Clerk#000000700,0,comment
1100999,2394276,114304,F,236761.91,1957-08-09,2-HIGH,Clerk#000000992,0,comment
1101000,2394277,33689,F,264339.36,1994-06-08,2-HIGH,Clerk#000000439,0,comment
1101001,2394278,110135,F,105510.85,1992-02-28,5-LOW,Clerk#000000446,0,comment


In [78]:
order_df_col = order_df.columns

In [79]:
# Drop nan records
order_df = order_df.dropna(subset=['O_ORDERSTATUS','O_ORDERDATE','O_ORDERPRIORITY'])

In [80]:
# Show number of record group by each value
order_df['O_ORDERPRIORITY'].value_counts()

O_ORDERPRIORITY
4-NOT SPECIFIED     169838
5-LOW               169728
1-URGENT            169668
2-HIGH              169529
3-MEDIUM            169026
                     ...  
4-NOT TSPECIFIED         1
4-NObT SPECIFIED         1
4-NOT SBPECIFIED         1
4Z-NOT SPECIFIED         1
4-NOGT SPECIFIED         1
Name: count, Length: 2199, dtype: int64

In [87]:
order_df.loc[~(order_df['O_ORDERPRIORITY'].isin(['1-URGENT', '2-HIGH', '3-MEDIUM', '4-LOW', '5-NOT SPECIFIED']))]['O_ORDERPRIORITY'].value_counts()

Series([], Name: count, dtype: int64)

In [86]:
# Fix records with typo
# Final value: 1-URGENT, 2-HIGH, 3-MEDIUM, 4-LOW, 5-NOT SPECIFIED
# Replace out-of-scope row with correct value for LINESTATUS column
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('1')), 'O_ORDERPRIORITY'] = '1-URGENT'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('2')), 'O_ORDERPRIORITY'] = '2-HIGH'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('3')), 'O_ORDERPRIORITY'] = '3-MEDIUM'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('4')), 'O_ORDERPRIORITY'] = '5-NOT SPECIFIED'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('5')), 'O_ORDERPRIORITY'] = '4-LOW'

order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('-LOW')), 'O_ORDERPRIORITY'] = '1-URGENT'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('-HIGH')), 'O_ORDERPRIORITY'] = '2-HIGH'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('-URGENT')), 'O_ORDERPRIORITY'] = '3-MEDIUM'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('-MEDIUM')), 'O_ORDERPRIORITY'] = '5-NOT SPECIFIED'
order_df.loc[(order_df['O_ORDERPRIORITY'].str.contains('-NOT SPECIFIED')), 'O_ORDERPRIORITY'] = '4-LOW'

In [88]:
# Remove any leftover records with typo
########################################################################################
order_df = order_df[order_df['O_ORDERPRIORITY'].isin(['1-URGENT', '2-HIGH', '3-MEDIUM', '4-LOW', '5-NOT SPECIFIED'])]


In [90]:
# Show number of record group by each value
order_df['O_ORDERSTATUS'].value_counts()

O_ORDERSTATUS
O     424904
F     422701
P      22269
cF       255
fO       247
       ...  
UP         7
zP         6
gP         6
sP         5
YP         5
Name: count, Length: 159, dtype: int64

In [91]:
# Fix records with typo
# Final value: F, O, P
order_df.loc[(order_df['O_ORDERSTATUS'].str.contains('F')) & ~(order_df['O_ORDERSTATUS'].str.contains('O|P')), 'O_ORDERSTATUS'] = 'F'
order_df.loc[(order_df['O_ORDERSTATUS'].str.contains('O')) & ~(order_df['O_ORDERSTATUS'].str.contains('F|P')), 'O_ORDERSTATUS'] = 'O'
order_df.loc[(order_df['O_ORDERSTATUS'].str.contains('P')) & ~(order_df['O_ORDERSTATUS'].str.contains('F|O')), 'O_ORDERSTATUS'] = 'P'

In [92]:
# Remove any leftover records with typo
order_df = order_df[order_df['O_ORDERSTATUS'].isin(['F','O','P'])]

In [93]:
# Extract day, month and year from ORDERDATE (datetime format)
order_df['O_ORDERDATE'] = pd.to_datetime(order_df['O_ORDERDATE'])

order_df['O_ORDERYEAR'] = order_df['O_ORDERDATE'].dt.year
order_df['O_ORDERDAY'] = order_df['O_ORDERDATE'].dt.day
order_df['O_ORDERMONTH'] = order_df['O_ORDERDATE'].dt.month

In [94]:
# Show number of record group by year
order_df['O_ORDERYEAR'].value_counts()

O_ORDERYEAR
1996    129416
1997    129021
1995    128975
1993    128594
1994    128215
         ...  
2047       251
2022       184
1942       154
1968        92
2048        82
Name: count, Length: 61, dtype: int64

In [95]:
# Remove invalid order year
order_df = order_df[order_df['O_ORDERYEAR'].between(1992, 1998)]

#order_df = order_df[order_df['O_ORDERYEAR'].ge(1992) & order_df['O_ORDERYEAR'].le(1998)]
#order_df = order_df[(order_df['O_ORDERYEAR'] >= 1992) & (order_df['O_ORDERYEAR'] <= 1998)]

## Data Validation

#### LineItem

In [96]:
 # Show dataframe information
lineitem_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 775160 entries, 0 to 1101001
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   L_ORDERKEY       775160 non-null  int64  
 1   L_PARTKEY        775160 non-null  int64  
 2   L_SUPPKEY        775160 non-null  int64  
 3   L_LINENUMBER     775160 non-null  int64  
 4   L_QUANTITY       775160 non-null  int64  
 5   L_EXTENDEDPRICE  775160 non-null  float64
 6   L_DISCOUNT       775160 non-null  float64
 7   L_TAX            775160 non-null  float64
 8   L_RETURNFLAG     775160 non-null  object 
 9   L_LINESTATUS     775160 non-null  object 
 10  L_SHIPDATE       775160 non-null  object 
 11  L_COMMITDATE     775160 non-null  object 
 12  L_RECEIPTDATE    775160 non-null  object 
 13  L_SHIPINSTRUCT   775160 non-null  object 
 14  L_SHIPMODE       775160 non-null  object 
 15  L_COMMENT        0 non-null       float64
dtypes: float64(4), int64(5), object(7)
memory 

In [97]:
# Check duplicate row
lineitem_duplicate_check = lineitem_df.duplicated().sum()

print("LineItem dataset contain " + str(lineitem_duplicate_check) + " duplicate rows")

LineItem dataset contain 0 duplicate rows


In [98]:
# Check NaN value in specific column
quantity_nan_check = lineitem_df['L_QUANTITY'].isnull().sum()
linenumber_nan_check = lineitem_df['L_LINENUMBER'].isnull().sum()
linestatus_nan_check = lineitem_df['L_LINESTATUS'].isnull().sum()
shipmode_nan_check = lineitem_df['L_SHIPMODE'].isnull().sum()

print("L_QUANTITY have " + str(quantity_nan_check) + " NaN row")
print("L_LINENUMBER have " + str(linenumber_nan_check) + " NaN row")
print("L_LINESTATUS have " + str(linestatus_nan_check) + " NaN row")
print("L_SHIPMODE have " + str(shipmode_nan_check) + " NaN row")

L_QUANTITY have 0 NaN row
L_LINENUMBER have 0 NaN row
L_LINESTATUS have 0 NaN row
L_SHIPMODE have 0 NaN row


In [99]:
# Check 0 or negative value in specific column
quantity_negativeorzero_check = len(lineitem_df[lineitem_df['L_QUANTITY']<=0])
linenumber_negativeorzero_check = len(lineitem_df[lineitem_df['L_LINENUMBER']<=0])

print("L_QUANTITY contain " + str(quantity_negativeorzero_check) + " rows zero or negative value")
print("L_LINENUMBER contain " + str(linenumber_negativeorzero_check) + " rows zero or negative value")

L_QUANTITY contain 0 rows zero or negative value
L_LINENUMBER contain 0 rows zero or negative value


In [104]:
# Check out-of-scope value in specific
linestatus_outofscope_check = len(lineitem_df[~lineitem_df['L_LINESTATUS'].isin(['F', 'O'])])
shipmode_outofscope_check = len(lineitem_df[~lineitem_df['L_SHIPMODE'].isin(['TRUCK', 'AIR', 'MAIL', 'SHIP', 'RAIL', 'FOB', 'REG AIR'])])

print("L_LINESTATUS contain " + str(linestatus_outofscope_check) + " rows out-of-scope value")
print("L_SHIPMODE contain " + str(shipmode_outofscope_check) + " rows out-of-scope value")

L_LINESTATUS contain 0 rows out-of-scope value
L_SHIPMODE contain 0 rows out-of-scope value


#### Order

In [105]:
# Show dataframe information
order_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 847307 entries, 1 to 1101002
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   O_ORDERKEY       847307 non-null  int64         
 1   O_CUSTKEY        847307 non-null  int64         
 2   O_ORDERSTATUS    847307 non-null  object        
 3   O_TOTALPRICE     847307 non-null  float64       
 4   O_ORDERDATE      847307 non-null  datetime64[ns]
 5   O_ORDERPRIORITY  847307 non-null  object        
 6   O_CLERK          847307 non-null  object        
 7   O_SHIPPRIORITY   847307 non-null  int64         
 8   O_COMMENT        847307 non-null  object        
 9   O_ORDERYEAR      847307 non-null  int32         
 10  O_ORDERDAY       847307 non-null  int32         
 11  O_ORDERMONTH     847307 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(3), object(4)
memory usage: 74.3+ MB


In [106]:
# Check duplicate row
order_duplicate_check = order_df.duplicated().sum()

print(f"LineItem dataset contain {str(order_duplicate_check)} duplicate rows")

LineItem dataset contain 0 duplicate rows


In [107]:
# Check NaN value in specific column
orderstatus_nan_check = order_df['O_ORDERSTATUS'].isnull().sum()
orderdate_nan_check = order_df['O_ORDERDATE'].isnull().sum()
orderpriority_nan_check = order_df['O_ORDERPRIORITY'].isnull().sum()

print("O_ORDERSTATUS have " + str(orderstatus_nan_check) + " NaN row")
print("O_ORDERDATE have " + str(orderdate_nan_check) + " NaN row")
print("O_ORDERPRIORITY have " + str(orderpriority_nan_check) + " NaN row")

O_ORDERSTATUS have 0 NaN row
O_ORDERDATE have 0 NaN row
O_ORDERPRIORITY have 0 NaN row


In [108]:
# Check out-of-scope value in specific column
orderstatus_outofscope_check = len(order_df[~order_df['O_ORDERSTATUS'].isin(['F', 'O', 'P'])])
orderpriotity_outofscope_check = len(order_df[~order_df['O_ORDERPRIORITY'].isin(['1-URGENT', '2-HIGH', '3-MEDIUM', '4-LOW', '5-NOT SPECIFIED'])])

print("O_ORDERSTATUS contain " + str(orderstatus_outofscope_check) + " rows out-of-scope value")
print("O_ORDERPRIORITY contain " + str(orderpriotity_outofscope_check) + " rows out-of-scope value")

O_ORDERSTATUS contain 0 rows out-of-scope value
O_ORDERPRIORITY contain 0 rows out-of-scope value


In [109]:
# Check valid year for date column
orderdate_valid_year_check = len(order_df[~order_df['O_ORDERYEAR'].between(1992,1998)])

print("O_ORDERDATE contain " + str(orderdate_valid_year_check) + " invalid year (not in 1992-1998)")

O_ORDERDATE contain 0 invalid year (not in 1992-1998)


## Data Manipulation

### Denomalization

<img src="images/data_manipulation_denormalization.png">

- Order < Customer < Nation < Region
- Supplier < Nation < Region
- LineItem
- PartSupp
- Part

### Join Data

#### Join Nation with Region

In [110]:
NR = pd.merge(left=nation_df,
               right=region_df,
               left_on='N_REGIONKEY',
               right_on='R_REGIONKEY',
               how='left')

NR = NR[['N_NATIONKEY','N_NAME','R_NAME']]
NR = NR.rename(columns={"N_NAME": "N_NATION", 
                          "R_NAME": "N_REGION"})

In [111]:
NR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   N_NATIONKEY  25 non-null     int64 
 1   N_NATION     25 non-null     object
 2   N_REGION     25 non-null     object
dtypes: int64(1), object(2)
memory usage: 732.0+ bytes


In [112]:
NR.head()

Unnamed: 0,N_NATIONKEY,N_NATION,N_REGION
0,0,ALGERIA,AFRICA
1,1,ARGENTINA,AMERICA
2,2,BRAZIL,AMERICA
3,3,CANADA,AMERICA
4,4,EGYPT,MIDDLE EAST


In [116]:
# Compare data between header table and joined dataframe
print("Number of row in header table (Nation) before join : " + str(len(nation_df)))
print("Number of row after joined : " + str(len(NR)))


Number of row in header table (Nation) before join : 25
Number of row after joined : 25


#### Join Customer with Nation and Region

In [118]:
CNR = pd.merge(left=customer_df,
                 right=NR,
                 left_on='C_NATIONKEY',
                 right_on='N_NATIONKEY',
                 how='left')

CNR = CNR.drop(['N_NATIONKEY','C_COMMENT'],axis=1)
CNR = CNR.rename(columns={"N_NATION": "C_NATION",
                              "N_REGION": "C_REGION"})

In [119]:
CNR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   C_CUSTKEY     150000 non-null  int64  
 1   C_NAME        150000 non-null  object 
 2   C_ADDRESS     150000 non-null  object 
 3   C_NATIONKEY   150000 non-null  int64  
 4   C_PHONE       150000 non-null  object 
 5   C_ACCTBAL     150000 non-null  float64
 6   C_MKTSEGMENT  150000 non-null  object 
 7   C_NATION      150000 non-null  object 
 8   C_REGION      150000 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 10.3+ MB


In [120]:
CNR.head()

Unnamed: 0,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_NATION,C_REGION
0,60001,Customer#000060001,9Ii4zQn9cX,14,24-678-784-9652,9957.56,HOUSEHOLD,KENYA,AFRICA
1,60002,Customer#000060002,ThGBMjDwKzkoOxhz,15,25-782-500-8435,742.46,BUILDING,MOROCCO,AFRICA
2,60003,Customer#000060003,Ed hbPtTXMTAsgGhCr4HuTzKMd2,16,26-859-847-7640,2526.92,BUILDING,MOZAMBIQUE,AFRICA
3,60004,Customer#000060004,NivCT2RVaavlyUnKwBjDyMvB42WayXCnky,10,20-573-674-7999,7975.22,AUTOMOBILE,IRAN,MIDDLE EAST
4,60005,Customer#000060005,1F3KM3ccEXEtI B22XmCMOWJMl,12,22-741-208-1316,2504.74,MACHINERY,JAPAN,ASIA


In [121]:
# Compare data between header table and joined dataframe
print("Number of row in header table (Customer) before join : " + str(len(customer_df)))
print("Number of row after joined : " + str(len(CNR)))

Number of row in header table (Customer) before join : 150000
Number of row after joined : 150000


#### Join Order with Customer, Nation and Region

In [128]:
OCNR = pd.merge(left=order_df,
                right=CNR,
                left_on="O_CUSTKEY",
                right_on="C_CUSTKEY",
                how='left')

#OCNR = OCNR.drop(######################)
OCNR = OCNR.drop(['C_CUSTKEY'],axis=1)

In [129]:
OCNR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847307 entries, 0 to 847306
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   O_ORDERKEY       847307 non-null  int64         
 1   O_CUSTKEY        847307 non-null  int64         
 2   O_ORDERSTATUS    847307 non-null  object        
 3   O_TOTALPRICE     847307 non-null  float64       
 4   O_ORDERDATE      847307 non-null  datetime64[ns]
 5   O_ORDERPRIORITY  847307 non-null  object        
 6   O_CLERK          847307 non-null  object        
 7   O_SHIPPRIORITY   847307 non-null  int64         
 8   O_COMMENT        847307 non-null  object        
 9   O_ORDERYEAR      847307 non-null  int32         
 10  O_ORDERDAY       847307 non-null  int32         
 11  O_ORDERMONTH     847307 non-null  int32         
 12  C_NAME           847307 non-null  object        
 13  C_ADDRESS        847307 non-null  object        
 14  C_NATIONKEY      847

In [130]:
OCNR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847307 entries, 0 to 847306
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   O_ORDERKEY       847307 non-null  int64         
 1   O_CUSTKEY        847307 non-null  int64         
 2   O_ORDERSTATUS    847307 non-null  object        
 3   O_TOTALPRICE     847307 non-null  float64       
 4   O_ORDERDATE      847307 non-null  datetime64[ns]
 5   O_ORDERPRIORITY  847307 non-null  object        
 6   O_CLERK          847307 non-null  object        
 7   O_SHIPPRIORITY   847307 non-null  int64         
 8   O_COMMENT        847307 non-null  object        
 9   O_ORDERYEAR      847307 non-null  int32         
 10  O_ORDERDAY       847307 non-null  int32         
 11  O_ORDERMONTH     847307 non-null  int32         
 12  C_NAME           847307 non-null  object        
 13  C_ADDRESS        847307 non-null  object        
 14  C_NATIONKEY      847

In [131]:
OCNR.head()

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT,O_ORDERYEAR,O_ORDERDAY,O_ORDERMONTH,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_NATION,C_REGION
0,4200002,129376,O,256838.41,1997-04-14,4-LOW,Clerk#000000281,0,comment,1997,14,4,Customer#000129376,3ZHy4RbofQRWdopofrOBELvRaEJqeIu,3,13-963-672-5643,-730.88,AUTOMOBILE,CANADA,AMERICA
1,4200003,141613,O,150849.49,1997-11-24,4-LOW,Clerk#000000585,0,comment,1997,24,11,Customer#000141613,dPodRyFNKQHMZxpU,12,22-455-738-5858,880.03,HOUSEHOLD,JAPAN,ASIA
2,4200006,49576,F,263616.19,1993-12-31,4-LOW,Clerk#000000650,0,comment,1993,31,12,Customer#000049576,FCWMIWtDvjmi6XneYUmCPVdLJ,1,11-835-830-8347,9288.93,BUILDING,ARGENTINA,AMERICA
3,4200032,72820,F,64160.97,1994-12-21,4-LOW,Clerk#000000302,0,comment,1994,21,12,Customer#000072820,G Ukz5qar5Tu4awO27vPu7KNlpYeOZvFpt0f,18,28-984-250-1868,7927.21,FURNITURE,CHINA,ASIA
4,4200033,22528,O,218653.88,1996-03-24,4-LOW,Clerk#000000868,0,comment,1996,24,3,Customer#000022528,VtfsGb0Rx5LgAHV5vfIh,13,23-735-561-2839,9516.88,FURNITURE,JORDAN,MIDDLE EAST


In [132]:
# Compare data between header table and joined dataframe
print("Number of row in header table (Order) before join : " + str(len(order_df)))
print("Number of row after joined : " + str(len(OCNR)))

Number of row in header table (Order) before join : 847307
Number of row after joined : 847307


#### Join Supplier with Nation and Region

In [133]:
SNR = pd.merge(left=supplier_df,
                         right=NR,
                         left_on='S_NATIONKEY',
                         right_on='N_NATIONKEY',
                         how='left')

#SNR = SNR[['S_SUPPKEY','S_NAME','S_ACCTBAL','NATION','REGION']]
SNR = SNR.drop(['N_NATIONKEY'],axis=1)
SNR = SNR.rename(columns={"N_NATION": "S_NATION",
                              "N_REGION": "S_REGION"})

In [134]:
SNR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   S_SUPPKEY    10000 non-null  int64  
 1   S_NAME       10000 non-null  object 
 2   S_ADDRESS    10000 non-null  object 
 3   S_NATIONKEY  10000 non-null  int64  
 4   S_PHONE      10000 non-null  object 
 5   S_ACCTBAL    10000 non-null  float64
 6   S_COMMENT    10000 non-null  object 
 7   S_NATION     10000 non-null  object 
 8   S_REGION     10000 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB


In [135]:
SNR.head()

Unnamed: 0,S_SUPPKEY,S_NAME,S_ADDRESS,S_NATIONKEY,S_PHONE,S_ACCTBAL,S_COMMENT,S_NATION,S_REGION
0,1,Supplier#000000001,"N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ",17,27-918-335-1736,5755.94,each slyly above the careful,PERU,AMERICA
1,2,Supplier#000000002,"89eJ5ksX3ImxJQBvxObC,",5,15-679-861-2259,4032.68,slyly bold instructions. idle dependen,ETHIOPIA,AFRICA
2,3,Supplier#000000003,"q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3",1,11-383-516-1199,4192.4,blithely silent requests after the express dep...,ARGENTINA,AMERICA
3,4,Supplier#000000004,Bk7ah4CK8SYQTepEmvMkkgMwg,15,25-843-787-7479,4641.08,riously even requests above the exp,MOROCCO,AFRICA
4,5,Supplier#000000005,Gcdm2rJRzl5qlTVzc,11,21-151-690-3663,-283.84,. slyly regular pinto bea,IRAQ,MIDDLE EAST


In [136]:
# Compare data between header table and joined dataframe
print("Number of row in header table (Supplier) before join : " + str(len(supplier_df)))
print("Number of row after joined : " + str(len(SNR)))

Number of row in header table (Supplier) before join : 10000
Number of row after joined : 10000


### Feature Creation

In [137]:
# Calculate lead day
OCNR['LEADDAY'] = OCNR.sort_values(['C_NAME','O_ORDERDATE']).groupby(['C_NAME'],sort=False)['O_ORDERDATE'].diff()

In [138]:
# Extract day from lead day column
OCNR['LEADDAY'] = OCNR['LEADDAY'].dt.days

In [139]:
OCNR = OCNR.dropna(subset=['LEADDAY'])

In [140]:
OCNR['LEADDAY'].value_counts()

LEADDAY
2.00       3326
6.00       3285
4.00       3284
11.00      3248
7.00       3247
           ... 
1930.00       1
2129.00       1
1882.00       1
1850.00       1
2158.00       1
Name: count, Length: 2092, dtype: int64

## Analytics Product development

### Download Data for visualization in next lab

In [141]:
#Create output folder if not exist
if not os.path.exists("clean_data"):
    os.makedirs("clean_data")

data_dir = "clean_data"

#Download modeled data into clean data folder
OCNR.to_csv(f"{data_dir}/OCNR.csv")
SNR.to_csv(f"{data_dir}/SNR.csv")
lineitem_df.to_csv(f"{data_dir}/L.csv")
part_df.to_csv(f"{data_dir}/P.csv")
partsupp_df.to_csv(f"{data_dir}/PS.csv")