In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

path = r"D:\DS_Final_Project\Notebook\data\RAW_CSV"

customer = pd.read_csv(f"{path}\\customer.csv")
cust_order = pd.read_csv(f"{path}\\cust_order.csv")
customer_address_id = pd.read_csv(f"{path}\\customer_address.csv")
customer_address_details = pd.read_csv(f"{path}\\address.csv")
address_status = pd.read_csv(f"{path}\\address_status.csv")
order_history = pd.read_csv(f"{path}\\order_history.csv")
order_line = pd.read_csv(f"{path}\\order_line.csv")
order_status = pd.read_csv(f"{path}\\order_status.csv")
shipping_method = pd.read_csv(f"{path}\\shipping_method.csv")
book = pd.read_csv(f"{path}\\book.csv")
book_author_id = pd.read_csv(f"{path}\\book_author.csv")
book_author_name = pd.read_csv(f"{path}\\author.csv")
book_language = pd.read_csv(f"{path}\\book_language.csv")
publisher = pd.read_csv(f"{path}\\publisher.csv")



In [76]:
tables = [
    ("customer", customer),
    ("cust_order", cust_order),
    ("customer_address_id",customer_address_id),
    ("customer_address_details",customer_address_details),
    ("address_status", address_status),
    ("order_history", order_history),
    ("order_line", order_line),
    ("order_status", order_status),
    ("shipping_method", shipping_method),
    ("book",book),
    ("book_author_id",book_author_id),
    ("book_author_name",book_author_name),
    ("book_language",book_language),
    ("publisher",publisher)]

for name,df in tables:
    print(f"{name} : {list(df.columns)}")
    print("-"*50)


customer : ['customer_id', 'first_name', 'last_name', 'email']
--------------------------------------------------
cust_order : ['order_id', 'order_date', 'customer_id', 'shipping_method_id', 'dest_address_id']
--------------------------------------------------
customer_address_id : ['customer_id', 'address_id', 'status_id']
--------------------------------------------------
customer_address_details : ['address_id', 'street_number', 'street_name', 'city', 'country_id']
--------------------------------------------------
address_status : ['status_id', 'address_status']
--------------------------------------------------
order_history : ['history_id', 'order_id', 'status_id', 'status_date']
--------------------------------------------------
order_line : ['line_id', 'order_id', 'book_id', 'price']
--------------------------------------------------
order_status : ['status_id', 'status_value']
--------------------------------------------------
shipping_method : ['method_id', 'method_name', 'co

In [77]:
# checking dtypes 
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  2000 non-null   int64 
 1   first_name   2000 non-null   object
 2   last_name    2000 non-null   object
 3   email        2000 non-null   object
dtypes: int64(1), object(3)
memory usage: 62.6+ KB


In [79]:
# Inspect each table
for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    print(f"Table: {table_name}")
    print(df.info())
    print(df.head(), "\n")
    print("-"*100)


Table: customer
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  2000 non-null   int64 
 1   first_name   2000 non-null   object
 2   last_name    2000 non-null   object
 3   email        2000 non-null   object
dtypes: int64(1), object(3)
memory usage: 62.6+ KB
None
   customer_id first_name last_name                   email
0            1     Ursola     Purdy      upurdy0@cdbaby.com
1            2   Ruthanne    Vatini       rvatini1@fema.gov
2            3     Reidar   Turbitt  rturbitt2@geocities.jp
3            4       Rich     Kirsz      rkirsz3@jalbum.net
4            5    Carline     Kupis        ckupis4@tamu.edu 

----------------------------------------------------------------------------------------------------
Table: cust_order
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7550 entries, 0 to 7549
Data columns (total 5

**Observation**:

1. In cust_order order date seems to be object we need to convert that into datetime formate.
2. In customer_address_details street name and city seems to be object type we need to check for that.
3. In address_status - address status column seems to be categorical column type.
4. In order_history - status date seems to object. convert to datetime formate.
5. In order status - status value column seems to categorical column type.
6. In shipping_method - method_name column seems to be categorical.
7. In boob - title and publication_date column seems to object type we need formate it.
8. In book_author_name - author_name seems to be object type we need to verify whether it is cat column or not.
9. book_language - language_code & language_name seems to be object. Check whether it is a cat column or not.
10. publisher - publisher_name seems to be object type. check for the column type.

In [80]:
# Check for missing values in all datasets

for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    print(f"Table: {table_name}")
    print(df.isnull().sum(), "\n")
    print("-"*50)


Table: customer
customer_id    0
first_name     0
last_name      0
email          0
dtype: int64 

--------------------------------------------------
Table: cust_order
order_id              0
order_date            0
customer_id           0
shipping_method_id    0
dest_address_id       0
dtype: int64 

--------------------------------------------------
Table: customer_address_id
customer_id    0
address_id     0
status_id      0
dtype: int64 

--------------------------------------------------
Table: customer_address_details
address_id       0
street_number    0
street_name      0
city             0
country_id       0
dtype: int64 

--------------------------------------------------
Table: address_status
status_id         0
address_status    0
dtype: int64 

--------------------------------------------------
Table: order_history
history_id     0
order_id       0
status_id      0
status_date    0
dtype: int64 

--------------------------------------------------
Table: order_line
line_id 

**Observation:**
1. No missing values seen on any tables.

In [81]:
# Drop duplicates in all datasets
for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    duplicates = df.duplicated().sum()
    print(f"Table: {table_name} | Duplicates: {duplicates}")
    if duplicates > 0:
       df.drop_duplicates(inplace=True)


Table: customer | Duplicates: 0
Table: cust_order | Duplicates: 0
Table: customer_address_id | Duplicates: 0
Table: customer_address_details | Duplicates: 0
Table: address_status | Duplicates: 0
Table: order_history | Duplicates: 0
Table: order_line | Duplicates: 0
Table: order_status | Duplicates: 0
Table: shipping_method | Duplicates: 0
Table: book | Duplicates: 0
Table: book_author_id | Duplicates: 0
Table: book_author_name | Duplicates: 0
Table: book_language | Duplicates: 0
Table: publisher | Duplicates: 0


**Observation:
**1. No duplicates has been seen on any tables.

In [82]:
# Check for the column names are same before merging

def standardize_column_name(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(" ","_")
    return df
    
# Apply to all datasets
customer = standardize_column_name(customer)
cust_order = standardize_column_name(cust_order)
customer_address_id = standardize_column_name(customer_address_id)
customer_address_details = standardize_column_name(customer_address_details)
address_status = standardize_column_name(address_status)
order_history = standardize_column_name(order_history)
order_line = standardize_column_name(order_line)
order_status = standardize_column_name(order_status)
shipping_method = standardize_column_name(shipping_method)
book = standardize_column_name(book)
book_author_id = standardize_column_name(book_author_id)
book_author_name = standardize_column_name(book_author_name)
book_language = standardize_column_name(book_language)
publisher = standardize_column_name(publisher)

In [83]:
# Dictionary to map table names to their DataFrames
tables = {
    "customer": customer,
    "cust_order": cust_order,
    "customer_address_id": customer_address_id,
    "customer_address_details": customer_address_details,
    "address_status": address_status,
    "order_history": order_history,
    "order_line": order_line,
    "order_status": order_status,
    "shipping_method": shipping_method,
    "book": book,
    "book_author_id": book_author_id,
    "book_author_name": book_author_name,
    "book_language": book_language,
    "publisher": publisher,
}

# Dictionary to map table names to their primary keys
primary_keys = {
    "customer": "customer_id",
    "cust_order": "order_id",
    "customer_address_id": "address_id",
    "customer_address_details": "address_id",
    "address_status": "status_id",
    "order_history": "history_id",
    "order_line": "line_id",
    "order_status": "status_id",
    "shipping_method": "method_id",
    "book": "book_id",
    "book_author_id": "author_id",
    "book_author_name": "author_id",
    "book_language": "language_id",
    "publisher": "publisher_id",
}

# Loop through tables and check uniqueness of primary keys
for table_name, df in tables.items():
    # Get the primary key for the current table
    key = primary_keys.get(table_name)
    if key:
        # Check if the primary key column is unique
        unique = df[key].is_unique
        print(f"Table: {table_name} | Primary Key: {key} | Is Unique: {unique}")
    else:
        print(f"Table: {table_name} has no primary key defined.")


Table: customer | Primary Key: customer_id | Is Unique: True
Table: cust_order | Primary Key: order_id | Is Unique: True
Table: customer_address_id | Primary Key: address_id | Is Unique: False
Table: customer_address_details | Primary Key: address_id | Is Unique: True
Table: address_status | Primary Key: status_id | Is Unique: True
Table: order_history | Primary Key: history_id | Is Unique: True
Table: order_line | Primary Key: line_id | Is Unique: True
Table: order_status | Primary Key: status_id | Is Unique: True
Table: shipping_method | Primary Key: method_id | Is Unique: True
Table: book | Primary Key: book_id | Is Unique: True
Table: book_author_id | Primary Key: author_id | Is Unique: False
Table: book_author_name | Primary Key: author_id | Is Unique: True
Table: book_language | Primary Key: language_id | Is Unique: True
Table: publisher | Primary Key: publisher_id | Is Unique: True


In [84]:
# Checking the dtype " o " column in the tables:
obj_column = {}

for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    
    obj_column[table_name] = []
    columns = list(df.columns)
    
    for col_name in columns:
        if df[col_name].dtype == "O":
            obj_column[table_name].append(col_name)
import pprint
pprint.pprint(obj_column)   


{'address_status': ['address_status'],
 'book': ['title', 'publication_date'],
 'book_author_id': [],
 'book_author_name': ['author_name'],
 'book_language': ['language_code', 'language_name'],
 'cust_order': ['order_date'],
 'customer': ['first_name', 'last_name', 'email'],
 'customer_address_details': ['street_name', 'city'],
 'customer_address_id': [],
 'order_history': ['status_date'],
 'order_line': [],
 'order_status': ['status_value'],
 'publisher': ['publisher_name'],
 'shipping_method': ['method_name']}


In [85]:
# 1st lets format the date column in cust_order tables.

cust_order['order_date'] = pd.to_datetime(cust_order['order_date'])
cust_order['order_day'] = cust_order.order_date.dt.day
cust_order['order_month'] = cust_order.order_date.dt.month
cust_order['order_year'] = cust_order.order_date.dt.year
cust_order['order_month_name'] = cust_order.order_date.dt.month_name()

cust_order.dtypes


order_id                       int64
order_date            datetime64[ns]
customer_id                    int64
shipping_method_id             int64
dest_address_id                int64
order_day                      int32
order_month                    int32
order_year                     int32
order_month_name              object
dtype: object

In [86]:
# Formatting the order history tables date column

order_history['status_date'] = pd.to_datetime(order_history['status_date'])
order_history['hist_status_day'] = order_history['status_date'].dt.day
order_history['hist_status_month'] = order_history['status_date'].dt.month
order_history['hist_status_year'] = order_history['status_date'].dt.year

order_history.dtypes

history_id                    int64
order_id                      int64
status_id                     int64
status_date          datetime64[ns]
hist_status_day               int32
hist_status_month             int32
hist_status_year              int32
dtype: object

In [88]:
# Formatting boo table publish date

book['publication_date'] = pd.to_datetime(book['publication_date'])
book['publish_day'] = book['publication_date'].dt.day
book['publish_month'] = book['publication_date'].dt.month
book['publish_year'] = book['publication_date'].dt.year
book.sample(5).dtypes

book_id                      int64
title                       object
isbn13                       int64
language_id                  int64
num_pages                    int64
publication_date    datetime64[ns]
publisher_id                 int64
publish_day                  int32
publish_month                int32
publish_year                 int32
dtype: object

In [96]:
# Converting to price column to numberic

order_line['price'] = pd.to_numeric(order_line['price'], errors='coerce')

In [None]:
# Removing trailing space in object data type.

# Checking the dtype " o " column in the tables:
obj_column = {}

for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    
    obj_column[table_name] = []
    columns = list(df.columns)
    
    for col_name in columns:
        if df[col_name].dtype == "O":
            obj_column[table_name].append(col_name)
import pprint
#pprint.pprint(obj_column)

# Function to strip the space and lower the letter cases.

def remove_trailing_space(df):
    return df.str.strip().str.lower()

for table_name,col_name in obj_column.items():
    df = eval(table_name)
    for col in df:
        df[col_name] = df[col_name].apply(remove_trailing_space)

      


In [123]:
# Check sizes
print("Customer size:", customer.shape)
print("customer address id size:", customer_address_id.shape)
print("customer details size:", customer_address_details.shape)
print("address status size:", address_status.shape)



Customer size: (2000, 4)
customer address id size: (3350, 3)
customer details size: (1000, 5)
address status size: (2, 2)


In [126]:
print(customer.columns)
print(customer_address_id.columns)
print(customer_address_details.columns)
print(address_status.columns)

Index(['customer_id', 'first_name', 'last_name', 'email'], dtype='object')
Index(['customer_id', 'address_id', 'status_id'], dtype='object')
Index(['address_id', 'street_number', 'street_name', 'city', 'country_id'], dtype='object')
Index(['status_id', 'address_status'], dtype='object')


In [131]:
# MErging customer with customer address ID table

customer_data = pd.merge(customer,customer_address_id, how='left',on='customer_id')
customer_data = pd.merge(customer_data,customer_address_details,how='left',on='address_id')
customer_data = pd.merge(customer_data,address_status,how='left',on='status_id')
customer_data.sample(5)

Unnamed: 0,customer_id,first_name,last_name,email,address_id,status_id,street_number,street_name,city,country_id,address_status
2778,1669,vyky,joberne,vjoberneik@posterous.com,794,1,76842,butternut plaza,xiaowuzhan,42,active
1281,773,louie,winckle,lwincklelg@shop-pro.jp,374,1,51,manley parkway,nong khai,200,active
2853,1716,julissa,luckcock,jluckcockjv@vistaprint.com,588,1,5418,emmet parkway,maych’ew,66,active
204,124,modesta,gullefant,mgullefant3f@businessinsider.com,589,1,22,springview way,karangnunggal,92,active
2881,1732,garret,mckay,gmckaykb@netvibes.com,155,2,23789,summit road,tangjia,42,inactive


In [134]:
print(f"Shape of merged customer data : {customer_data.shape}")


Shape of merged customer data : (3350, 11)
