In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

path = r"D:\DS_Final_Project\Notebook\data\RAW_CSV"

customer = pd.read_csv(f"{path}\\customer.csv")
cust_order = pd.read_csv(f"{path}\\cust_order.csv")
customer_address_id = pd.read_csv(f"{path}\\customer_address.csv")
customer_address_details = pd.read_csv(f"{path}\\address.csv")
address_status = pd.read_csv(f"{path}\\address_status.csv")
order_history = pd.read_csv(f"{path}\\order_history.csv")
order_line = pd.read_csv(f"{path}\\order_line.csv")
order_status = pd.read_csv(f"{path}\\order_status.csv")
shipping_method = pd.read_csv(f"{path}\\shipping_method.csv")
book = pd.read_csv(f"{path}\\book.csv")
book_author_id = pd.read_csv(f"{path}\\book_author.csv")
book_author_name = pd.read_csv(f"{path}\\author.csv")
book_language = pd.read_csv(f"{path}\\book_language.csv")
publisher = pd.read_csv(f"{path}\\publisher.csv")



In [32]:
customer.customer_id.nunique()

2000

In [33]:
tables = [
    ("customer", customer),
    ("cust_order", cust_order),
    ("customer_address_id",customer_address_id),
    ("customer_address_details",customer_address_details),
    ("address_status", address_status),
    ("order_history", order_history),
    ("order_line", order_line),
    ("order_status", order_status),
    ("shipping_method", shipping_method),
    ("book",book),
    ("book_author_id",book_author_id),
    ("book_author_name",book_author_name),
    ("book_language",book_language),
    ("publisher",publisher)]

for name,df in tables:
    print(f"{name} : {list(df.columns)}")
    print("-"*50)


customer : ['customer_id', 'first_name', 'last_name', 'email']
--------------------------------------------------
cust_order : ['order_id', 'order_date', 'customer_id', 'shipping_method_id', 'dest_address_id']
--------------------------------------------------
customer_address_id : ['customer_id', 'address_id', 'status_id']
--------------------------------------------------
customer_address_details : ['address_id', 'street_number', 'street_name', 'city', 'country_id']
--------------------------------------------------
address_status : ['status_id', 'address_status']
--------------------------------------------------
order_history : ['history_id', 'order_id', 'status_id', 'status_date']
--------------------------------------------------
order_line : ['line_id', 'order_id', 'book_id', 'price']
--------------------------------------------------
order_status : ['status_id', 'status_value']
--------------------------------------------------
shipping_method : ['method_id', 'method_name', 'co

In [34]:
# checking dtypes 
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  2000 non-null   int64 
 1   first_name   2000 non-null   object
 2   last_name    2000 non-null   object
 3   email        2000 non-null   object
dtypes: int64(1), object(3)
memory usage: 62.6+ KB


In [35]:
customer.isna().sum()

customer_id    0
first_name     0
last_name      0
email          0
dtype: int64

In [36]:
customer[customer.isnull().all(axis=1)]

Unnamed: 0,customer_id,first_name,last_name,email


In [37]:
# Inspect each table
for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    print(f"Table: {table_name}")
    print(df.info())
    print(df.head(), "\n")
    print("-"*100)


Table: customer
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  2000 non-null   int64 
 1   first_name   2000 non-null   object
 2   last_name    2000 non-null   object
 3   email        2000 non-null   object
dtypes: int64(1), object(3)
memory usage: 62.6+ KB
None
   customer_id first_name last_name                   email
0            1     Ursola     Purdy      upurdy0@cdbaby.com
1            2   Ruthanne    Vatini       rvatini1@fema.gov
2            3     Reidar   Turbitt  rturbitt2@geocities.jp
3            4       Rich     Kirsz      rkirsz3@jalbum.net
4            5    Carline     Kupis        ckupis4@tamu.edu 

----------------------------------------------------------------------------------------------------
Table: cust_order
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7550 entries, 0 to 7549
Data columns (total 5

**Observation**:

1. In cust_order order date seems to be object we need to convert that into datetime formate.
2. In customer_address_details street name and city seems to be object type we need to check for that.
3. In address_status - address status column seems to be categorical column type.
4. In order_history - status date seems to object. convert to datetime formate.
5. In order status - status value column seems to categorical column type.
6. In shipping_method - method_name column seems to be categorical.
7. In boob - title and publication_date column seems to object type we need formate it.
8. In book_author_name - author_name seems to be object type we need to verify whether it is cat column or not.
9. book_language - language_code & language_name seems to be object. Check whether it is a cat column or not.
10. publisher - publisher_name seems to be object type. check for the column type.

In [38]:
# Check for missing values in all datasets

for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    print(f"Table: {table_name}")
    print(df.isnull().sum(), "\n")
    print("-"*50)


Table: customer
customer_id    0
first_name     0
last_name      0
email          0
dtype: int64 

--------------------------------------------------
Table: cust_order
order_id              0
order_date            0
customer_id           0
shipping_method_id    0
dest_address_id       0
dtype: int64 

--------------------------------------------------
Table: customer_address_id
customer_id    0
address_id     0
status_id      0
dtype: int64 

--------------------------------------------------
Table: customer_address_details
address_id       0
street_number    0
street_name      0
city             0
country_id       0
dtype: int64 

--------------------------------------------------
Table: address_status
status_id         0
address_status    0
dtype: int64 

--------------------------------------------------
Table: order_history
history_id     0
order_id       0
status_id      0
status_date    0
dtype: int64 

--------------------------------------------------
Table: order_line
line_id 

**Observation:**
1. No missing values seen on any tables.

In [39]:
# Drop duplicates in all datasets
for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    duplicates = df.duplicated().sum()
    print(f"Table: {table_name} | Duplicates: {duplicates}")
    if duplicates > 0:
       df.drop_duplicates(inplace=True)


Table: customer | Duplicates: 0
Table: cust_order | Duplicates: 0
Table: customer_address_id | Duplicates: 0
Table: customer_address_details | Duplicates: 0
Table: address_status | Duplicates: 0
Table: order_history | Duplicates: 0
Table: order_line | Duplicates: 0
Table: order_status | Duplicates: 0
Table: shipping_method | Duplicates: 0
Table: book | Duplicates: 0
Table: book_author_id | Duplicates: 0
Table: book_author_name | Duplicates: 0
Table: book_language | Duplicates: 0
Table: publisher | Duplicates: 0


**Observation:
**1. No duplicates has been seen on any tables.

In [40]:
# Check for the column names are same before merging

def standardize_column_name(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(" ","_")
    return df
    
# Apply to all datasets
customer = standardize_column_name(customer)
cust_order = standardize_column_name(cust_order)
customer_address_id = standardize_column_name(customer_address_id)
customer_address_details = standardize_column_name(customer_address_details)
address_status = standardize_column_name(address_status)
order_history = standardize_column_name(order_history)
order_line = standardize_column_name(order_line)
order_status = standardize_column_name(order_status)
shipping_method = standardize_column_name(shipping_method)
book = standardize_column_name(book)
book_author_id = standardize_column_name(book_author_id)
book_author_name = standardize_column_name(book_author_name)
book_language = standardize_column_name(book_language)
publisher = standardize_column_name(publisher)

In [41]:
# Dictionary to map table names to their DataFrames
tables = {
    "customer": customer,
    "cust_order": cust_order,
    "customer_address_id": customer_address_id,
    "customer_address_details": customer_address_details,
    "address_status": address_status,
    "order_history": order_history,
    "order_line": order_line,
    "order_status": order_status,
    "shipping_method": shipping_method,
    "book": book,
    "book_author_id": book_author_id,
    "book_author_name": book_author_name,
    "book_language": book_language,
    "publisher": publisher,
}

# Dictionary to map table names to their primary keys
primary_keys = {
    "customer": "customer_id",
    "cust_order": "order_id",
    "customer_address_id": "address_id",
    "customer_address_details": "address_id",
    "address_status": "status_id",
    "order_history": "history_id",
    "order_line": "line_id",
    "order_status": "status_id",
    "shipping_method": "method_id",
    "book": "book_id",
    "book_author_id": "author_id",
    "book_author_name": "author_id",
    "book_language": "language_id",
    "publisher": "publisher_id",
}

# Loop through tables and check uniqueness of primary keys
for table_name, df in tables.items():
    # Get the primary key for the current table
    key = primary_keys.get(table_name)
    if key:
        # Check if the primary key column is unique
        unique = df[key].is_unique
        print(f"Table: {table_name} | Primary Key: {key} | Is Unique: {unique}")
    else:
        print(f"Table: {table_name} has no primary key defined.")


Table: customer | Primary Key: customer_id | Is Unique: True
Table: cust_order | Primary Key: order_id | Is Unique: True
Table: customer_address_id | Primary Key: address_id | Is Unique: False
Table: customer_address_details | Primary Key: address_id | Is Unique: True
Table: address_status | Primary Key: status_id | Is Unique: True
Table: order_history | Primary Key: history_id | Is Unique: True
Table: order_line | Primary Key: line_id | Is Unique: True
Table: order_status | Primary Key: status_id | Is Unique: True
Table: shipping_method | Primary Key: method_id | Is Unique: True
Table: book | Primary Key: book_id | Is Unique: True
Table: book_author_id | Primary Key: author_id | Is Unique: False
Table: book_author_name | Primary Key: author_id | Is Unique: True
Table: book_language | Primary Key: language_id | Is Unique: True
Table: publisher | Primary Key: publisher_id | Is Unique: True


In [42]:
# Checking the dtype " o " column in the tables:
obj_column = {}

for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    
    obj_column[table_name] = []
    columns = list(df.columns)
    
    for col_name in columns:
        if df[col_name].dtype == "O":
            obj_column[table_name].append(col_name)
import pprint
pprint.pprint(obj_column)   


{'address_status': ['address_status'],
 'book': ['title', 'publication_date'],
 'book_author_id': [],
 'book_author_name': ['author_name'],
 'book_language': ['language_code', 'language_name'],
 'cust_order': ['order_date'],
 'customer': ['first_name', 'last_name', 'email'],
 'customer_address_details': ['street_name', 'city'],
 'customer_address_id': [],
 'order_history': ['status_date'],
 'order_line': [],
 'order_status': ['status_value'],
 'publisher': ['publisher_name'],
 'shipping_method': ['method_name']}


In [43]:
# 1st lets format the date column in cust_order tables.

cust_order['order_date'] = pd.to_datetime(cust_order['order_date'])
cust_order['order_day'] = cust_order.order_date.dt.day
cust_order['order_month'] = cust_order.order_date.dt.month
cust_order['order_year'] = cust_order.order_date.dt.year
cust_order['order_month_name'] = cust_order.order_date.dt.month_name()

cust_order.dtypes


order_id                       int64
order_date            datetime64[ns]
customer_id                    int64
shipping_method_id             int64
dest_address_id                int64
order_day                      int32
order_month                    int32
order_year                     int32
order_month_name              object
dtype: object

In [44]:
# Formatting the order history tables date column

order_history['status_date'] = pd.to_datetime(order_history['status_date'])
order_history['hist_status_day'] = order_history['status_date'].dt.day
order_history['hist_status_month'] = order_history['status_date'].dt.month
order_history['hist_status_year'] = order_history['status_date'].dt.year

order_history.dtypes

history_id                    int64
order_id                      int64
status_id                     int64
status_date          datetime64[ns]
hist_status_day               int32
hist_status_month             int32
hist_status_year              int32
dtype: object

In [45]:
# Formatting boo table publish date

book['publication_date'] = pd.to_datetime(book['publication_date'])
book['publish_day'] = book['publication_date'].dt.day
book['publish_month'] = book['publication_date'].dt.month
book['publish_year'] = book['publication_date'].dt.year
book.sample(5).dtypes

book_id                      int64
title                       object
isbn13                       int64
language_id                  int64
num_pages                    int64
publication_date    datetime64[ns]
publisher_id                 int64
publish_day                  int32
publish_month                int32
publish_year                 int32
dtype: object

In [46]:
# Converting to price column to numberic

order_line['price'] = pd.to_numeric(order_line['price'], errors='coerce')

In [47]:
# Removing trailing space in object data type.

# Checking the dtype " o " column in the tables:
obj_column = {}

for table_name, df in zip(["customer", "cust_order", "customer_address_id", "customer_address_details", 
                           "address_status", "order_history", "order_line", "order_status", 
                           "shipping_method", "book", "book_author_id", "book_author_name", 
                           "book_language", "publisher"], 
                          [customer, cust_order, customer_address_id, customer_address_details, 
                           address_status, order_history, order_line, order_status, 
                           shipping_method, book, book_author_id, book_author_name, 
                           book_language, publisher]):
    
    obj_column[table_name] = []
    columns = list(df.columns)
    
    for col_name in columns:
        if df[col_name].dtype == "O":
            obj_column[table_name].append(col_name)
import pprint
#pprint.pprint(obj_column)

# Function to strip the space and lower the letter cases.

def remove_trailing_space(df):
    return df.str.strip().str.lower()

for table_name,col_name in obj_column.items():
    df = eval(table_name)
    for col in df:
        df[col_name] = df[col_name].apply(remove_trailing_space)

      


In [48]:
# Check sizes
print("Customer size:", customer.shape)
print("customer address id size:", customer_address_id.shape)
print("customer details size:", customer_address_details.shape)
print("address status size:", address_status.shape)



Customer size: (2000, 4)
customer address id size: (3350, 3)
customer details size: (1000, 5)
address status size: (2, 2)


In [49]:
print(customer.columns)
print(customer_address_id.columns)
print(customer_address_details.columns)
print(address_status.columns)

Index(['customer_id', 'first_name', 'last_name', 'email'], dtype='object')
Index(['customer_id', 'address_id', 'status_id'], dtype='object')
Index(['address_id', 'street_number', 'street_name', 'city', 'country_id'], dtype='object')
Index(['status_id', 'address_status'], dtype='object')


In [50]:
customer_address_id.rename(columns={"status_id" : 'address_status_id'},inplace=True)
address_status.rename(columns={'status_id' : 'address_status_id'},inplace=True)

In [51]:
# MErging customer with customer address ID table

customer_data = pd.merge(customer,customer_address_id, how='left',on='customer_id')
customer_data = pd.merge(customer_data,customer_address_details,how='left',on='address_id')
customer_data = pd.merge(customer_data,address_status,how='left',on='address_status_id')
customer_data

Unnamed: 0,customer_id,first_name,last_name,email,address_id,address_status_id,street_number,street_name,city,country_id,address_status
0,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active
1,1,ursola,purdy,upurdy0@cdbaby.com,962,1,6,school road,timrå,194,active
2,2,ruthanne,vatini,rvatini1@fema.gov,77,1,41492,bartillon circle,klau,92,active
3,2,ruthanne,vatini,rvatini1@fema.gov,708,1,7,thompson point,ylämaa,69,active
4,3,reidar,turbitt,rturbitt2@geocities.jp,39,1,7,bellgrove hill,sumberejo,92,active
...,...,...,...,...,...,...,...,...,...,...,...
3345,1998,georgeanna,garman,ggarmanrp@surveymonkey.com,454,1,725,debs court,radostowice,163,active
3346,1999,ardeen,caret,acaretrq@wsj.com,636,1,733,dovetail place,suchań,163,active
3347,2000,delora,bigglestone,dbigglestonerr@usatoday.com,99,1,53851,meadow valley drive,zvezdara,186,active
3348,2000,delora,bigglestone,dbigglestonerr@usatoday.com,521,1,74655,crownhardt road,al qarmadah,207,active


In [52]:
print(f"Shape of merged customer data : {customer_data.shape}")


Shape of merged customer data : (3350, 11)


In [53]:
customer_data.isnull().sum()

customer_id          0
first_name           0
last_name            0
email                0
address_id           0
address_status_id    0
street_number        0
street_name          0
city                 0
country_id           0
address_status       0
dtype: int64

In [54]:
customer_data.dtypes

customer_id           int64
first_name           object
last_name            object
email                object
address_id            int64
address_status_id     int64
street_number         int64
street_name          object
city                 object
country_id            int64
address_status       object
dtype: object

In [55]:
#Cheking the shape of order related tables :

print(f"shape of cust_order : {cust_order.shape}")
print(f"shape of shipping_method : {shipping_method.shape}")
print(f"shape of order_history : {order_history.shape}")
print(f"shape of order_status : {order_status.shape}")
print(f"shape of order_line : {order_line.shape}")

shape of cust_order : (7550, 9)
shape of shipping_method : (4, 3)
shape of order_history : (22349, 7)
shape of order_status : (6, 2)
shape of order_line : (15400, 4)


In [56]:
# Renaming the column of came data which has different column names

shipping_method.rename(columns={'method_id' : 'shipping_method_id'},inplace=True)

print(shipping_method.columns)

Index(['shipping_method_id', 'method_name', 'cost'], dtype='object')


In [57]:
shipping_method.dtypes

shipping_method_id      int64
method_name            object
cost                  float64
dtype: object

In [58]:
cust_order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7550 entries, 0 to 7549
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            7550 non-null   int64         
 1   order_date          7550 non-null   datetime64[ns]
 2   customer_id         7550 non-null   int64         
 3   shipping_method_id  7550 non-null   int64         
 4   dest_address_id     7550 non-null   int64         
 5   order_day           7550 non-null   int32         
 6   order_month         7550 non-null   int32         
 7   order_year          7550 non-null   int32         
 8   order_month_name    7550 non-null   object        
dtypes: datetime64[ns](1), int32(3), int64(4), object(1)
memory usage: 442.5+ KB


In [59]:
cust_order.shape

(7550, 9)

In [60]:
shipping_method.shape

(4, 3)

In [61]:
# Merging order related data into single table:

order_data = pd.merge(cust_order,shipping_method,how='left',on='shipping_method_id')
order_data['cost'] = pd.to_numeric(order_data['cost'], errors='coerce')

order_data.dtypes

order_id                       int64
order_date            datetime64[ns]
customer_id                    int64
shipping_method_id             int64
dest_address_id                int64
order_day                      int32
order_month                    int32
order_year                     int32
order_month_name              object
method_name                   object
cost                         float64
dtype: object

In [62]:
#cheking shape after merging:
order_data.shape

(7550, 11)

In [63]:
#checking shape before merging:
print(order_history.columns,order_history.shape,sep='\n')

Index(['history_id', 'order_id', 'status_id', 'status_date', 'hist_status_day',
       'hist_status_month', 'hist_status_year'],
      dtype='object')
(22349, 7)


In [64]:
order_data = pd.merge(order_data,order_history,how='left',on='order_id')
order_data.dtypes

order_id                       int64
order_date            datetime64[ns]
customer_id                    int64
shipping_method_id             int64
dest_address_id                int64
order_day                      int32
order_month                    int32
order_year                     int32
order_month_name              object
method_name                   object
cost                         float64
history_id                   float64
status_id                    float64
status_date           datetime64[ns]
hist_status_day              float64
hist_status_month            float64
hist_status_year             float64
dtype: object

In [65]:
#checking shape after merging:
print(f"Shape after merging : {order_data.shape}")

Shape after merging : (22350, 17)


In [66]:
order_data.isnull().sum()

order_id              0
order_date            0
customer_id           0
shipping_method_id    0
dest_address_id       0
order_day             0
order_month           0
order_year            0
order_month_name      0
method_name           0
cost                  0
history_id            1
status_id             1
status_date           1
hist_status_day       1
hist_status_month     1
hist_status_year      1
dtype: int64

In [67]:
order_data[order_data.isnull().any(axis=1)]

Unnamed: 0,order_id,order_date,customer_id,shipping_method_id,dest_address_id,order_day,order_month,order_year,order_month_name,method_name,cost,history_id,status_id,status_date,hist_status_day,hist_status_month,hist_status_year
3338,1120,2024-12-11 02:42:36,729,1,327,11,12,2024,december,standard,5.9,,,NaT,,,


In [68]:
# Converting dtypes to int for int columns:

order_data['history_id'] = order_data['history_id'].fillna(0).astype(int)
order_data['status_id'] = order_data['status_id'].fillna(0).astype(int)
order_data['hist_status_day'] = order_data['hist_status_day'].fillna(0).astype(int)
order_data['hist_status_month'] = order_data['hist_status_month'].fillna(0).astype(int)
order_data['hist_status_year'] = order_data['hist_status_year'].fillna(0).astype(int)
order_data.dtypes

order_id                       int64
order_date            datetime64[ns]
customer_id                    int64
shipping_method_id             int64
dest_address_id                int64
order_day                      int32
order_month                    int32
order_year                     int32
order_month_name              object
method_name                   object
cost                         float64
history_id                     int64
status_id                      int64
status_date           datetime64[ns]
hist_status_day                int64
hist_status_month              int64
hist_status_year               int64
dtype: object

In [69]:
order_data_1 = order_data.copy()

In [70]:
order_status.columns

Index(['status_id', 'status_value'], dtype='object')

In [71]:
order_data_1.columns

Index(['order_id', 'order_date', 'customer_id', 'shipping_method_id',
       'dest_address_id', 'order_day', 'order_month', 'order_year',
       'order_month_name', 'method_name', 'cost', 'history_id', 'status_id',
       'status_date', 'hist_status_day', 'hist_status_month',
       'hist_status_year'],
      dtype='object')

In [72]:
order_status.rename(columns={'status_id' : 'order_status_id'},inplace=True)

In [73]:
order_status['order_status_id'].unique()

array([1, 2, 3, 4, 5, 6])

In [74]:
order_data_1['status_id'].unique()

array([1, 2, 3, 4, 5, 6, 0])

In [75]:
order_status.shape,order_data_1.shape

((6, 2), (22350, 17))

In [76]:
if order_data_1['status_id'].equals(order_status['order_status_id']):
    print("Both columns are identical in content and order.")
else:
    print("The columns are not identical in content or order.")

The columns are not identical in content or order.


This is because we have filled the missing values with 0 so both the column unique values are not the same. SO merging will not happen properly.

In [77]:
# Remove the extra value from order_data
order_data_2 = order_data_1[order_data_1['status_id'] != 0].copy()

# Re-check for equality
if order_data_2['status_id'].isin(order_status['order_status_id']).all():
    print("The columns now match for valid values.")
else:
    print("Some valid values still do not match.")


The columns now match for valid values.


In [80]:
# Check for mismatched or extra values
extra_in_order_data = set(order_data_2['status_id']) - set(order_status['order_status_id'])
extra_in_order_status = set(order_status['order_status_id']) - set(order_data_2['status_id'])

print(f"Extra values in order_data['status_id']: {extra_in_order_data}")
print(f"Extra values in order_status['order_status_id']: {extra_in_order_status}")


Extra values in order_data['status_id']: set()
Extra values in order_status['order_status_id']: set()


In [81]:
order_data_2['status_id'].unique()

array([1, 2, 3, 4, 5, 6])

In [82]:
# Filter order_data to keep only matching status_id values
order_data_3 = order_data_2[order_data_2['status_id'].isin(order_status['order_status_id'])]
order_data_3.shape

(22349, 17)

In [83]:
order_data_4 = pd.merge(order_data_3, order_status, left_on='status_id',right_on='order_status_id', how='left')
order_data_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22349 entries, 0 to 22348
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            22349 non-null  int64         
 1   order_date          22349 non-null  datetime64[ns]
 2   customer_id         22349 non-null  int64         
 3   shipping_method_id  22349 non-null  int64         
 4   dest_address_id     22349 non-null  int64         
 5   order_day           22349 non-null  int32         
 6   order_month         22349 non-null  int32         
 7   order_year          22349 non-null  int32         
 8   order_month_name    22349 non-null  object        
 9   method_name         22349 non-null  object        
 10  cost                22349 non-null  float64       
 11  history_id          22349 non-null  int64         
 12  status_id           22349 non-null  int64         
 13  status_date         22349 non-null  datetime64

In [84]:
# checking shape

order_data_4.shape,order_line.shape

((22349, 19), (15400, 4))

In [193]:
order_line.columns

Index(['line_id', 'order_id', 'book_id', 'price'], dtype='object')

In [196]:
# Checking for null values before merging:

order_data_4.isnull().sum()

order_id              0
order_date            0
customer_id           0
shipping_method_id    0
dest_address_id       0
order_day             0
order_month           0
order_year            0
order_month_name      0
method_name           0
cost                  0
history_id            0
status_id             0
status_date           0
hist_status_day       0
hist_status_month     0
hist_status_year      0
order_status_id       0
status_value          0
dtype: int64

In [195]:
# checking for null before merging:

order_line['order_id'].isnull().sum()

np.int64(0)

In [197]:
order_data_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22349 entries, 0 to 22348
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            22349 non-null  int64         
 1   order_date          22349 non-null  datetime64[ns]
 2   customer_id         22349 non-null  int64         
 3   shipping_method_id  22349 non-null  int64         
 4   dest_address_id     22349 non-null  int64         
 5   order_day           22349 non-null  int32         
 6   order_month         22349 non-null  int32         
 7   order_year          22349 non-null  int32         
 8   order_month_name    22349 non-null  object        
 9   method_name         22349 non-null  object        
 10  cost                22349 non-null  float64       
 11  history_id          22349 non-null  int64         
 12  status_id           22349 non-null  int64         
 13  status_date         22349 non-null  datetime64

In [85]:
# Merging the order order 4 with order line:

order_data_5 = pd.merge(order_data_4, order_line, on='order_id', how='left')

order_data_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45615 entries, 0 to 45614
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            45615 non-null  int64         
 1   order_date          45615 non-null  datetime64[ns]
 2   customer_id         45615 non-null  int64         
 3   shipping_method_id  45615 non-null  int64         
 4   dest_address_id     45615 non-null  int64         
 5   order_day           45615 non-null  int32         
 6   order_month         45615 non-null  int32         
 7   order_year          45615 non-null  int32         
 8   order_month_name    45615 non-null  object        
 9   method_name         45615 non-null  object        
 10  cost                45615 non-null  float64       
 11  history_id          45615 non-null  int64         
 12  status_id           45615 non-null  int64         
 13  status_date         45615 non-null  datetime64

In [86]:
# checking shape after merging:

print(f"Shape of order_data_5 table : {order_data_5.shape}")

Shape of order_data_5 table : (45615, 22)


In [None]:
#print(f"Shape of Order_status table : {order_status.shape}\nShape of order_data_5 table : {order_data_5.shape}")

In [87]:
# defining function to check null, shape and info:
def basic_check(df,name):
    print("-"*50)
    print(f"Shape for {name} : {df.shape}")
    print("-"*50)
    print(f"Column name for {name} : {df.columns}")
    print("-"*50)


In [237]:
# Checking for shape and columns

basic_check(order_data_5,"order_data_5")

--------------------------------------------------
Shape for order_data_5 : (45615, 22)
--------------------------------------------------
Column name for order_data_5 : Index(['order_id', 'order_date', 'customer_id', 'shipping_method_id',
       'dest_address_id', 'order_day', 'order_month', 'order_year',
       'order_month_name', 'method_name', 'cost', 'history_id', 'status_id',
       'status_date', 'hist_status_day', 'hist_status_month',
       'hist_status_year', 'order_status_id', 'status_value', 'line_id',
       'book_id', 'price'],
      dtype='object')
--------------------------------------------------
unique of order_data_5 : order_id               7549
order_date             7547
customer_id            1701
shipping_method_id        4
dest_address_id         726
order_day                31
order_month              12
order_year                4
order_month_name         12
method_name               4
cost                      4
history_id            22349
status_id         

In [232]:
order_data_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45615 entries, 0 to 45614
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            45615 non-null  int64         
 1   order_date          45615 non-null  datetime64[ns]
 2   customer_id         45615 non-null  int64         
 3   shipping_method_id  45615 non-null  int64         
 4   dest_address_id     45615 non-null  int64         
 5   order_day           45615 non-null  int32         
 6   order_month         45615 non-null  int32         
 7   order_year          45615 non-null  int32         
 8   order_month_name    45615 non-null  object        
 9   method_name         45615 non-null  object        
 10  cost                45615 non-null  float64       
 11  history_id          45615 non-null  int64         
 12  status_id           45615 non-null  int64         
 13  status_date         45615 non-null  datetime64

In [233]:
# checking for null values after merging:

order_data_5.isnull().sum()

order_id              0
order_date            0
customer_id           0
shipping_method_id    0
dest_address_id       0
order_day             0
order_month           0
order_year            0
order_month_name      0
method_name           0
cost                  0
history_id            0
status_id             0
status_date           0
hist_status_day       0
hist_status_month     0
hist_status_year      0
order_status_id       0
status_value          0
line_id               0
book_id               0
price                 0
dtype: int64

customer : ['customer_id', 'first_name', 'last_name', 'email']
--------------------------------------------------
cust_order : ['order_id', 'order_date', 'customer_id', 'shipping_method_id', 'dest_address_id']
--------------------------------------------------
customer_address_id : ['customer_id', 'address_id', 'status_id']
--------------------------------------------------
customer_address_details : ['address_id', 'street_number', 'street_name', 'city', 'country_id']
--------------------------------------------------
address_status : ['status_id', 'address_status']
--------------------------------------------------
order_history : ['history_id', 'order_id', 'status_id', 'status_date']
--------------------------------------------------
order_line : ['line_id', 'order_id', 'book_id', 'price']
--------------------------------------------------
order_status : ['status_id', 'status_value']
--------------------------------------------------
shipping_method : ['method_id', 'method_name', 'cost']
--------------------------------------------------
book : ['book_id', 'title', 'isbn13', 'language_id', 'num_pages', 'publication_date', 'publisher_id']
--------------------------------------------------
book_author_id : ['book_id', 'author_id']
--------------------------------------------------
book_author_name : ['author_id', 'author_name']
--------------------------------------------------
book_language : ['language_id', 'language_code', 'language_name']
--------------------------------------------------
publisher : ['publisher_id', 'publisher_name']
--------------------------------------------------

In [240]:
# basic check:

basic_check(book,"book")
basic_check(book_author_id,"book_author_id")
basic_check(book_author_name,"book_author_name")
basic_check(publisher,"publisher")

--------------------------------------------------
Shape for book : (11127, 10)
--------------------------------------------------
Column name for book : Index(['book_id', 'title', 'isbn13', 'language_id', 'num_pages',
       'publication_date', 'publisher_id', 'publish_day', 'publish_month',
       'publish_year'],
      dtype='object')
--------------------------------------------------
--------------------------------------------------
Shape for book_author_id : (17642, 2)
--------------------------------------------------
Column name for book_author_id : Index(['book_id', 'author_id'], dtype='object')
--------------------------------------------------
--------------------------------------------------
Shape for book_author_name : (9235, 2)
--------------------------------------------------
Column name for book_author_name : Index(['author_id', 'author_name'], dtype='object')
--------------------------------------------------
--------------------------------------------------
Shape f

In [247]:
# checking if column has 0 in the id

print(book[book['book_id'] == 0].count())
print(book_author_id[book_author_id['book_id'] == 0].count())

book_id             0
title               0
isbn13              0
language_id         0
num_pages           0
publication_date    0
publisher_id        0
publish_day         0
publish_month       0
publish_year        0
dtype: int64
book_id      0
author_id    0
dtype: int64


In [88]:
# Merging book and book author table together:

book_data = pd.merge(book,book_author_id,how='left',on='book_id')
book_data.dtypes

book_id                      int64
title                       object
isbn13                       int64
language_id                  int64
num_pages                    int64
publication_date    datetime64[ns]
publisher_id                 int64
publish_day                  int32
publish_month                int32
publish_year                 int32
author_id                  float64
dtype: object

In [251]:
# checking shape after merging:

basic_check(book_data,"Book_data")

--------------------------------------------------
Shape for Book_data : (18711, 11)
--------------------------------------------------
Column name for Book_data : Index(['book_id', 'title', 'isbn13', 'language_id', 'num_pages',
       'publication_date', 'publisher_id', 'publish_day', 'publish_month',
       'publish_year', 'author_id'],
      dtype='object')
--------------------------------------------------


In [252]:
book_data.isnull().sum()

book_id                0
title                  0
isbn13                 0
language_id            0
num_pages              0
publication_date       0
publisher_id           0
publish_day            0
publish_month          0
publish_year           0
author_id           1069
dtype: int64

In [254]:
book_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18711 entries, 0 to 18710
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   book_id           18711 non-null  int64         
 1   title             18711 non-null  object        
 2   isbn13            18711 non-null  int64         
 3   language_id       18711 non-null  int64         
 4   num_pages         18711 non-null  int64         
 5   publication_date  18711 non-null  datetime64[ns]
 6   publisher_id      18711 non-null  int64         
 7   publish_day       18711 non-null  int32         
 8   publish_month     18711 non-null  int32         
 9   publish_year      18711 non-null  int32         
 10  author_id         17642 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(3), int64(5), object(1)
memory usage: 1.4+ MB


In [89]:
# Filling the missing values of order id with 0

book_data['author_id'] = book_data['author_id'].fillna(0).astype(int)


In [256]:
book_data.isnull().sum()

book_id             0
title               0
isbn13              0
language_id         0
num_pages           0
publication_date    0
publisher_id        0
publish_day         0
publish_month       0
publish_year        0
author_id           0
dtype: int64

In [257]:
book_data.dtypes

book_id                      int64
title                       object
isbn13                       int64
language_id                  int64
num_pages                    int64
publication_date    datetime64[ns]
publisher_id                 int64
publish_day                  int32
publish_month                int32
publish_year                 int32
author_id                    int64
dtype: object

In [90]:
# Basic check before merging book_data and book_author_name table :

basic_check(book_data,"book_data")

basic_check(book_author_name,"book_author_name")


--------------------------------------------------
Shape for book_data : (18711, 11)
--------------------------------------------------
Column name for book_data : Index(['book_id', 'title', 'isbn13', 'language_id', 'num_pages',
       'publication_date', 'publisher_id', 'publish_day', 'publish_month',
       'publish_year', 'author_id'],
      dtype='object')
--------------------------------------------------
--------------------------------------------------
Shape for book_author_name : (9235, 2)
--------------------------------------------------
Column name for book_author_name : Index(['author_id', 'author_name'], dtype='object')
--------------------------------------------------


In [259]:
book_author_name.isnull().sum()

author_id      0
author_name    0
dtype: int64

In [91]:
# Merging book_data and book_author_name table:

book_data = pd.merge(book_data,book_author_name,how='left',on='author_id')



In [262]:
print(book_data.shape)
print(book_data.info())

(18711, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18711 entries, 0 to 18710
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   book_id           18711 non-null  int64         
 1   title             18711 non-null  object        
 2   isbn13            18711 non-null  int64         
 3   language_id       18711 non-null  int64         
 4   num_pages         18711 non-null  int64         
 5   publication_date  18711 non-null  datetime64[ns]
 6   publisher_id      18711 non-null  int64         
 7   publish_day       18711 non-null  int32         
 8   publish_month     18711 non-null  int32         
 9   publish_year      18711 non-null  int32         
 10  author_id         18711 non-null  int64         
 11  author_name       17642 non-null  object        
dtypes: datetime64[ns](1), int32(3), int64(6), object(2)
memory usage: 1.5+ MB
None


In [263]:
book_data.isnull().sum()

book_id                0
title                  0
isbn13                 0
language_id            0
num_pages              0
publication_date       0
publisher_id           0
publish_day            0
publish_month          0
publish_year           0
author_id              0
author_name         1069
dtype: int64

In [92]:
# Filling missing values in author name:

book_data['author_name'] = book_data['author_name'].fillna("No author")

In [269]:
book_data.isnull().sum()

book_id             0
title               0
isbn13              0
language_id         0
num_pages           0
publication_date    0
publisher_id        0
publish_day         0
publish_month       0
publish_year        0
author_id           0
author_name         0
dtype: int64

In [271]:
basic_check(book_data,"book_data")
basic_check(book_language,"book_language")

--------------------------------------------------
Shape for book_data : (18711, 12)
--------------------------------------------------
Column name for book_data : Index(['book_id', 'title', 'isbn13', 'language_id', 'num_pages',
       'publication_date', 'publisher_id', 'publish_day', 'publish_month',
       'publish_year', 'author_id', 'author_name'],
      dtype='object')
--------------------------------------------------
--------------------------------------------------
Shape for book_language : (27, 3)
--------------------------------------------------
Column name for book_language : Index(['language_id', 'language_code', 'language_name'], dtype='object')
--------------------------------------------------


In [274]:
book_language.isnull().sum()

language_id      0
language_code    0
language_name    0
dtype: int64

In [93]:
# Merging book_data and language table :

book_data = pd.merge(book_data,book_language,how='left',on='language_id')

print(book_data.shape)
print(book_data.info())


(18711, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18711 entries, 0 to 18710
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   book_id           18711 non-null  int64         
 1   title             18711 non-null  object        
 2   isbn13            18711 non-null  int64         
 3   language_id       18711 non-null  int64         
 4   num_pages         18711 non-null  int64         
 5   publication_date  18711 non-null  datetime64[ns]
 6   publisher_id      18711 non-null  int64         
 7   publish_day       18711 non-null  int32         
 8   publish_month     18711 non-null  int32         
 9   publish_year      18711 non-null  int32         
 10  author_id         18711 non-null  int64         
 11  author_name       18711 non-null  object        
 12  language_code     18711 non-null  object        
 13  language_name     18711 non-null  object        
dtypes: datetim

In [277]:
book_data.isnull().sum()

book_id             0
title               0
isbn13              0
language_id         0
num_pages           0
publication_date    0
publisher_id        0
publish_day         0
publish_month       0
publish_year        0
author_id           0
author_name         0
language_code       0
language_name       0
dtype: int64

In [278]:
basic_check(book_data,"book_data")
basic_check(publisher,"publisher")

--------------------------------------------------
Shape for book_data : (18711, 14)
--------------------------------------------------
Column name for book_data : Index(['book_id', 'title', 'isbn13', 'language_id', 'num_pages',
       'publication_date', 'publisher_id', 'publish_day', 'publish_month',
       'publish_year', 'author_id', 'author_name', 'language_code',
       'language_name'],
      dtype='object')
--------------------------------------------------
--------------------------------------------------
Shape for publisher : (2264, 2)
--------------------------------------------------
Column name for publisher : Index(['publisher_id', 'publisher_name'], dtype='object')
--------------------------------------------------


In [94]:
book_data = pd.merge(book_data,publisher,how='left',on='publisher_id')
print(book_data.shape)
print(book_data.info())


(18711, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18711 entries, 0 to 18710
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   book_id           18711 non-null  int64         
 1   title             18711 non-null  object        
 2   isbn13            18711 non-null  int64         
 3   language_id       18711 non-null  int64         
 4   num_pages         18711 non-null  int64         
 5   publication_date  18711 non-null  datetime64[ns]
 6   publisher_id      18711 non-null  int64         
 7   publish_day       18711 non-null  int32         
 8   publish_month     18711 non-null  int32         
 9   publish_year      18711 non-null  int32         
 10  author_id         18711 non-null  int64         
 11  author_name       18711 non-null  object        
 12  language_code     18711 non-null  object        
 13  language_name     18711 non-null  object        
 14  publisher_

In [280]:
print(book_data.isnull().sum())


book_id             0
title               0
isbn13              0
language_id         0
num_pages           0
publication_date    0
publisher_id        0
publish_day         0
publish_month       0
publish_year        0
author_id           0
author_name         0
language_code       0
language_name       0
publisher_name      0
dtype: int64


In [282]:
print(customer_data.columns)
print(order_data_5.columns)
print(book_data.columns)

Index(['customer_id', 'first_name', 'last_name', 'email', 'address_id',
       'address_status_id', 'street_number', 'street_name', 'city',
       'country_id', 'address_status'],
      dtype='object')
Index(['order_id', 'order_date', 'customer_id', 'shipping_method_id',
       'dest_address_id', 'order_day', 'order_month', 'order_year',
       'order_month_name', 'method_name', 'cost', 'history_id', 'status_id',
       'status_date', 'hist_status_day', 'hist_status_month',
       'hist_status_year', 'order_status_id', 'status_value', 'line_id',
       'book_id', 'price'],
      dtype='object')
Index(['book_id', 'title', 'isbn13', 'language_id', 'num_pages',
       'publication_date', 'publisher_id', 'publish_day', 'publish_month',
       'publish_year', 'author_id', 'author_name', 'language_code',
       'language_name', 'publisher_name'],
      dtype='object')


In [283]:
print(customer_data.isnull().sum())
print(order_data_5.isnull().sum())
print(book_data.isnull().sum())

customer_id          0
first_name           0
last_name            0
email                0
address_id           0
address_status_id    0
street_number        0
street_name          0
city                 0
country_id           0
address_status       0
dtype: int64
order_id              0
order_date            0
customer_id           0
shipping_method_id    0
dest_address_id       0
order_day             0
order_month           0
order_year            0
order_month_name      0
method_name           0
cost                  0
history_id            0
status_id             0
status_date           0
hist_status_day       0
hist_status_month     0
hist_status_year      0
order_status_id       0
status_value          0
line_id               0
book_id               0
price                 0
dtype: int64
book_id             0
title               0
isbn13              0
language_id         0
num_pages           0
publication_date    0
publisher_id        0
publish_day         0
publish_month    

In [286]:
print(order_data_5.shape)
print(customer_data.shape)
print(book_data.shape)

(45615, 22)
(3350, 11)
(18711, 15)


In [96]:
# creating a master file by merging customer,order and book table:

master_data = pd.merge(order_data_5, customer_data, on='customer_id', how='left')

print(master_data.shape)
print(master_data.info())


(94748, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94748 entries, 0 to 94747
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            94748 non-null  int64         
 1   order_date          94748 non-null  datetime64[ns]
 2   customer_id         94748 non-null  int64         
 3   shipping_method_id  94748 non-null  int64         
 4   dest_address_id     94748 non-null  int64         
 5   order_day           94748 non-null  int32         
 6   order_month         94748 non-null  int32         
 7   order_year          94748 non-null  int32         
 8   order_month_name    94748 non-null  object        
 9   method_name         94748 non-null  object        
 10  cost                94748 non-null  float64       
 11  history_id          94748 non-null  int64         
 12  status_id           94748 non-null  int64         
 13  status_date         94748 non-null

In [98]:
# creating a master file by merging customer,order and book table:

master_data = pd.merge(customer_data, order_data_5,on='customer_id', how='left')

print(master_data.shape)
print(master_data.info())


(95106, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95106 entries, 0 to 95105
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   customer_id         95106 non-null  int64         
 1   first_name          95106 non-null  object        
 2   last_name           95106 non-null  object        
 3   email               95106 non-null  object        
 4   address_id          95106 non-null  int64         
 5   address_status_id   95106 non-null  int64         
 6   street_number       95106 non-null  int64         
 7   street_name         95106 non-null  object        
 8   city                95106 non-null  object        
 9   country_id          95106 non-null  int64         
 10  address_status      95106 non-null  object        
 11  order_id            94748 non-null  float64       
 12  order_date          94748 non-null  datetime64[ns]
 13  shipping_method_id  94748 non-null

In [99]:
master_data.customer_id.nunique()

2000

In [100]:
master_data.isnull().sum()

customer_id             0
first_name              0
last_name               0
email                   0
address_id              0
address_status_id       0
street_number           0
street_name             0
city                    0
country_id              0
address_status          0
order_id              358
order_date            358
shipping_method_id    358
dest_address_id       358
order_day             358
order_month           358
order_year            358
order_month_name      358
method_name           358
cost                  358
history_id            358
status_id             358
status_date           358
hist_status_day       358
hist_status_month     358
hist_status_year      358
order_status_id       358
status_value          358
line_id               358
book_id               358
price                 358
dtype: int64

In [101]:
# Create flag columns for missing values
master_data['price_missing'] = master_data['price'].isna().astype(int)
master_data['cost_missing'] = master_data['cost'].isna().astype(int)


In [141]:
master_data[master_data.price_missing == 0]

Unnamed: 0,customer_id,first_name,last_name,email,address_id,address_status_id,street_number,street_name,city,country_id,address_status,order_id,order_date,shipping_method_id,dest_address_id,order_day,order_month,order_year,order_month_name,method_name,cost,history_id,status_id,status_date,hist_status_day,hist_status_month,hist_status_year,order_status_id,status_value,line_id,book_id,price,price_missing,cost_missing,title,isbn13,language_id,num_pages,publication_date,publisher_id,publish_day,publish_month,publish_year,author_id,author_name,language_code,language_name,publisher_name
0,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07 20:03:21,4,359,7,2,2023,february,international,24,1212,1,2023-02-07 21:20:42,7,2.0,2023,1,order received,1,6476,13,0,0,1213,9780712664561,1,640,2004-02-05,1539,5,2,2004,231,alison weir,eng,english,pimlico
1,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07 20:03:21,4,359,7,2,2023,february,international,24,1212,1,2023-02-07 21:20:42,7,2.0,2023,1,order received,11109,735,3,0,0,1213,9780064403368,1,288,2001-05-08,882,8,5,2001,5112,laurence yep,eng,english,harpercollins
2,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07 20:03:21,4,359,7,2,2023,february,international,24,1212,1,2023-02-07 21:20:42,7,2.0,2023,1,order received,16232,611,2,0,0,1213,9780061015618,1,384,2003-10-28,160,28,10,2003,8066,shirley rousseau murphy,eng,english,avon
3,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07 20:03:21,4,359,7,2,2023,february,international,24,12860,2,2023-02-08 05:30:06,8,2.0,2023,2,pending delivery,1,6476,13,0,0,1213,9780712664561,1,640,2004-02-05,1539,5,2,2004,231,alison weir,eng,english,pimlico
4,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07 20:03:21,4,359,7,2,2023,february,international,24,12860,2,2023-02-08 05:30:06,8,2.0,2023,2,pending delivery,11109,735,3,0,0,1213,9780064403368,1,288,2001-05-08,882,8,5,2001,5112,laurence yep,eng,english,harpercollins
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160337,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22 13:39:14,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,2951,george saunders,eng,english,penguin books
160338,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22 13:39:14,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,3190,hari kunzru,eng,english,penguin books
160339,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22 13:39:14,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,6461,nick hornby,eng,english,penguin books
160340,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22 13:39:14,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,8675,toby litt,eng,english,penguin books


In [106]:
# Filling missing values:

master_data['order_id'] = master_data['order_id'].fillna(0).astype(int)
master_data['shipping_method_id'] = master_data['shipping_method_id'].fillna(0).astype(int)

master_data['dest_address_id'] = master_data['dest_address_id'].fillna(0).astype(int)
master_data['order_day'] = master_data['order_day'].fillna(0).astype(int)
master_data['order_month'] = master_data['order_month'].fillna(0).astype(int)
master_data['order_year'] = master_data['order_year'].fillna(0).astype(int)
master_data['order_month_name'] = master_data['order_month_name'].fillna('No order')
master_data['order_year'] = master_data['order_year'].fillna(0).astype(int)
master_data['method_name'] = master_data['method_name'].fillna('unknown')
master_data['cost'] = master_data['cost'].fillna(0).astype(int)
master_data['history_id'] = master_data['history_id'].fillna(0).astype(int)
master_data['status_id'] = master_data['status_id'].fillna(0).astype(int)
master_data['hist_status_day'] = master_data['hist_status_day'].fillna(0).astype(int)
master_data['hist_status_month'] = master_data['hist_status_month'].fillna('No History')
master_data['hist_status_year'] = master_data['hist_status_year'].fillna(0).astype(int)
master_data['order_status_id'] = master_data['order_status_id'].fillna(0).astype(int)
master_data['status_value'] = master_data['status_value'].fillna('No order status')
master_data['line_id'] = master_data['line_id'].fillna(0).astype(int)
master_data['book_id'] = master_data['book_id'].fillna(0).astype(int)
master_data['price'] = master_data['price'].fillna(0).astype(int)


In [108]:
# Filling date columns:

# Fill missing dates with a default value
master_data['order_date'] = master_data['order_date'].fillna('1900-01-01')
master_data['status_date'] = master_data['status_date'].fillna('1900-01-01')

In [143]:
master_data.dtypes

customer_id                    int64
first_name                    object
last_name                     object
email                         object
address_id                     int64
address_status_id              int64
street_number                  int64
street_name                   object
city                          object
country_id                     int64
address_status                object
order_id                       int64
order_date            datetime64[ns]
shipping_method_id             int64
dest_address_id                int64
order_day                      int64
order_month                    int64
order_year                     int64
order_month_name              object
method_name                   object
cost                           int64
history_id                     int64
status_id                      int64
status_date           datetime64[ns]
hist_status_day                int64
hist_status_month             object
hist_status_year               int64
o

In [144]:
def check_null(x):
    null_col = []

    for col in x.columns:
        if x[col].isnull().sum() != 0:
            null_col.append(col)
                
    if null_col:
        print(f"Null values found in : {null_col}")
    else:
        print("No null values found") 

In [113]:
check_null(master_data)
        

No null values found


In [145]:
master_data.shape

(160342, 48)

In [116]:
master_data = pd.merge(master_data, book_data, on='book_id', how='left')

print(master_data.shape)
print(master_data.info())

(160342, 48)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160342 entries, 0 to 160341
Data columns (total 48 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   customer_id         160342 non-null  int64         
 1   first_name          160342 non-null  object        
 2   last_name           160342 non-null  object        
 3   email               160342 non-null  object        
 4   address_id          160342 non-null  int64         
 5   address_status_id   160342 non-null  int64         
 6   street_number       160342 non-null  int64         
 7   street_name         160342 non-null  object        
 8   city                160342 non-null  object        
 9   country_id          160342 non-null  int64         
 10  address_status      160342 non-null  object        
 11  order_id            160342 non-null  int64         
 12  order_date          160342 non-null  datetime64[ns]
 13  shipping_method_

In [127]:
check_null(master_data)

Null values found in : ['title', 'isbn13', 'language_id', 'num_pages', 'publication_date', 'publisher_id', 'publish_day', 'publish_month', 'publish_year', 'author_id', 'author_name', 'language_code', 'language_name', 'publisher_name']


In [133]:
# Filling missing values:

master_data['title'] = master_data['order_id'].fillna('No title')
master_data['isbn13'] = master_data['isbn13'].fillna(0).astype(int)

master_data['language_id'] = master_data['language_id'].fillna(0).astype(int)
master_data['num_pages'] = master_data['num_pages'].fillna(0).astype(int)
master_data['publisher_id'] = master_data['publisher_id'].fillna(0).astype(int)
master_data['publish_day'] = master_data['publish_day'].fillna(0).astype(int)
master_data['publish_month'] = master_data['publish_month'].fillna(0).astype(int)
master_data['publish_year'] = master_data['publish_year'].fillna(0).astype(int)
master_data['author_id'] = master_data['author_id'].fillna(0).astype(int)
master_data['author_name'] = master_data['author_name'].fillna('No Name')
master_data['language_code'] = master_data['language_code'].fillna('No code')
master_data['language_name'] = master_data['language_name'].fillna('No Language')
master_data['publisher_name'] = master_data['publisher_name'].fillna('No Pub name')



In [136]:
# FIlling date columns:

master_data['publication_date'] = master_data['publication_date'].fillna('1900-01-01')

In [137]:
check_null(master_data)

No null values found


In [138]:
pd.set_option('display.max_columns',None)

In [139]:
master_data.sample(5)

Unnamed: 0,customer_id,first_name,last_name,email,address_id,address_status_id,street_number,street_name,city,country_id,address_status,order_id,order_date,shipping_method_id,dest_address_id,order_day,order_month,order_year,order_month_name,method_name,cost,history_id,status_id,status_date,hist_status_day,hist_status_month,hist_status_year,order_status_id,status_value,line_id,book_id,price,price_missing,cost_missing,title,isbn13,language_id,num_pages,publication_date,publisher_id,publish_day,publish_month,publish_year,author_id,author_name,language_code,language_name,publisher_name
96369,1193,cassaundra,passey,cpassey5c@csmonitor.com,640,1,267,lien park,bečej,186,active,4251,2023-08-03 23:58:03,3,47,3,8,2023,august,express,11,24714,5,2023-08-05 00:19:19,5,8.0,2023,5,cancelled,4496,598,0,0,0,4251,9780060977085,2,352,2002-04-02,873,2,4,2002,7331,richard powers,en-us,united states english,harper perennial
10290,108,orv,strover,ostrover2z@yelp.com,85,1,217,5th street,panjiang,42,active,2069,2022-02-14 04:56:06,2,627,14,2,2022,february,priority,8,2068,1,2022-02-14 09:50:11,14,2.0,2022,1,order received,445,6481,1,0,0,2069,9780713912548,5,467,1979-06-01,66,1,6,1979,0,No author,en-gb,british english,allen lane
49436,616,culver,seys,cseysh3@tiny.cc,693,1,7735,claremont point,comrat,129,active,6901,2024-10-18 03:56:40,3,230,18,10,2024,october,express,11,21996,4,2024-10-26 11:16:14,26,10.0,2024,4,delivered,13712,8799,7,0,0,6901,9781400077090,1,272,2005-07-12,86,12,7,2005,175,alexander mccall smith,eng,english,anchor
142359,1806,ellsworth,philpots,ephilpotsmd@guardian.co.uk,529,1,44304,fair oaks pass,telhado,164,active,9268,2024-03-24 15:59:11,4,17,24,3,2024,march,international,24,6155,1,2024-03-25 03:31:14,25,3.0,2024,1,order received,11519,6631,13,0,0,9268,9780743223850,1,368,2002-06-04,740,4,6,2002,4430,john s.d. eisenhower,eng,english,free press
123389,1559,mose,pover,mpoverfi@ifeng.com,440,1,3964,hazelcrest pass,lansing,217,active,1473,2022-12-01 01:00:43,2,440,1,12,2022,december,priority,8,1472,1,2022-12-01 01:20:11,1,12.0,2022,1,order received,5883,6167,18,0,0,1473,9780679766742,1,688,1997-11-11,2113,11,11,1997,213,alice munro,eng,english,vintage


In [146]:
master_data.dtypes

customer_id                    int64
first_name                    object
last_name                     object
email                         object
address_id                     int64
address_status_id              int64
street_number                  int64
street_name                   object
city                          object
country_id                     int64
address_status                object
order_id                       int64
order_date            datetime64[ns]
shipping_method_id             int64
dest_address_id                int64
order_day                      int64
order_month                    int64
order_year                     int64
order_month_name              object
method_name                   object
cost                           int64
history_id                     int64
status_id                      int64
status_date           datetime64[ns]
hist_status_day                int64
hist_status_month             object
hist_status_year               int64
o

In [147]:
#Converting object column to cat column

master_data['method_name'] = master_data['method_name'].astype('category')
master_data['status_value'] = master_data['status_value'].astype('category')
master_data['address_status'] = master_data['address_status'].astype('category')
master_data['language_name'] = master_data['language_name'].astype('category')
master_data['publisher_name'] = master_data['publisher_name'].astype('category')
master_data['language_code'] = master_data['language_code'].astype('category')

**Feature engineering**

In [148]:
# Adding column has order or not

master_data_1 = master_data.copy()


In [149]:
master_data_1.columns

Index(['customer_id', 'first_name', 'last_name', 'email', 'address_id',
       'address_status_id', 'street_number', 'street_name', 'city',
       'country_id', 'address_status', 'order_id', 'order_date',
       'shipping_method_id', 'dest_address_id', 'order_day', 'order_month',
       'order_year', 'order_month_name', 'method_name', 'cost', 'history_id',
       'status_id', 'status_date', 'hist_status_day', 'hist_status_month',
       'hist_status_year', 'order_status_id', 'status_value', 'line_id',
       'book_id', 'price', 'price_missing', 'cost_missing', 'title', 'isbn13',
       'language_id', 'num_pages', 'publication_date', 'publisher_id',
       'publish_day', 'publish_month', 'publish_year', 'author_id',
       'author_name', 'language_code', 'language_name', 'publisher_name'],
      dtype='object')

In [151]:
master_data_1['order_date']

0        2023-02-07 20:03:21
1        2023-02-07 20:03:21
2        2023-02-07 20:03:21
3        2023-02-07 20:03:21
4        2023-02-07 20:03:21
                 ...        
160337   2024-05-22 13:39:14
160338   2024-05-22 13:39:14
160339   2024-05-22 13:39:14
160340   2024-05-22 13:39:14
160341   2024-05-22 13:39:14
Name: order_date, Length: 160342, dtype: datetime64[ns]

In [152]:
master_data_1['order_date'] = master_data_1['order_date'].dt.date
master_data_1['order_date']

0         2023-02-07
1         2023-02-07
2         2023-02-07
3         2023-02-07
4         2023-02-07
             ...    
160337    2024-05-22
160338    2024-05-22
160339    2024-05-22
160340    2024-05-22
160341    2024-05-22
Name: order_date, Length: 160342, dtype: object

In [325]:
from datetime import datetime

datetime.now().date()

datetime.date(2024, 12, 22)

In [153]:
# Adding column to find the last order date.


master_data_1['last_order_date'] = master_data_1.groupby('customer_id')['order_date'].transform('max')



In [154]:
#COnverting to date time type

master_data_1['last_order_date'] = pd.to_datetime(master_data_1['last_order_date'])
master_data_1['last_order_date'].dtype

dtype('<M8[ns]')

In [155]:
# Adding column to find number of days from last order:

from datetime import datetime

today_date =pd.to_datetime(datetime.now().date())

master_data_1['recency'] = (today_date - master_data_1['last_order_date']).dt.days

master_data_1['recency'].dtype

dtype('int64')

In [156]:
master_data_1[master_data_1['price_missing'] == 1]

Unnamed: 0,customer_id,first_name,last_name,email,address_id,address_status_id,street_number,street_name,city,country_id,address_status,order_id,order_date,shipping_method_id,dest_address_id,order_day,order_month,order_year,order_month_name,method_name,cost,history_id,status_id,status_date,hist_status_day,hist_status_month,hist_status_year,order_status_id,status_value,line_id,book_id,price,price_missing,cost_missing,title,isbn13,language_id,num_pages,publication_date,publisher_id,publish_day,publish_month,publish_year,author_id,author_name,language_code,language_name,publisher_name,last_order_date,recency
717,9,phebe,curdell,pcurdell8@usa.gov,917,1,3,dorton place,prnjavor,186,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
1000,12,filmer,douse,fdouseb@foxnews.com,833,1,614,duke street,stavanger,151,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
1251,16,debbi,huyghe,dhuyghef@dot.gov,984,1,9,vahlen way,qilin,42,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
2240,24,raul,pentelow,rpentelown@zimbio.com,905,1,91,lighthouse bay parkway,yunxi,42,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
2435,28,rob,handes,rhandesr@arstechnica.com,945,1,5,little fleur park,new glasgow,37,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158239,1973,horatia,haig,hhaigr0@globo.com,902,1,1275,american ash way,bojawa,92,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
158240,1973,horatia,haig,hhaigr0@globo.com,962,1,6,school road,timrå,194,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
158259,1975,sherman,wenger,swengerr2@rakuten.co.jp,907,1,84,forster avenue,kaskinen,69,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647
158890,1984,trudey,itzhaki,titzhakirb@hao123.com,1000,1,503,canary crossing,jiangfeng,42,active,0,1900-01-01,0,0,0,0,0,No order,unknown,0,0,0,1900-01-01,0,No History,0,0,No order status,0,0,0,1,1,0,0,0,0,1900-01-01,0,0,0,0,0,No Name,No code,No Language,No Pub name,1900-01-01,45647


In [157]:
# adding column to find how much each customer has spent totally:

master_data_1['spending_of_customer'] = master_data_1.groupby('order_id')['price'].transform('sum')

In [158]:
today_date =datetime.now().date()
today_date

datetime.date(2024, 12, 23)

In [391]:
churn_threshold = (today_date - pd.DateOffset(months=3)).date()
churn_threshold

datetime.date(2024, 9, 22)

In [159]:
master_data_1['total_no_of_order'] = master_data_1.groupby('customer_id')['order_id'].transform('count')

In [164]:
import numpy as np

# Add a new column 'has_order' with 1 if order_id is not 0, else 0
master_data_1['has_order'] = np.where(master_data_1['order_id'] == 0, 0, 1)

In [165]:
master_data_1.duplicated().sum()

np.int64(0)

In [166]:
master_data_1.columns

Index(['customer_id', 'first_name', 'last_name', 'email', 'address_id',
       'address_status_id', 'street_number', 'street_name', 'city',
       'country_id', 'address_status', 'order_id', 'order_date',
       'shipping_method_id', 'dest_address_id', 'order_day', 'order_month',
       'order_year', 'order_month_name', 'method_name', 'cost', 'history_id',
       'status_id', 'status_date', 'hist_status_day', 'hist_status_month',
       'hist_status_year', 'order_status_id', 'status_value', 'line_id',
       'book_id', 'price', 'price_missing', 'cost_missing', 'title', 'isbn13',
       'language_id', 'num_pages', 'publication_date', 'publisher_id',
       'publish_day', 'publish_month', 'publish_year', 'author_id',
       'author_name', 'language_code', 'language_name', 'publisher_name',
       'last_order_date', 'recency', 'spending_of_customer',
       'total_no_of_order', 'has_order'],
      dtype='object')

In [177]:
from datetime import datetime
import pandas as pd

# Set today's date and the churn threshold
today_date = pd.to_datetime(datetime.now().date())
churn_threshold = today_date - pd.DateOffset(months=3)

# Function to calculate churn based on multiple criteria
def calculate_churn(row):
    # Check if order_id is missing or empty
    if pd.isna(row['order_id']) or row['order_id'] == 0:
        return 1  # Churned if no order_id
    
    # Check if last_order_date is more than 3 months ago
    if pd.to_datetime(row['last_order_date']) < churn_threshold:
        return 1  # Churned if last order was more than 3 months ago
    
    # Additional business logic can be added here (e.g., spending threshold, order count)
    # Example: if the spending in the last 3 months is below a certain threshold, consider churn
    if row['spending_of_customer'] < 20:  # Example threshold for spending
        return 1  # Churned if low spending
    
    # If none of the conditions are met, the customer is not churned
    return 0

# Apply the churn function to each row
master_data_1['Churn'] = master_data_1.apply(calculate_churn, axis=1)


In [178]:
master_data_1.Churn.value_counts()

Churn
1    95129
0    65213
Name: count, dtype: int64

In [409]:
master_data_1['Churn'].value_counts()

Churn
1    92992
0    66992
Name: count, dtype: int64

Churn data seems to be imbalanced data. Need to resample

In [179]:
import os

path =  os.path.join(os.getcwd(),"data",'Merged_data')
file  = "master_data.csv"

os.makedirs(path,exist_ok=True)

folder = os.path.join(path,file)


master_data_1.to_csv(folder,index=False)

In [180]:
pd.read_csv(r'D:\DS_Final_Project\Notebook\data\Merged_data\master_data.csv')

Unnamed: 0,customer_id,first_name,last_name,email,address_id,address_status_id,street_number,street_name,city,country_id,address_status,order_id,order_date,shipping_method_id,dest_address_id,order_day,order_month,order_year,order_month_name,method_name,cost,history_id,status_id,status_date,hist_status_day,hist_status_month,hist_status_year,order_status_id,status_value,line_id,book_id,price,price_missing,cost_missing,title,isbn13,language_id,num_pages,publication_date,publisher_id,publish_day,publish_month,publish_year,author_id,author_name,language_code,language_name,publisher_name,last_order_date,recency,spending_of_customer,total_no_of_order,has_order,Churn
0,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07,4,359,7,2,2023,february,international,24,1212,1,2023-02-07 21:20:42,7,2.0,2023,1,order received,1,6476,13,0,0,1213,9780712664561,1,640,2004-02-05,1539,5,2,2004,231,alison weir,eng,english,pimlico,2023-02-07,685,180,46,1,1
1,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07,4,359,7,2,2023,february,international,24,1212,1,2023-02-07 21:20:42,7,2.0,2023,1,order received,11109,735,3,0,0,1213,9780064403368,1,288,2001-05-08,882,8,5,2001,5112,laurence yep,eng,english,harpercollins,2023-02-07,685,180,46,1,1
2,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07,4,359,7,2,2023,february,international,24,1212,1,2023-02-07 21:20:42,7,2.0,2023,1,order received,16232,611,2,0,0,1213,9780061015618,1,384,2003-10-28,160,28,10,2003,8066,shirley rousseau murphy,eng,english,avon,2023-02-07,685,180,46,1,1
3,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07,4,359,7,2,2023,february,international,24,12860,2,2023-02-08 05:30:06,8,2.0,2023,2,pending delivery,1,6476,13,0,0,1213,9780712664561,1,640,2004-02-05,1539,5,2,2004,231,alison weir,eng,english,pimlico,2023-02-07,685,180,46,1,1
4,1,ursola,purdy,upurdy0@cdbaby.com,359,1,9923,merrick center,kiuruvesi,69,active,1213,2023-02-07,4,359,7,2,2023,february,international,24,12860,2,2023-02-08 05:30:06,8,2.0,2023,2,pending delivery,11109,735,3,0,0,1213,9780064403368,1,288,2001-05-08,882,8,5,2001,5112,laurence yep,eng,english,harpercollins,2023-02-07,685,180,46,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160337,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,2951,george saunders,eng,english,penguin books,2024-10-31,53,1539,366,1,0
160338,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,3190,hari kunzru,eng,english,penguin books,2024-10-31,53,1539,366,1,0
160339,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,6461,nick hornby,eng,english,penguin books,2024-10-31,53,1539,366,1,0
160340,2000,delora,bigglestone,dbigglestonerr@usatoday.com,693,1,7735,claremont point,comrat,129,active,10555,2024-05-22,4,99,22,5,2024,may,international,24,18029,3,2024-05-23 07:07:35,23,5.0,2024,3,delivery in progress,7550,1577,19,0,0,10555,9780143038184,1,304,2008-01-02,1476,2,1,2008,8675,toby litt,eng,english,penguin books,2024-10-31,53,1539,366,1,0
