# Data Description

This is a one-of-a-kind book sales dataset from Pakistan. It contains 200,000 book orders from January 2019 to January 2021. The data was collected from the merchant (Gufhtugu Publications, www.Gufhtugu.com), who is a partner in this research study.

**Please upvote if you find this notebook helpful! 😊 Thank you! I would also be very happy to receive feedback on my work.**

In [None]:
# --- standard library ---
import gc  # garbage collector to keep RAM in check
import os
import warnings

# --- data handling ---
import datatable as dt
import numpy as np
import pandas as pd

# --- plotting ---
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# --- misc ---
import missingno as msno

# Do not truncate rows or columns when a DataFrame is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Light-blue colormap for styled tables/plots.
colorMap = sns.light_palette("blue", as_cmap=True)

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Data Overview

In [None]:
# Shell out to `wc -l` to print the line count of each CSV file in the dataset folder.
!wc -l ../input/gufhtugu-publications-dataset-challenge/*.csv

### There are three datasets: GP Orders 2, 4 and 5. I am going to use the updated dataset (GP Orders - 5.csv), which has additional fields.

In [None]:
# Read the orders CSV with datatable, then convert it to a pandas DataFrame.
orders_csv_path = '/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv'
df_data = dt.fread(orders_csv_path).to_pandas()

# Preview the first ten orders.
df_data.head(10)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df_data.info()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df_data.shape

In [None]:
# Split "Order Date & Time" into separate date and time text columns.
# The split is done once for the whole column; `.str[i]` yields NaN when a
# value has no i-th token, so a missing or malformed timestamp no longer
# raises IndexError the way `str(x).split(' ')[1]` did.
order_timestamps = df_data['Order Date & Time']
order_timestamp_tokens = order_timestamps.map(str).str.split(' ')
df_data['Time'] = order_timestamp_tokens.str[1]
# Keep missing timestamps as missing instead of the literal text "nan"/"NaT".
df_data['Date'] = order_timestamp_tokens.str[0].where(order_timestamps.notna())

# Null Values Handling

In [None]:
# Show every row that has a missing value in at least one column.
df_data.loc[df_data.isna().any(axis='columns')]

In [None]:
# A column counts as "with nulls" if any of its values is missing.
print("Number of features with null values:", df_data.isna().any().sum())

In [None]:
# Rows whose order number repeats an earlier row's order number.
df_data.loc[df_data.duplicated(subset=['Order Number'])]

### There are no duplicate values

# Order Status

In [None]:
# Distinct order-status values present in the data.
pd.unique(df_data['Order Status'])

In [None]:
# Number of orders per status, most frequent first.
print(df_data.loc[:, 'Order Status'].value_counts())

In [None]:
# Histogram of how often each order status occurs.
px.histogram(
    data_frame=df_data,
    x='Order Status',
    title='Frequency of Order Status',
    width=600,
    height=400,
)

# Payment Method

In [None]:
# Merge the two spellings of cash-on-delivery into a single label.
payment_label_fixes = {'Cash on Delivery (COD)': 'Cash on delivery'}
df_data["Payment Method"] = df_data["Payment Method"].replace(payment_label_fixes)

# Orders per payment method after normalisation.
df_data["Payment Method"].value_counts()

In [None]:
# Histogram of how often each payment method is used.
px.histogram(
    data_frame=df_data,
    x='Payment Method',
    title='Frequency of Payment Method',
    width=800,
    height=500,
)

# Order Status vs Payment Method

In [None]:
# Order-status counts, with bars split by payment method.
fig = px.histogram(
    data_frame=df_data,
    x="Order Status",
    color="Payment Method",
)
fig.show()

# Book Name

In [None]:
# Count orders per book name and expose the result as a two-column frame
# (Book_Name, counts), most ordered first.
bookdata = (
    df_data['Book Name']
    .value_counts()
    .rename_axis('Book_Name')
    .reset_index(name='counts')
)
bookdata.head(10)

In [None]:
# Ten most frequently ordered book names.
px.histogram(
    data_frame=bookdata.head(10),
    x='Book_Name',
    y='counts',
    title='Top selling books',
    width=800,
    height=500,
)

In [None]:
# Keep only entries whose name has no "/" (the name of the variable indicates
# that "/" marks a combination of books). `na=True` makes a missing name count
# as "contains", so it is excluded here exactly as `== False` excluded it.
bookdata_without_combination = bookdata[~bookdata["Book_Name"].str.contains("/", na=True)]
bookdata_without_combination.tail()

In [None]:
# Ten least frequently ordered book names among entries without "/".
px.histogram(
    data_frame=bookdata_without_combination.tail(10),
    x='Book_Name',
    y='counts',
    title='Low selling books',
    width=800,
    height=500,
)

In [None]:
# Keep only entries whose name contains "/". `na=False` excludes a missing
# name, exactly as the `== True` comparison did.
bookdata_with_combination = bookdata[bookdata["Book_Name"].str.contains("/", na=False)]
bookdata_with_combination.tail()

In [None]:
# Ten most frequently ordered entries whose name contains "/".
px.histogram(
    data_frame=bookdata_with_combination.head(10),
    x='Book_Name',
    y='counts',
    title='Top selling books with combination',
    width=800,
    height=500,
)

# City

In [None]:
# Completed orders per city, most frequent first.
completeorder = (
    df_data.loc[df_data['Order Status'] == "Completed", 'City']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='counts')
)
order = completeorder.head(10)

# Bar chart of the ten cities, each bar labelled with its count.
x = order["City"]
y = order["counts"]
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y, text=y, textposition='auto'))
fig.update_layout(
    title_text='Top Selling City with Order status Completed with Counts',
    xaxis_title_text='City',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
# Returned orders per city, most frequent first.
returnorder = (
    df_data.loc[df_data['Order Status'] == "Returned", 'City']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='counts')
)
order1 = returnorder.head(10)

# Bar chart of the ten cities, each bar labelled with its count.
x = order1["City"]
y = order1["counts"]
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y, text=y, textposition='auto'))
fig.update_layout(
    title_text='Top Selling City with Order status Returned with Counts',
    xaxis_title_text='City',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
# Cancelled orders per city, most frequent first.
Canceledorder = (
    df_data.loc[df_data['Order Status'] == "Cancelled", 'City']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='counts')
)
order2 = Canceledorder.head(10)

# Bar chart of the ten cities, each bar labelled with its count.
x = order2["City"]
y = order2["counts"]
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y, text=y, textposition='auto'))
fig.update_layout(
    title_text='Top Selling City with Order status Canceled with Counts',
    xaxis_title_text='City',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
# Completed orders per book name, most frequent first.
completeorder = (
    df_data.loc[df_data['Order Status'] == "Completed", 'Book Name']
    .value_counts()
    .rename_axis('Book Name')
    .reset_index(name='counts')
)
order = completeorder.head(10)

# Bar chart of the ten book names, each bar labelled with its count.
x = order["Book Name"]
y = order["counts"]
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y, text=y, textposition='auto'))
fig.update_layout(
    title_text='Top Selling Book with Order status Completed with Counts',
    xaxis_title_text='Book',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
# Completed orders per calendar date, busiest dates first.
completeorder = (
    df_data.loc[df_data['Order Status'] == "Completed", 'Date']
    .value_counts()
    .rename_axis('Date')
    .reset_index(name='counts')
)
order = completeorder.head(15)

# Bar chart of the fifteen busiest dates, each bar labelled with its count.
x = order["Date"]
y = order["counts"]
fig = go.Figure(data=[go.Bar(x=x, y=y, text=y, textposition='auto')])
# The title previously read "Top Selling Book ..." (copied from the book chart),
# but this figure shows order counts per date, not per book.
fig.update_layout(
    title_text='Top Selling Dates with Order status Completed with Counts',
    xaxis_title_text='Date',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
# Order-status counts, with one colour per distinct date value.
fig = px.histogram(data_frame=df_data, x="Order Status", color="Date")
fig.update_layout(
    title=dict(text='Order status with Respect to date'),
    xaxis=dict(title=dict(text='Order Status')),
    yaxis=dict(title=dict(text='Count')),
)
fig.show()

In [None]:
# Rank cities by number of completed orders and keep the ten busiest.
completeorder = (
    df_data.loc[df_data['Order Status'] == "Completed", 'City']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='counts')
)
order = completeorder.head(10)
top_cities = pd.Series(order.City)

# Order-status histogram restricted to those cities (all statuses included).
# The filtered frame used to be computed a second time as a bare mid-cell
# expression whose result was discarded; that dead statement is removed.
fig = px.histogram(
    df_data[df_data['City'].isin(top_cities)],
    x="Order Status",
    color="City",
)
fig.update_layout(
    title_text='Order status with Respect to Top Cities',
    xaxis_title_text='Order Status',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
# Rank cities by number of completed orders and keep the ten busiest.
completeorder = (
    df_data.loc[df_data['Order Status'] == "Completed", 'City']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='counts')
)
order = completeorder.head(10)
top_cities = pd.Series(order.City)

# Payment-method histogram restricted to those cities.
# The filtered frame used to be computed a second time as a bare mid-cell
# expression whose result was discarded; that dead statement is removed.
fig = px.histogram(
    df_data[df_data['City'].isin(top_cities)],
    x="Payment Method",
    color="City",
)
fig.update_layout(
    title_text='Payment Method with Respect to Top cities',
    xaxis_title_text='Payment Method',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
# Parse the text date into a real datetime column.
df_data['date_only'] = pd.to_datetime(df_data['Date'])
# Extract the calendar year with the vectorized `.dt` accessor instead of a
# Python-level loop over every timestamp.
df_data['Year'] = df_data['date_only'].dt.year

In [None]:
# Ten book names with the most completed orders.
completeorder = (
    df_data.loc[df_data['Order Status'] == "Completed", 'Book Name']
    .value_counts()
    .rename_axis('Book Name')
    .reset_index(name='counts')
)
order = completeorder.head(10)
top_Books = pd.Series(order["Book Name"])

# Orders for those books (all statuses), coloured by year.
fig = px.histogram(
    data_frame=df_data.loc[df_data['Book Name'].isin(top_Books)],
    x="Book Name",
    color="Year",
)
fig.show()

In 2019 Data Science was a bestseller with 303 copies.

In 2020, the bestseller among the books sold online was Earn Money online with 2206 copies.

In 2021 the best seller is Lucy Draw book with 487 copies and the second best was Earn Money online with 373 copies sold in the first few days.


In [None]:
# Daily order counts: one row per (Date, Year) with the number of orders.
# `.assign` builds the parsed Date column on a new frame, so nothing is written
# into a slice of `df_data` (the original assignment triggered pandas'
# SettingWithCopyWarning, hidden only by the global warnings filter).
# 'Order Status' was selected but never used, so it is no longer carried along.
p_table = (
    df_data[['Date', 'Order Number', 'Year']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .groupby(['Date', 'Year'])['Order Number']
    .count()
    .reset_index()
)
p_table.head()

# Relationship between Date and Number of Orders, with an OLS Trend Line

In [None]:
# Orders per day as a scatter plot, with an ordinary-least-squares trend line.
fig = px.scatter(
    data_frame=p_table,
    x="Date",
    y="Order Number",
    trendline="ols",
)
fig.show()

# Time series forecasting 

In [None]:
# The package was published as `fbprophet` before release 1.0 and as `prophet`
# afterwards; accept whichever one is installed.
try:
    from fbprophet import Prophet
except ImportError:
    from prophet import Prophet

# Model with uncertainty intervals of width 0.95.
sales_pred = Prophet(interval_width = 0.95)
# Prophet expects the date column to be named `ds` and the target column `y`.
sales = p_table.rename(columns={'Date': 'ds', 'Order Number': 'y'})
# fitting the model
sales_pred.fit(sales)

# Extend the timeline by 12 month-start dates (freq='MS') and predict on it.
# NOTE(review): `y` is a per-day order count, so each future point is the
# predicted count for that single day, not a monthly total.
sales_forecast = sales_pred.make_future_dataframe(periods=12, freq='MS')
sales_forecast = sales_pred.predict(sales_forecast)

# visualizing the predictions
# Prophet.plot opens its own figure, so the size must be passed to it; a
# preceding plt.figure(figsize=...) only produced an extra empty figure.
sales_pred.plot(sales_forecast, xlabel = 'Date', ylabel = 'Sales', figsize=(12, 6))
plt.title('Book Sales')

#  Trend, yearly seasonality, and weekly seasonality of the time series.

In [None]:
# Draw the component plots of the fitted model for the forecast frame.
fig2 = sales_pred.plot_components(sales_forecast)


## This is the second version of this EDA. It is a work in progress; if you like my work, please upvote.