In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()
# Path of the orders CSV inside the Kaggle input directory.
csv_file = '/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 4.csv'

# Of course suppressing warnings is evil, but sometimes they ought to be suppressed.
# NOTE: the "ignore" filter is global, so every warning raised later in the notebook is hidden.
import warnings
warnings.filterwarnings("ignore")


# Loading The Data and Displaying Random Records

In [None]:
# Read the raw orders export.
df = pd.read_csv(csv_file)

# Map the original headers onto the snake_case names used in the rest of the notebook.
column_names = {
    'Order Number': 'order_number',
    'Order Status': 'order_status',
    'Book Name': 'book_name',
    'Order Date': 'order_date',
    'City (Billing)': 'city',
}
df = df.rename(columns=column_names)

# Lower-case the free-text columns so that values differing only in case compare equal.
for text_column in ('city', 'book_name', 'order_status'):
    df[text_column] = df[text_column].str.lower()

# Parse order_date into pandas datetimes so the .dt accessor can be used later.
df['order_date'] = pd.to_datetime(df['order_date'])

# Show 35 randomly chosen rows.
df.sample(35)

In [None]:
# Print column dtypes and non-null counts to see which columns contain missing values.
df.info()

# Data Cleaning and Normalization
This dataset comprises more than 19,000 orders. Before the data analysis part, the following columns should be normalized:
  - Billing City 
  - Book Name

The Billing City column does not follow a strict schema for addresses. Some records only have a city name in their respective 'Billing City' fields while others possess a street address.

If multiple books were purchased in a single order, they are recorded as a single string separated by a slash **/**. All books inside an order should be taken into account before reporting the best selling books.

In [None]:
# Number of distinct billing-city strings after lower-casing, before address normalization.
df['city'].nunique()

In [None]:
#if an address contains the name of a Pakistani city from the given list, the entire address is replaced with the name of the city only

#list of pakistani cities obtained from https://gist.github.com/malikbilal1997/4f41d4d153fca7087a8875cac7db8836
pak_cities = ['islamabad', 'ahmed nager chatha', 'ahmadpur east', 'ali khan abad', 'alipur', 'arifwala', 'attock', 'bhera', 'bhalwal', 'bahawalnagar', 'bahawalpur', 'bhakkar', 'burewala', 'chillianwala', 'chakwal', 'chichawatni', 'chiniot', 'chishtian', 'daska', 'darya khan', 'dera ghazi khan', 'dhaular', 'dina', 'dinga', 'dipalpur', 'faisalabad', 'ferozewala', 'fateh jhang', 'ghakhar mandi', 'gojra', 'gujranwala', 'gujrat', 'gujar khan', 'hafizabad', 'haroonabad', 'hasilpur', 'haveli lakha', 'jatoi', 'jalalpur', 'jattan', 'jampur', 'jaranwala', 'jhang', 'jhelum', 'kalabagh', 'karor lal esan', 'kasur', 'kamalia', 'kamoke', 'khanewal', 'khanpur', 'kharian', 'khushab', 'kot addu', 'jauharabad', 'lahore', 'lalamusa', 'layyah', 'liaquat pur', 'lodhran', 'malakwal', 'mamoori', 'mailsi', 'mandi bahauddin', 'mian channu', 'mianwali', 'multan', 'murree', 'muridke', 'mianwali bangla', 'muzaffargarh', 'narowal', 'nankana sahib', 'okara', 'renala khurd', 'pakpattan', 'pattoki', 'pir mahal', 'qaimpur', 'qila didar singh', 'rabwah', 'raiwind', 'rajanpur', 'rahim yar khan', 'rawalpindi', 'sadiqabad', 'safdarabad', 'sahiwal', 'sangla hill', 'sarai alamgir', 'sargodha', 'shakargarh', 'sheikhupura', 'sialkot', 'sohawa', 'soianwala', 'siranwali', 'talagang', 'taxila', 'toba tek singh', 'vehari', 'wah cantonment', 'wazirabad', 'badin', 'bhirkan', 'rajo khanani', 'chak', 'dadu', 'digri', 'diplo', 'dokri', 'ghotki', 'haala', 'hyderabad', 'islamkot', 'jacobabad', 'jamshoro', 'jungshahi', 'kandhkot', 'kandiaro', 'karachi', 'kashmore', 'keti bandar', 'khairpur', 'kotri', 'larkana', 'matiari', 'mehar', 'mirpur khas', 'mithani', 'mithi', 'mehrabpur', 'moro', 'nagarparkar', 'naudero', 'naushahro feroze', 'naushara', 'nawabshah', 'nazimabad', 'qambar', 'qasimabad', 'ranipur', 'ratodero', 'rohri', 'sakrand', 'sanghar', 'shahbandar', 'shahdadkot', 'shahdadpur', 'shahpur chakar', 'shikarpaur', 'sukkur', 'tangwani', 'tando adam khan', 'tando allahyar', 'tando muhammad khan', 'thatta', 'umerkot', 
'warah', 'abbottabad', 'adezai', 'alpuri', 'akora khattak', 'ayubia', 'banda daud shah', 'bannu', 'batkhela', 'battagram', 'birote', 'chakdara', 'charsadda', 'chitral', 'daggar', 'dargai', 'darya khan', 'dera ismail khan', 'doaba', 'dir', 'drosh', 'hangu', 'haripur', 'karak', 'kohat', 'kulachi', 'lakki marwat', 'latamber', 'madyan', 'mansehra', 'mardan', 'mastuj', 'mingora', 'nowshera', 'paharpur', 'pabbi', 'peshawar', 'saidu sharif', 'shorkot', 'shewa adda', 'swabi', 'swat', 'tangi', 'tank', 'thall', 'timergara', 'tordher', 'awaran', 'barkhan', 'chagai', 'dera bugti', 'gwadar', 'harnai', 'jafarabad', 'jhal magsi', 'kacchi', 'kalat', 'kech', 'kharan', 'khuzdar', 'killa abdullah', 'killa saifullah', 'kohlu', 'lasbela', 'lehri', 'loralai', 'mastung', 'musakhel', 'nasirabad', 'nushki', 'panjgur', 'pishin valley', 'quetta', 'sherani', 'sibi', 'sohbatpur', 'washuk', 'zhob', 'ziarat']

def get_nearest_city(city, cities=None):
  """Collapse a free-text billing address to a known city name.

  Candidates are tried in list order and must occur in the address as a
  whole word or phrase (not surrounded by other lowercase letters). Plain
  substring matching would let short names such as 'dina' or 'dir' fire
  inside longer words ('madina town', 'nadirabad') and shadow the real city.

  Args:
    city: Raw billing city/address value. It is compared as ``str(city)``,
      so non-string values (e.g. NaN) are accepted. Matching is
      case-sensitive; the candidate names are lower-case.
    cities: Optional iterable of candidate city names, checked in order.
      Defaults to the notebook-level ``pak_cities`` list.

  Returns:
    The first matching candidate, or ``city`` unchanged when none matches.
  """
  import re  # local import: the helper stays usable without touching the import cell

  if cities is None:
    cities = pak_cities
  address = str(city)
  for cand_city in cities:
    # Cheap substring test first; the regex only runs on likely hits.
    if cand_city not in address:
      continue
    # Letters on either side mean the name is only part of a longer word.
    if re.search(rf'(?<![a-z]){re.escape(cand_city)}(?![a-z])', address):
      return cand_city
  return city

# Distinct-value count before collapsing addresses to city names.
print(f'total unique cities in our dataset before normalization: {df.city.nunique()}')

# Replace each value with the city name found in it; values with no match are kept as they are.
df['city'] = df['city'].apply(get_nearest_city)

# Distinct-value count after normalization, for comparison with the figure printed above.
print(f'total unique cities in our dataset after normalization: {df.city.nunique()}')


There were more than 3500 unique billing addresses after case conversion. Now there are only 1800, which is a significant improvement.

In [None]:
#since there are multiple books inside an order, I have added another column 'order_size' 
#to track the number of books purchased per order.

def get_order_size(order):
  """Return the number of '/'-separated titles in an order's book string.

  The value is converted with ``str()`` first, so a value without any slash
  (including non-string values) yields 1.
  """
  titles = str(order).split('/')
  return len(titles)

# One value per order: the number of '/'-separated titles in book_name.
# A missing book_name also gets 1, because get_order_size works on str(value).
df['order_size'] = df['book_name'].apply(get_order_size)

## Dropping Rows With Missing Data

In [None]:
#rows containing missing data
# Vectorized null check: a row is selected when at least one of its columns is null.
# This avoids calling a Python lambda once per row.
df[df.isnull().any(axis=1)]

In [None]:
# Drop every row with a missing value in any column; df is rebound to the filtered frame.
df = df.dropna()

# Exploratory Data Analysis (EDA)

## Which is the best-selling book?

In [None]:
#Multiple books can be purchased in a single transaction. Counting rows with the same book_names would give us inaccurate results.
#We need to extract all books from an order before counting.

def split_series(ser, sep):
    """Join every string in ``ser`` with ``sep`` and split the result on ``sep``.

    Returns a new Series holding the individual pieces, indexed 0..n-1.
    """
    joined = ser.str.cat(sep=sep)
    pieces = joined.split(sep)
    return pd.Series(pieces)


# Build df2 with one row per title per order.
# Rows are grouped on every column except book_name, so rows that agree on all
# of those columns fall into the same group. Each group's book_name strings are
# joined and re-split on '/' by split_series, the per-group position index that
# apply adds is dropped, and the group keys are turned back into ordinary columns.
df2=(df.groupby(df.columns.drop('book_name').tolist()) #group by all but one column
          ['book_name'] #select the column to be split
          .apply(split_series,sep='/') # split 'book_name' in each group
         .reset_index(drop=True,level=-1).reset_index()) #remove extra index created


In [None]:
#visualizing the top 10 best selling books
print('Top 10 most selling books are')

# Rows per title in the exploded frame, keeping the ten largest counts.
top_books = df2.groupby('book_name')['city'].count().nlargest(10)
top_books.plot.barh()

In [None]:
#printing top 10 best selling books
# Each df2 row is one title taken from one order, so the count of non-null city
# values per book_name is the number of order lines that contain that title.
df2[['book_name','city']].groupby(['book_name'])['city'].count().nlargest(10)

In [None]:
#printing the best selling book title
# Count rows per title once and reuse the result for both the title and its total,
# instead of running the same groupby twice.
sales_per_book = df2.groupby('book_name')['city'].count()
name = sales_per_book.idxmax()
copies_sold = sales_per_book.max()

print(f'{name} is the best selling book with {copies_sold} copies sold')

## Exploring Order Status

In [None]:
#visualizing the order status frequency
# One bar per distinct order_status value; bar height is the number of orders with that status.
sns.countplot(x='order_status',data=df)

In [None]:
#order status frequencies as a table, most frequent status first
df['order_status'].value_counts()

### Orders Per Weekday

In [None]:
#weekday name (Monday..Sunday) of every order; order_date was already parsed to datetime when the data was loaded
df['weekday'] = df['order_date'].dt.day_name()

In [None]:
#order status counts for each weekday, bars laid out Monday through Sunday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(8, 6))
sns.countplot(x='weekday', hue='order_status', data=df, order=weekday_order)

The above visualization indicates that more orders are received on weekends than on weekdays.

### Orders Per Hour

In [None]:
#For the time, I have rounded all timestamps to their nearest hour
#For example, 22:27:00 is rounded to 22:00:00 while 22:31:00 is rounded to 23:00:00
#NOTE: only the time of day is kept, so a timestamp that rounds up past midnight (e.g. 23:40:00) lands in the 00:00:00 bucket
df['time'] = pd.to_datetime(df['order_date'].dt.round('60min')).dt.time

In [None]:
# Count orders for every (status, hour) pair and order the rows by hour for the x-axis.
hourly_counts = df.groupby(['order_status', 'time']).size()
temp = hourly_counts.reset_index(name='count').sort_values('time')

plt.figure(figsize=(22, 8))
sns.barplot(x="time", y='count', hue="order_status", data=temp)

The visualization shows that more orders were received after 12pm than before 12pm


In [None]:
#Orders received before 12pm
import datetime
plt.figure(figsize=(12,8))
# Strictly before noon: the 12:00 bucket belongs to the "after 12pm" plot (which uses >=),
# so using < here keeps it from being drawn in both figures.
temp = df[df['time']<datetime.time(12,0)].groupby(['order_status', 'time']).size().reset_index(name='count').sort_values('time')
sns.barplot(x="time",y='count' ,hue="order_status", data=temp)

In [None]:
#Orders received after 12pm
# Uses the datetime module imported in the previous cell.
plt.figure(figsize=(12,8))
# Keep the hourly buckets from 12:00 onwards (inclusive) and count orders per (status, hour).
temp = df[df['time']>=datetime.time(12,0)].groupby(['order_status', 'time']).size().reset_index(name='count').sort_values('time')
sns.barplot(x="time",y='count' ,hue="order_status", data=temp)

This shows that more orders are received on weekends and that the frequency of orders is much higher in the second half of the day. The number of returned or canceled orders varies *directly* with the total number of orders received during an hour.


In [None]:
#printing the orders received before and after 12pm
# Split at noon without overlap: the 12:00 bucket is counted once, on the "after" side.
# (Previously <= and >= both included it, so those orders appeared in both totals.)
noon = datetime.time(12, 0)
before_12 = (df['time'] < noon).sum()
after_12 = (df['time'] >= noon).sum()
print(f'orders received before 12pm: {before_12}')
print(f'orders received after 12pm: {after_12}')

### Orders Per City

In [None]:
#visualizing top 15 cities with most orders

# Number of cities to show in the chart.
top_n_cities = 15
# Count orders per city, sort the counts in descending order and keep the names of the first top_n_cities.
cities = df.groupby(['city']).size().reset_index(name='count').sort_values('count',ascending=False)[:top_n_cities]['city'].values

# For those cities only, count orders per (city, status); rows are sorted by city name, so the
# x-axis order follows the names rather than the order volume.
temp = df[df['city'].isin(cities)].groupby(['city', 'order_status']).size().reset_index(name='count').sort_values('city')

plt.figure(figsize=(18,8))
sns.barplot(x="city",y='count' ,hue="order_status", data=temp)

In [None]:
#cities whose citizens returned orders the most
# Keep orders whose status is 'returned', count them per city and take the ten largest counts.
# These are absolute counts, not return rates relative to each city's total orders.
most_returned = df[df['order_status']=='returned'].groupby(['city','order_status']).size().reset_index(name='count').sort_values('count',ascending=False).head(10)
plt.figure(figsize=(18,8))
sns.barplot(x="city",y='count' ,hue="order_status", data=most_returned)

In [None]:
#cities whose citizens canceled orders the most
# Keep orders whose status is 'canceled', count them per city and take the ten largest counts.
# NOTE(review): the result is stored in `most_returned` although it holds canceled orders, and it
# overwrites the frame built in the previous cell; a name such as `most_canceled` would be clearer.
# NOTE(review): assumes the status is spelled 'canceled' in the data - confirm against value_counts().
most_returned = df[df['order_status']=='canceled'].groupby(['city','order_status']).size().reset_index(name='count').sort_values('count',ascending=False).head(10)
plt.figure(figsize=(18,8))
sns.barplot(x="city",y='count' ,hue="order_status", data=most_returned)

### Does a Large Order Size Indicate Fraud?


In [None]:
#The only way to test this hypothesis is to provide evidence from data
#Recap: order_size indicates how many books were purchased in a single order
# One row per (order_size, order_status) combination that occurs in the data, with its number of orders.
df.groupby(['order_size', 'order_status']).size().reset_index(name='count')

The data speaks against the hypothesis 😃 

In [None]:
#TODO
# Improve the previous sections
# Sales forecasting