In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Collect the path of every file found under the Kaggle input directory,
# then print them one per line so the dataset location can be checked.
input_paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the raw order export into a DataFrame.
ORDERS_CSV = "/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 4.csv"
data = pd.read_csv(ORDERS_CSV, encoding="utf-8")

In [None]:
# Show the column labels exactly as they were read from the CSV.
data.columns

In [None]:
# Replace the raw headers with short CamelCase names. The assignment is
# positional, so the frame must have exactly these five columns in this order.
renamed_columns = [
    "OrderNumber",
    "OrderStatus",
    "BookName",
    "OrderDate",
    "BillingCity",
]
data.columns = renamed_columns

In [None]:
# Preview the first rows after renaming the columns.
data.head()

In [None]:
# Use the order number as the row index.
data = data.set_index("OrderNumber")

##  Number of rows and columns in this data

In [None]:
# (number of rows, number of columns) of the raw data.
data.shape

# Data Cleaning and pre-processing

Dealing with null values

In [None]:
# Count the missing values in each column.
data.isna().sum()

As there are only 3 missing cells in the whole data frame, we will drop the corresponding rows.

In [None]:
# Discard every row that has at least one missing value, then re-count the
# nulls per column to confirm none remain.
data = data.dropna()
data.isna().sum()

In [None]:
# New shape of the data frame after dropping the rows with missing values
data.shape

In [None]:
# Preview the first rows of the frame after the null rows were dropped.
data.head()

Let's normalize the BookName and BillingCity columns by lower-casing them and stripping any extra spaces.

In [None]:
def _normalise_title(raw_title):
    """Lower-case a book title and trim surrounding whitespace."""
    return raw_title.lower().strip()


def _normalise_city(raw_city):
    """Lower-case a city, turn "/", "," and "." into spaces, then trim."""
    cleaned = raw_city.lower()
    for separator in ("/", ",", "."):
        cleaned = cleaned.replace(separator, " ")
    return cleaned.strip()


data["BookName"] = data["BookName"].map(_normalise_title)
data["BillingCity"] = data["BillingCity"].map(_normalise_city)

In some cases the BookName column contains more than one book (e.g. OrderNumber 70960). We need to separate those book names and create a new row for each.

In [None]:
# Orders that bundle several titles list them in one cell separated by "/".
# Split each cell into a list and explode it so every row holds exactly one
# title; the index value of the original row is repeated for the extra rows.
data = data.assign(BookName=data["BookName"].str.split("/")).explode("BookName")

# The pieces keep the blanks that surrounded the "/" ("a / b" -> "a ", " b").
# Strip them so the same title is not counted under several spellings.
data["BookName"] = data["BookName"].str.strip()

# A leading or trailing "/" leaves an empty piece, which is not a book.
# `.copy()` keeps later column assignments from warning about a slice.
data = data.loc[data["BookName"] != ""].copy()

In [None]:
# Re-check for missing values after splitting the book names into rows.
data.isna().sum()

In [None]:
# Inspect the dtype of every column before converting OrderDate.
data.dtypes

Let's convert the OrderDate to datetime 

In [None]:
# Parse the order timestamps into datetime values.
# `infer_datetime_format` is deprecated in pandas 2.x (format inference is the
# default behaviour there) and only triggers a warning, so it is not passed.
data["OrderDate"] = pd.to_datetime(data["OrderDate"])

In [None]:
# Display the frame after the OrderDate conversion.
data

Cleaning the city column

In [None]:
# Source: https://simplemaps.com/data/pk-cities

# Lower-case names of Pakistani cities used as the reference vocabulary when
# cleaning the free-text BillingCity column. Order matters to callers that
# return the first matching entry.
#
# Fixes relative to the previous version of this list:
#   * a missing comma after "dgk" silently concatenated it with the next entry
#     into the bogus name "dgknawabshah";
#   * three multi-word names contained stray `","` sequences inside a single
#     string literal and therefore could never match anything.
# NOTE(review): the three repaired names are restored as full city names
# (rather than split into separate words) - confirm this is the intent.
pakistan_top_cities = [
    'karachi', 'lahore', 'sialkot', 'faisalabad', 'rawalpindi',
    'peshawar', 'saidu sharif', 'multan', 'gujranwala', 'islamabad',
    'quetta', 'bahawalpur', 'sargodha', 'new mirpur', 'chiniot',
    'sukkur', 'larkana', 'shekhupura', 'jhang', 'rahimyar khan',
    'gujrat', 'kasur', 'mardan', 'mingaora', 'dera ghazi khan',
    'dgk',  # common abbreviation of "dera ghazi khan"
    'nawabshah', 'sahiwal', 'mirpur khas', 'okara', 'burewala',
    'jacobabad', 'saddiqabad', 'kohat', 'muridke', 'muzaffargarh',
    'khanpur', 'gojra', 'bahauddin', 'abbottabad', 'dadu',
    'khuzdar', 'pakpattan', 'tando allahyar', 'vihari', 'jaranwala',
    'kamalia', 'kot addu', 'nowshera', 'swabi', 'dera ismail khan',
    'chaman', 'charsadda', 'kandhkot', 'hasilpur', 'muzaffarabad',
    'mianwali', 'jalalpur jattan', 'bhakkar', 'zhob', 'kharian',
    'mian channun', 'jamshoro', 'pattoki', 'harunabad',
    'toba tek singh', 'shakargarh', 'hujra shah muqim', 'kabirwala',
    'mansehra', 'lala musa', 'nankana sahib', 'bannu', 'timargara',
    'parachinar', 'gwadar', 'abdul hakim', 'hassan abdal', 'tank',
    'hangu', 'risalpur cantonment', 'karak', 'kundian', 'umarkot',
    'chitral', 'dainyor', 'kulachi', 'kotli', 'gilgit',
    'hyderabad', 'narowal', 'khairpur', "mir’s", 'khanewal', 'jhelum',
    'haripur', 'shikarpur', 'rawala kot', 'hafizabad', 'lodhran',
    'malakand', 'attock', 'batgram', 'matiari', 'ghotki',
    'firoz', 'naushahro', 'alpurai', 'bagh', 'daggar', 'bahawalnagar',
    'leiah', 'tando muhammad khan', 'chakwal', 'khushab', 'badin',
    'lakki', 'rajanpur', 'dera allahyar', 'shahdad kot', 'pishin',
    'sanghar', 'upper dir', 'thatta', 'dera murad jamali', 'kohlu',
    'mastung', 'dasu', 'athmuqam', 'loralai', 'barkhan',
    'musa khel bazar', 'ziarat', 'gandava', 'sibi', 'dera bugti',
    'eidgah', 'turbat', 'uthal', 'chilas', 'kalat', 'panjgur', 'gakuch',
    'qila', 'saifullah', 'kharan', 'aliabad', 'awaran', 'dalbandin',
]

In [None]:
# NOTE(review): this keeps the cities made of exactly TWO words although the
# variable is called `single_word_cities` - confirm which one was intended.
words_per_city = data["BillingCity"].str.split().apply(len)
single_word_cities = data.loc[words_per_city == 2, "BillingCity"].unique()
# Peek at the first 30 distinct values.
single_word_cities[:30]

In [None]:
import nltk

# Abbreviations present in the city list that should be reported under the
# full city name instead of as a separate city.
_CITY_ALIASES = {"dgk": "dera ghazi khan"}


def clean_city(row, max_distance=3):
    """Map a free-text billing city onto a name from `pakistan_top_cities`.

    Two passes are made, and the first hit wins:

    1. Substring pass - the first list entry contained anywhere in
       ``row.BillingCity`` is returned.
    2. Fuzzy pass - each whitespace-separated word of the address is compared
       with every list entry using Levenshtein distance, to absorb spelling
       mistakes.

    Parameters
    ----------
    row : object with a ``BillingCity`` string attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.
    max_distance : int, default 3
        Largest edit distance accepted as a spelling mistake. For short city
        names the tolerance is capped at half the name length; otherwise a
        3-4 letter name would be "within 3 edits" of almost any short word.

    Returns
    -------
    str
        The matched city name, or the original ``row.BillingCity`` unchanged
        when nothing matches.
    """
    billing_city = row.BillingCity

    # Pass 1: exact substring match, in list order.
    for city in pakistan_top_cities:
        if city in billing_city:
            return _CITY_ALIASES.get(city, city)

    # Pass 2: fuzzy match per word. dict.fromkeys() de-duplicates while keeping
    # the words in address order; iterating a set of strings instead would make
    # the winning word depend on per-process hash randomisation.
    words = dict.fromkeys(billing_city.split())
    for word in words:
        for city in pakistan_top_cities:
            tolerance = min(max_distance, len(city) // 2)
            if nltk.edit_distance(word, city) <= tolerance:
                return _CITY_ALIASES.get(city, city)

    return billing_city

In [None]:
# Number of distinct BillingCity values before cleaning
data["BillingCity"].nunique()

In [None]:
# Run the city cleaner on every row and overwrite BillingCity with the result.
cleaned_cities = data.apply(clean_city, axis=1)
data = data.assign(BillingCity=cleaned_cities)

In [None]:
# Number of distinct BillingCity values after cleaning
data["BillingCity"].nunique()

In [None]:
# Distinct cleaned city values that consist of exactly one word.
is_single_word = data["BillingCity"].str.split().apply(len) == 1
single_word_cities = data.loc[is_single_word, "BillingCity"].unique()

# **EDA**

Number of books sold per billing city

In [None]:
# Count the book rows per billing city, largest first.
books_sold_per_city = (
    data.groupby("BillingCity")["BookName"]
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Top 20 cities by number of book rows
books_sold_per_city.head(20)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the 15 cities with the most book rows.
top_cities = books_sold_per_city.head(15)
fig, ax = plt.subplots(figsize=(16, 6))
# x and y must be passed by keyword: seaborn >= 0.12 rejects positional
# data vectors with a TypeError.
sns.barplot(x=top_cities.index, y=top_cities.values, ax=ax)
ax.set(title="Books sold per billing city (top 15)", xlabel="Billing city", ylabel="Books sold");

**Top selling books on Gufhtugu** 

In [None]:
# Number of rows per book title, most frequent first.
top_selling = data["BookName"].value_counts()

In [None]:
# The 20 most frequent titles.
top_selling[:20]

In [None]:
# Bar chart of the 20 most frequent titles.
top_selling.head(20).plot(kind="bar")

In [None]:
# Number of rows for each distinct OrderStatus value.
order_status = data["OrderStatus"].value_counts()

In [None]:
# Display the per-status counts.
order_status

In [None]:
# Rows whose status is "Canceled", counted per billing city, largest first.
canceled_rows = data[data["OrderStatus"] == "Canceled"]
canceled_per_city = canceled_rows.groupby(by=["BillingCity"])["BillingCity"].count()
canceled_per_city.sort_values(ascending=False)

In [None]:
# Rows whose status is "Returned", counted per billing city, largest first.
returned_rows = data[data["OrderStatus"] == "Returned"]
returned_per_city = returned_rows.groupby(by=["BillingCity"])["BillingCity"].count()
returned_per_city.sort_values(ascending=False)

In [None]:
# "Canceled" rows counted per (billing city, book title) pair, in group order.
canceled_mask = data["OrderStatus"] == "Canceled"
(
    data[canceled_mask]
    .groupby(by=["BillingCity", "BookName"])["BookName"]
    .count()
)

In [None]:
# "Returned" rows counted per (billing city, book title) pair, largest first.
returned_mask = data["OrderStatus"] == "Returned"
(
    data[returned_mask]
    .groupby(by=["BillingCity", "BookName"])["BookName"]
    .count()
    .sort_values(ascending=False)
)

**Time Series analysis**

In [None]:
# Move the current index back into an ordinary column.
data = data.reset_index()

In [None]:
# Index the rows by order timestamp so they can be resampled by time period.
data = data.set_index("OrderDate")

In [None]:
# Preview the frame now that it is indexed by OrderDate.
data.head()

Books Sold per week

In [None]:
# Line chart of the number of book rows in each weekly bin.
title = "Gufhtugu: books sold per week"
weekly_counts = data["BookName"].resample("W").count()
ax = weekly_counts.plot(title=title, figsize=(12, 6))
ax.set(xlabel="Week", ylabel="Count")


Books sold each month

In [None]:
# Line chart of the number of book rows in each monthly bin.
title = "Gufhtugu: books sold per month"
monthly_counts = data["BookName"].resample("M").count()
ax = monthly_counts.plot(title=title, figsize=(12, 6))
ax.set(xlabel="Month", ylabel="Count")

In [None]:
# One row per calendar day holding the number of book rows on that day.
daily_counts = data["BookName"].resample("D").count()
day_wise_data = daily_counts.to_frame(name="BooksSold")
day_wise_data.head()

In [None]:
# 7-day rolling mean of the daily counts. The first six rows are NaN because
# the window is not yet full.
# The mean is taken from the BooksSold column only: rolling over the whole
# frame returns a DataFrame, and assigning that to a single column raises a
# ValueError once the frame has more than one column (e.g. on re-running
# this cell).
day_wise_data["BooksSold_7day_mean"] = day_wise_data["BooksSold"].rolling(7).mean()
day_wise_data.head()

Current trend 

In [None]:
# Plot every column of the daily frame on one 16x6 inch figure.
ax = day_wise_data.plot(figsize=(16,6))

**Statsmodels: Hodrick-Prescott filter**

In [None]:
from statsmodels.tsa.filters.hp_filter import hpfilter

In [None]:
# One row per month holding the number of book rows in that month.
monthly_book_counts = data["BookName"].resample("M").count()
month_wise_data = monthly_book_counts.to_frame(name="BooksSold")

In [None]:
# Split the sales series into a cyclical component and a smooth trend with the
# Hodrick-Prescott filter. lamb=129600 is the smoothing value recommended for
# MONTHLY data, so the filter is applied to the monthly series; it was
# previously fed the daily series, for which this lambda is not appropriate.
sell_cycle, sell_trend = hpfilter(month_wise_data["BooksSold"], lamb=129600)

In [None]:
# Plot the trend component returned by the Hodrick-Prescott filter.
sell_trend.plot(figsize=(16,6))

In [None]:
# Plot the cyclical component returned by the Hodrick-Prescott filter.
sell_cycle.plot(figsize=(16,6))

ETS decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Decompose the daily BooksSold series with statsmodels' seasonal_decompose
# (default arguments); the result object is plotted in the next cell.
r = seasonal_decompose(day_wise_data["BooksSold"])

In [None]:
from pylab import rcParams
# Set the default figure size to 16x6 inches before drawing the decomposition.
rcParams["figure.figsize"] = 16,6
# The trailing semicolon keeps the cell from echoing plot()'s return value.
r.plot();