# **Gufhtugu Publisher is interested in answers to the following questions:**


1. What is the best-selling book?
2. Visualize order status frequency
3. Find a correlation between order date/time and order status
4. Find a correlation between city and order status
5. Find any hidden patterns that are counter-intuitive for a layman
6. Can we predict number of orders, or book names in advance?

The first two questions are answered here:


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

!pip install -U textblob
!pip install googletrans
#Import libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from googletrans import Translator # translate cities name into english 
from textblob import TextBlob

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



In [None]:
# Load the orders CSV into a DataFrame and preview 20 randomly sampled rows.
dataset_path = "../input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv"
df = pd.read_csv(dataset_path, delimiter=',', encoding="utf-8")
df.sample(20)


In [None]:
# Report the dataset dimensions: number of rows, then number of columns.
Row, Col = len(df.index), len(df.columns)
print(f'There are {Row} Rows and {Col} columns')

In [None]:
# Replace the space-separated column headers with underscore-separated names
# so the columns can be reached with attribute access (e.g. df.Order_Status).
column_renames = {
    'Order Number': 'Order_Number',
    "Order Status": "Order_Status",
    "Book Name": "Book_Name",
    "Order Date & Time": "Order_Date",
    "City": "City",
    "Payment Method": "Payment_Method",
    "Total items": "Total_items",
    "Total weight (grams)": "grams",
}
df = df.rename(columns=column_renames)


In [None]:
# Count the missing values in each column, listed from most to fewest.
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False).to_frame('counts')

In [None]:
# Show every row that has at least one missing value, to inspect what is absent.
missing_per_row = df.isnull().sum(axis=1)
df[missing_per_row > 0]

In [None]:
# Tabulate how many orders fall under each order status.
status_counts = df['Order_Status'].value_counts()
status_counts.to_frame('count')

In [None]:
# Preprocess the date
# Thanks to @asim zahid
#
# Convert Order_Date to datetime values, derive the calendar columns from it,
# then preview 10 random rows.
df["Order_Date"] = pd.DatetimeIndex(df["Order_Date"])
order_dt = df["Order_Date"].dt

# Insertion order of this mapping fixes the order in which columns are added.
derived_columns = {
    'date': order_dt.date,
    'time': order_dt.time,
    "Day_Name": order_dt.day_name(),
    "Week_Day": order_dt.dayofweek,
    "DayofYear": order_dt.dayofyear,
    "Month_Number": order_dt.month,
    "Month_Name": order_dt.month_name(),
    'year': order_dt.year,
}
for column_name, column_values in derived_columns.items():
    df[column_name] = column_values
df.sample(10)

In [None]:
# Fold the "Cash on Delivery (COD)" label into "Cash on delivery", then
# tabulate how often each payment method occurs.
payment_aliases = {"Cash on Delivery (COD)": "Cash on delivery"}
df['Payment_Method'] = df['Payment_Method'].replace(payment_aliases)
df['Payment_Method'].value_counts().to_frame('counts')

In [None]:
#display graphically the payment methods:
# Set the width and height of the figure
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10, 4))

# Bar chart of how many orders used each payment method.
payment_counts = df['Payment_Method'].value_counts()
payment_counts.plot.bar()

# Tilt the x tick labels by 30 degrees, centred under their ticks.
plt.xticks(rotation=30, ha="center")
plt.title("Payment Methods opted by customers in Pakistan")
plt.xlabel("Payment Modes")
plt.ylabel("Frequency (orders delivered to customers)")
plt.show()

In [None]:
# Show the rows whose Book_Name is missing.
book_name_missing = df['Book_Name'].isna()
df[book_name_missing]



In [None]:
# Drop, in place, every row that has at least one missing value, then confirm
# that no nulls remain in any column.
df.dropna(axis=0, how='any', inplace=True)
df.isnull().sum()


In [None]:
# Data cleaning: a Book_Name cell may hold several titles separated by '/'.
# Split them apart and give each title its own row.
split_col = (
    df['Book_Name']
    .str.split('/', expand=True)
    .stack()
    .droplevel(-1)        # drop the inner level so the index lines up with df's
    .rename('Book_Name')  # the Series needs a name to be joined as a column
)

# Swap the original column for the one-title-per-row version; rows of orders
# with several titles are repeated once per title.
df = df.drop(columns='Book_Name').join(split_col)
df.head(15)

In [None]:
# Lower-case Book_Name and City so later comparisons ignore letter case.
for text_column in ('Book_Name', 'City'):
    df[text_column] = df[text_column].str.lower()
df['City'].head(8)


In [None]:


# Source: https://simplemaps.com/data/pk-cities
#
# Lower-cased reference names that clean_city matches City values against.
# The order is significant: clean_city returns the first entry that matches.
#
# Fixes relative to the earlier version of this list:
#   * a comma was missing after "dgk", so Python's implicit string
#     concatenation fused it with the next entry into 'dgknawabshah';
#   * 'jalalpur jattan', 'hujra shah muqim' and 'hassan abdal' contained stray
#     quote/comma characters inside the string and could never match.
pakistan_major_cities = [
    'karachi', 'lahore', 'sialkot', 'faisalabad', 'rawalpindi',
    'peshawar', 'saidu sharif', 'multan', 'gujranwala', 'islamabad',
    'quetta', 'bahawalpur', 'sargodha', 'new mirpur', 'chiniot',
    'sukkur', 'larkana', 'shekhupura', 'jhang', 'rahimyar khan',
    'gujrat', 'kasur', 'mardan', 'mingaora', 'dera ghazi khan', "dgk",
    'nawabshah', 'sahiwal', 'mirpur khas', 'okara', 'burewala',
    'jacobabad', 'saddiqabad', 'kohat', 'muridke', 'muzaffargarh',
    'khanpur', 'gojra', 'bahauddin', 'abbottabad', 'dadu',
    'khuzdar', 'pakpattan', 'tando allahyar', 'vihari', 'jaranwala',
    'kamalia', 'kot addu', 'nowshera', 'swabi', 'dera ismail khan',
    'chaman', 'charsadda', 'kandhkot', 'hasilpur', 'muzaffarabad',
    'mianwali', 'jalalpur jattan', 'bhakkar', 'zhob', 'kharian',
    'mian channun', 'jamshoro', 'pattoki', 'harunabad',
    'toba tek singh', 'shakargarh', 'hujra shah muqim', 'kabirwala',
    'mansehra', 'lala musa', 'nankana sahib', 'bannu', 'timargara',
    'parachinar', 'gwadar', 'abdul hakim', 'hassan abdal', 'tank',
    'hangu', 'risalpur cantonment', 'karak', 'kundian', 'umarkot',
    'chitral', 'dainyor', 'kulachi', 'kotli', 'gilgit',
    'hyderabad', 'narowal', 'khairpur', "mir’s", 'khanewal', 'jhelum',
    'haripur', 'shikarpur', 'rawala kot', 'hafizabad', 'lodhran',
    'malakand', 'attock', 'batgram', 'matiari', 'ghotki',
    'firoz', 'naushahro', 'alpurai', 'bagh', 'daggar', 'bahawalnagar',
    'leiah', 'tando muhammad khan', 'chakwal', 'khushab', 'badin',
    'lakki', 'rajanpur', 'dera allahyar', 'shahdad kot', 'pishin',
    'sanghar', 'upper dir', 'thatta', 'dera murad jamali', 'kohlu',
    'mastung', 'dasu', 'athmuqam', 'loralai', 'barkhan',
    'musa khel bazar', 'ziarat', 'gandava', 'sibi', 'dera bugti',
    'eidgah', 'turbat', 'uthal', 'chilas', 'kalat', 'panjgur', 'gakuch',
    'qila', 'saifullah', 'kharan', 'aliabad', 'awaran', 'dalbandin',
]

In [None]:
# Clean the City column: collect the distinct City values made up of exactly
# two whitespace-separated tokens and preview the first 20 of them.
# NOTE(review): the variable is named single_word_cities but the filter keeps
# two-token values; confirm which of the two was intended.
city_token_counts = df["City"].str.split().apply(len)
single_word_cities = df.loc[city_token_counts == 2, "City"].unique()
single_word_cities[:20]

In [None]:
import nltk
#function to clean City data
def clean_city(row, max_distance=5):
    """Map a free-text City value onto a known city name where possible.

    Matching happens in two stages, both scanning ``pakistan_major_cities``
    in list order and returning the first hit:

    1. a known city name that occurs verbatim as a substring of ``row.City``;
    2. otherwise, a known city name whose edit distance to one of the
       whitespace-separated tokens of ``row.City`` is at most
       ``max_distance``.

    Args:
        row: object exposing a string ``City`` attribute (a DataFrame row
            when used through ``df.apply(clean_city, axis=1)``).
        max_distance: largest edit distance still accepted as a spelling
            variant. Defaults to 5, the previously hard-coded threshold.

    Returns:
        The matched city name, or ``row.City`` unchanged when nothing matches.
    """
    city_text = row.City

    # Stage 1: exact substring match against the reference names.
    for city in pakistan_major_cities:
        if city in city_text:
            return city

    # Stage 2: fuzzy match, token by token. str.split() with no arguments
    # already discards surrounding whitespace and empty tokens.
    # dict.fromkeys removes duplicates while keeping the tokens in their
    # original order; a set would iterate in an order that depends on string
    # hash randomisation, making the chosen match vary between runs.
    # NOTE(review): the default threshold of 5 is longer than several
    # reference names (e.g. 'tank', 'bagh'), so short tokens can match
    # unrelated cities; consider passing a smaller max_distance.
    for token in dict.fromkeys(city_text.split()):
        for city in pakistan_major_cities:
            if nltk.edit_distance(token, city) <= max_distance:
                return city
    return city_text


In [None]:
# Run clean_city over every row and overwrite City with the cleaned values,
# then preview the first five.
cleaned_cities = df.apply(clean_city, axis=1)
df["City"] = cleaned_cities
df['City'].head(5)

In [None]:
# Parse Order_Date into a datetime column to derive calendar fields from.
df['order_date'] = pd.to_datetime(df['Order_Date'])

# Per-timestamp extractors for year, month number, day name and weekday number.
# Insertion order fixes the order in which the columns are written.
field_extractors = {
    'year': lambda ts: ts.year,
    'month': lambda ts: ts.month,
    'day': lambda ts: ts.day_name(),
    'weekday': lambda ts: ts.weekday(),
}
for field_name, extract in field_extractors.items():
    df[field_name] = df['order_date'].map(extract)

# Keep only the columns used by the analysis below, in this order.
selected_columns = [
    'Order_Number', 'Order_Status', 'Book_Name', 'Order_Date', 'City',
    'year', 'month', 'day', 'weekday',
]
df_new = df[selected_columns]
df_new.head()

In [None]:
# Daily sales: number of non-null Book_Name entries per value of "day".
books_per_day = df.groupby(["day"])["Book_Name"].agg(["count"])
daily_sales = books_per_day.reset_index()
# Display ordered by the "day" column itself, not by the count.
daily_sales.sort_values(by="day", ascending=True)

In [None]:
# Monthly sales: number of non-null Book_Name entries per month number.
books_per_month = df.groupby(["month"])["Book_Name"].agg(["count"])
month_sales = books_per_month.reset_index()
month_sales.sort_values(by="month", ascending=True)

In [None]:
#Finding Top 10 book sale
#
import seaborn as sns

# Figure size and plot style.
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. to 'seaborn-v0_8-white'); confirm this name exists in the target
# environment.
plt.figure(figsize=(30, 15))
plt.style.use('seaborn-white')

# Horizontal count plot of the 10 most frequent titles, split by year, drawn
# in the left cell of a 1x2 subplot grid.
plt.subplot(1, 2, 1)
top_titles = df_new['Book_Name'].value_counts().head(10).index
ax = sns.countplot(
    data=df_new,
    y="Book_Name",
    hue="year",
    order=top_titles,
    palette="pastel",
)

ax.set_xticklabels(ax.get_xticklabels(), fontsize=11, rotation=40, ha="right")
ax.set_title('Top 10 books', fontsize=30)
ax.set_xlabel('Frequency (of purchase)', fontsize=15)
ax.set_ylabel('Top 10 Books', fontsize=15)
plt.tight_layout()

In [None]:
# Top 10 cities by number of rows in df_new.
#
# Both the names and the counts are taken from the same value_counts() result
# so each city stays paired with its own count. The earlier version took the
# counts from value_counts().unique(), which collapses tied counts (shifting
# every later count onto the wrong city) and is not limited to ten entries.
top_city_counts = df_new['City'].value_counts().head(10)
city_sales1 = top_city_counts.index.tolist()  # city names, most frequent first
city_sales2 = top_city_counts.values          # matching counts, same order
city_sales = pd.DataFrame({'City': city_sales1, 'counts': city_sales2})
city_sales

In [None]:
# Bar chart of the per-city counts held in city_sales.
city_names = city_sales["City"]
city_counts = city_sales["counts"]

plt.figure(figsize=(15, 8))
sns.barplot(x=city_names, y=city_counts, color="Orange", label="count")
plt.title("City wise Sale")
plt.xlabel("City")
plt.ylabel("Sale")
plt.xticks(rotation=50)  # tilt the city names by 50 degrees
plt.legend()
plt.show()