# Exploring

In [None]:
!pip install -q -U seaborn
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly as pl
import lightgbm as lgb

## Importing Datasets and Cleaning

In [None]:
# Load the Gufhtugu orders export; 'Order Date' is parsed into datetimes on read.
db = pd.read_csv(
    '../input/gufhtugu-publications-dataset-challenge/GP Orders - 4.csv',
    parse_dates=['Order Date'],
)

# Preview the first rows.
db.head()

In [None]:
# Report how many missing values each column contains.
print('Checking null values in each Columns')
db.isna().sum()

In [None]:
# Find the most frequent book and billing city, then use those values to fill
# the missing entries of the corresponding columns.
# value_counts() sorts by descending frequency, so its first index label is the mode.
bestseller = db['Book Name'].value_counts().index[0]
top_city = db['City (Billing)'].value_counts().index[0]
print("Bestseller :", bestseller)
print("Top City :", top_city)

# Fill with the computed modes instead of hard-coded literals so the imputation
# stays correct if the data changes.
db['Book Name'] = db['Book Name'].fillna(bestseller)
db['City (Billing)'] = db['City (Billing)'].fillna(top_city)

display("Checking null values in each Columns")
db.isnull().sum()

In [None]:
# Derive calendar fields from each order timestamp.
# NOTE: 'Weekday' stores Timestamp.week (the week-of-year number), not the day
# of the week; later cells rename it to 'Week'.
order_stamps = db['Order Date']
extractors = {
    'Date': lambda ts: ts.date(),
    'Time': lambda ts: ts.time(),
    'Month': lambda ts: ts.month,
    'Month_Year': lambda ts: ts.to_period("M"),
    'Year': lambda ts: ts.year,
    'Weekday': lambda ts: ts.week,
}
for column, extract in extractors.items():
    db[column] = [extract(ts) for ts in order_stamps]
db.head()

# Geographical Analysis 

In [None]:
# Reference table of cities; later cells read its city, lat, lng, admin_name
# and population_proper columns.
cities = pd.read_csv(filepath_or_buffer='../input/pakistan-cities/pk.csv')
cities.head()

In [None]:
# Rename 'Sialkot City' to 'Sialkot' so it matches the spelling used when merging with the orders.
cities['city'] = cities['city'].replace({'Sialkot City': 'Sialkot'})

In [None]:
# Count orders per billing city, then attach coordinates, province and
# population from the cities table (inner join: unmatched cities are dropped).
city_counts = db['City (Billing)'].value_counts()
geo = city_counts.rename_axis('City').reset_index(name='counts')
clean_geo = pd.merge(geo, cities, how='inner', left_on='City', right_on='city')

kept_columns = ["City", "lat", "lng", "admin_name", "population_proper", "counts"]
friendly_names = {
    "lat": "Latitude",
    "lng": "Longitude",
    "population_proper": "Population",
    "admin_name": "Province",
    "counts": "Total Orders",
}
clean_geo = clean_geo[kept_columns].rename(columns=friendly_names)
clean_geo.head()

In [None]:
import plotly.graph_objects as go

# Bubble map of orders per city, one trace per rank group.
# NOTE(review): clean_geo presumably keeps the descending order produced by
# value_counts(), so positional slices correspond to rank groups — confirm.
clean_geo['text'] = clean_geo['City'] + '<br> Book Sold ' + clean_geo['Total Orders'].astype(str)

# (start, stop) row positions of each rank group, paired with its marker colour.
limits = [(0,3),(3,9),(9,19),(19,49),(49,3000)]
colors = ["royalblue","crimson","lightseagreen","orange","lightgrey"]
scale = 5000  # not referenced in this cell

fig = go.Figure()

for (start, stop), color in zip(limits, colors):
    # Explicit positional slice of the ranked rows.
    df_sub = clean_geo.iloc[start:stop]
    fig.add_trace(go.Scattergeo(
        lon = df_sub['Longitude'],
        lat = df_sub['Latitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['Total Orders'],
            color = color,
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        name = 'Top {0} - {1}'.format(start + 1, stop)))

fig.update_layout(
        title_text = 'Total Books Sold by Gufhtugu per City',
        showlegend = True,
        # The legend entries are city rank groups, not books.
        legend_title="Top Cities",
        legend_title_font_size=14,
        geo = dict(
            scope = 'asia',
            landcolor = 'rgb(217, 217, 217)',
            # Longitude/latitude window the map is cropped to.
            lonaxis = dict(range = [60.578993, 82.65129]),
            lataxis = dict(range = [24.407138, 36.885931]),
        ),
    )

fig.show()

In [None]:
# Orders per (city, month), enriched with the per-city columns of clean_geo
# and ordered by month.
month = (
    db[['City (Billing)', 'Month']]
    .value_counts()
    .rename_axis(['City', 'Month'])
    .reset_index(name='counts')
)
month_geo = month.merge(clean_geo, how='inner', left_on='City', right_on='City')
month_geo = month_geo.sort_values(['Month'])
month_geo.head()

In [None]:
import plotly.express as px

# Animated bubble chart: one frame per month, one bubble per city; x is the
# city's overall order total (log scale), y and bubble size its count that month.
month_scatter_options = dict(
    x="Total Orders",
    y="counts",
    animation_frame="Month",
    animation_group="City",
    size="counts",
    color="Province",
    hover_name="City",
    title='Number of Books bought by cities over month',
    log_x=True,
    size_max=40,
    range_x=[1,3000],
    range_y=[0,500],
)
px.scatter(month_geo, **month_scatter_options)

In [None]:
# Orders per (city, week number), enriched with the per-city columns of
# clean_geo and ordered by week.
weekday = (
    db[['City (Billing)', 'Weekday']]
    .value_counts()
    .rename_axis(['City', 'Week'])
    .reset_index(name='counts')
)
weekday_geo = weekday.merge(clean_geo, how='inner', left_on='City', right_on='City')
weekday_geo = weekday_geo.sort_values(['Week'])

In [None]:
# Animated bubble chart: one frame per week number, one bubble per city.
week_scatter_options = dict(
    x="Total Orders",
    y="counts",
    animation_frame="Week",
    animation_group="City",
    size="counts",
    color="Province",
    hover_name="City",
    title='Number of Books bought by cities over Week',
    log_x=True,
    size_max=40,
    range_x=[1,3000],
    range_y=[0,200],
)
px.scatter(weekday_geo, **week_scatter_options)

In [None]:
# Bar chart of the total number of orders in each calendar month.
month_totals = db['Month'].value_counts()
total_month = month_totals.rename_axis(['Month']).reset_index(name='counts')

# Global seaborn look and figure size for this and later plots.
sns.set(rc={'figure.figsize': (12, 8)}, style="whitegrid", palette="muted")

ax = sns.barplot(x="Month", y="counts", data=total_month, palette='CMRmap')
ax.set(ylabel="", title="Number of Books sold per month");

In [None]:
# Attach city metadata to every order (left join keeps orders whose city has
# no match), then index the result by order number.
db = db.merge(cities, how='left', left_on='City (Billing)', right_on='city')
db = db.set_index('Order Number')


In [None]:
# Compact per-order view with shorter column names for the plots below.
selected_columns = [
    "Order Status", "Book Name", "Date", "Time", "City (Billing)",
    "lat", "lng", "population_proper", "admin_name", "Month",
]
short_names = {
    "Order Status": "Status",
    "Book Name": "Book",
    "City (Billing)": "City",
    "lat": "Latitude",
    "lng": "Longitude",
    "population_proper": "Population",
    "admin_name": "Province",
}
total = db[selected_columns].rename(columns=short_names)
total.head()

In [None]:
import matplotlib as mpl

# Stacked histogram of orders per month, split by province, with a KDE overlay.
ax = sns.histplot(data=total, x="Month", hue="Province",
    multiple="stack",
    palette="rocket",
    edgecolor=".3",binwidth=1,kde=True,
    linewidth=.5)
ax.set(ylabel="",title="Number of Books sold to province per month")

# Pin the tick positions before labelling them. Labelling whatever ticks the
# automatic locator happened to choose can attach a month name to the wrong
# position when the axis limits change.
ax.set_xticks(list(range(2, 13, 2)))
ax.set_xticklabels(['Feb', 'Apr', 'Jun', 'Aug', 'Oct', 'Dec']);

# Simple Regression on Status

In [None]:
# Monthly order counts per status.
status = (
    db[['Order Status', 'Month']]
    .value_counts()
    .rename_axis(['Status', 'Month'])
    .reset_index(name='counts')
)

# Fixed colour for each order status (reused by later plots).
pal = {"Completed": "#6495ED", "Returned": "#F08080", "Canceled": "#90ee90"}

# One panel per status: monthly counts with a fitted linear trend.
g = sns.lmplot(
    data=status,
    x="Month",
    y="counts",
    col="Status",
    hue="Status",
    palette=pal,
    y_jitter=.02,
    logistic=False,
    truncate=True,
);

In [None]:
# Monthly order counts per (status, province).
status_pro = (
    total[['Status', 'Month', 'Province']]
    .value_counts()
    .rename_axis(['Status', 'Month', 'Province'])
    .reset_index(name='counts')
)

# Scatter of counts per month: colour encodes status, marker size encodes province.
relplot_options = dict(
    x="Month",
    y="counts",
    hue="Status",
    size="Province",
    palette=pal,
    sizes=(10, 200),
    alpha=0.9,
    height=8,
    aspect=1.2,
)
g = sns.relplot(data=status_pro, **relplot_options)
g.set(yscale="log")
g.set(ylabel="", title="Status of Books sold to Province per month")
g.despine(left=True, bottom=True);

## CatBoost for Further Prediction

In [None]:
from catboost import CatBoostRegressor,Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## Prediction By Weekday

In [None]:
# Training table for the weekly model: number of orders for every
# (status, year, province, city, week number) combination.
weekly_keys = ['Order Status', 'Year', 'admin_name', 'City (Billing)', 'Weekday']
weekly_names = ['Status', 'Year', 'Province', 'City', 'Week']
train = db[weekly_keys].value_counts().rename_axis(weekly_names).reset_index(name='counts')

In [None]:
# Make sure the two numeric features are stored as integers.
train = train.astype({'Year': 'int', 'Week': 'int'})


In [None]:
# Separate features from the target and hold out 30% of the rows for validation.
DataX = train.drop(columns=['counts'])
Datay = train['counts'].values
# Seed the split so that reruns produce the same train/validation partition.
x_train, x_val, y_train, y_val = train_test_split(DataX, Datay, test_size=0.3,
                                                  random_state=23)

In [None]:
# Wrap both splits in CatBoost pools, declaring which columns are categorical.
categorical_columns = ['Status','Province','City']
train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_columns)
test_pool = Pool(data=x_val, label=y_val, cat_features=categorical_columns)

In [None]:
# Regressor configuration. od_type/od_wait configure CatBoost's overfitting
# detector, which stops training once the evaluation metric stops improving.
weekly_model_params = {
    'iterations': 10000,
    'learning_rate': 0.001,
    'depth': 12,
    'eval_metric': 'RMSE',
    'random_seed': 23,
    'od_type': 'Iter',
    'metric_period': 100,
    'od_wait': 100,
}
model = CatBoostRegressor(**weekly_model_params)

In [None]:
# Train on the training pool, evaluating on the validation pool and keeping
# the best iteration; plot=True shows CatBoost's training chart.
fit_options = dict(eval_set=test_pool, use_best_model=True, verbose=False, plot=True)
model.fit(train_pool, **fit_options);

In [None]:
# Predicted order counts for the validation rows.
predict = model.predict(data=test_pool)

In [None]:
# Per-bin statistics for the feature at index 1 of the training pool.
res = model.calc_feature_statistics(data=train_pool, feature=1, plot=True)

In [None]:
import shap
# Load SHAP's JavaScript so the interactive force plots can render in the notebook.
shap.initjs()

## Using Shap for feature dependency 

In [None]:
# Compute SHAP values for every training row.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_train)

# Explain the first training row's prediction.
first_row = 0
shap.force_plot(explainer.expected_value, shap_values[first_row], x_train.iloc[first_row])

In [None]:
# Stacked force plot covering all training rows.
shap.force_plot(explainer.expected_value, shap_values=shap_values, features=x_train)

In [None]:
# How the SHAP contribution of 'Week' varies with the week number.
shap.dependence_plot("Week", shap_values=shap_values, features=x_train)

In [None]:
# Overview of every feature's SHAP values across the training rows.
shap.summary_plot(shap_values, features=x_train)

In [None]:
# Explain the prediction for the training row at position 91.
shap.force_plot(explainer.expected_value, shap_values[91], x_train.iloc[91])

In [None]:
# Compare the held-out targets with the model's predictions, row by row.
x = range(len(predict))
y = y_val
# y_val is the validation target, so it is labelled as actual values rather than 'Train'.
ax = sns.lineplot(x=x, y=y, label='Actual (validation)', color='#9FD1FF')
ax = sns.lineplot(x=x, y=predict, label='Predicted', color='#FF8B8B')
ax.set(title='Actual and predicted counts on the validation split (Weekday model)');

## Test model with real world orders

In [None]:
# One hand-written scenario, listed in the same order as the training columns
# (Status, Year, Province, City, Week).
Status, Year, Province, City, Week = 'Completed', 2021, 'Punjab', 'Lahore', 20
data1 = [Status, Year, Province, City, Week]
test1 = model.predict(data1)

In [None]:
# Print the prediction for the single scenario as a sentence.
sentence_parts = (
    int(test1), "Books Order", Status, "in week", Week, "-", Year,
    " from ", City, ",", Province, ".",
)
print(*sentence_parts)

In [None]:
# Several hand-written scenarios, one list per feature.
Status =['Completed','Completed','Completed','Returned']
Year = [2021,2021,2022,2021]
Province = ['Punjab','Punjab','Sindh','Punjab']
City = ['Lahore','Bahawalpur','Karachi','Lahore']
Week= [20,10,13,14]

# Build the frame column by column so Year and Week keep their integer dtype.
# Stacking the mixed lists through NumPy would turn every value into a string.
data2 = pd.DataFrame({
    'Status': Status,
    'Year': Year,
    'Province': Province,
    'City': City,
    'Week': Week,
})
test2 = model.predict(data2).astype('int')

In [None]:
# Print one sentence per scenario, numbered from 1.
# The original message repeated the word "in" ("in in week").
for i, (count, order) in enumerate(zip(test2, data2.itertuples(index=False)), start=1):
    print(i,')',count,"Books Order",order.Status,"in week",order.Week,"-",order.Year,
          " from ",order.City,",",order.Province,".\n" )

In [None]:
# Forecast future years by shifting the historical feature rows forward
# (2019 -> 2021, 2020 -> 2022) and predicting their counts.
# Work on a copy: aliasing DataX would mutate it in place (new Year values and
# an extra 'counts' column), which breaks this cell when it is run again.
Week = DataX.copy()
Week['Year'] = Week['Year'].replace([2019,2020],[2021,2022])

ypred = model.predict(Week)
Week['counts'] = ypred.astype('int')

# Historical rows followed by the forecast rows.
Totaldf = pd.concat([train, Week], axis=0)
Totaldf.shape

In [None]:

# Counts per week number, one line per year, on a logarithmic y axis.
ax = sns.lineplot(data=Totaldf, x='Week', y='counts', hue='Year', palette='Set2')
ax.set(yscale="log", title='Trainig and predicted by Weekday data over the time');

## Prediction by Month

In [None]:
# Training table for the monthly model: number of orders for every
# (status, month, year, province, city) combination.
train1=db[['Order Status','Month','Year','admin_name','City (Billing)']].value_counts().rename_axis(['Status','Month','Year','Province','City']).reset_index(name='counts')
train1[['Year','Month']]=train1[['Year','Month']].astype('int')

# Separate features from the target and hold out 30% of the rows for validation.
DataX = train1.drop(columns=['counts'])
Datay = train1['counts'].values
# Seed the split so that reruns produce the same train/validation partition.
x_train, x_val, y_train, y_val = train_test_split(DataX, Datay, test_size=0.3,
                                                  random_state=23)

# Wrap both splits in CatBoost pools, declaring which columns are categorical.
train_pool = Pool(x_train,
                  y_train,
                  cat_features=['Status','Province','City'])
test_pool = Pool(x_val,
                  y_val,
                  cat_features=['Status','Province','City'])

In [None]:
# Monthly regressor: same configuration as the weekly model. od_type/od_wait
# configure CatBoost's overfitting detector.
monthly_model_params = {
    'iterations': 10000,
    'learning_rate': 0.001,
    'depth': 12,
    'eval_metric': 'RMSE',
    'random_seed': 23,
    'od_type': 'Iter',
    'metric_period': 100,
    'od_wait': 100,
}
model = CatBoostRegressor(**monthly_model_params)

# Train on the training pool, evaluating on the validation pool and keeping the best iteration.
monthly_fit_options = dict(eval_set=test_pool, use_best_model=True, verbose=False, plot=True)
model.fit(train_pool, **monthly_fit_options);

In [None]:
# Predicted order counts for the validation rows of the monthly model.
predict = model.predict(data=test_pool)

# Per-bin statistics for the feature at index 1 of the training pool.
res = model.calc_feature_statistics(data=train_pool, feature=1, plot=True)

In [None]:
# Compare the held-out targets with the monthly model's predictions, row by row.
x = range(len(predict))
y = y_val
# y_val is the validation target, so it is labelled as actual values rather than 'Train'.
ax = sns.lineplot(x=x, y=y, label='Actual (validation)', color='#9FD1FF')
ax = sns.lineplot(x=x, y=predict, label='Predicted', color='#FF8B8B')
ax.set(title='Actual and predicted counts on the validation split (Month model)');

In [None]:
# Forecast future years with the monthly model by shifting the historical
# feature rows forward (2019 -> 2021, 2020 -> 2022).
Make = train1.drop(columns=['counts'])
Week = Make  # same object as Make; holds the monthly features despite its name
Week['Year'] = Week['Year'].replace({2019: 2021, 2020: 2022})

ypred = model.predict(Week)
Week['counts'] = ypred.astype('int')

# Historical rows followed by the forecast rows, one line per year on a log y axis.
Total = pd.concat([train1, Week], axis=0)
ax = sns.lineplot(data=Total, x='Month', y='counts', hue='Year', palette='Set2')
ax.set(yscale="log", title='Trainig and predicted by Month data over the time');

## Simpler Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Load the second orders export and derive calendar fields from its timestamp.
ReadDb=pd.read_csv('../input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv',
                  parse_dates=['Order Date & Time'])
ReadDb['Month'] = [d.month for d in ReadDb['Order Date & Time']]
ReadDb['Month_Year'] = [d.to_period("M") for d in ReadDb['Order Date & Time']]
ReadDb['Year'] = [d.year for d in ReadDb['Order Date & Time']]
# Timestamp.week is the week-of-year number, despite the column name.
ReadDb['Weekday'] = [d.week for d in ReadDb['Order Date & Time']]

# Orders per (month, year), converted to a sales figure.
Revenue=ReadDb[['Month','Year']].value_counts().rename_axis(['Month','Year']).\
   reset_index(name='counts')
# NOTE(review): presumably a flat value of 500 per order — confirm.
Revenue["Sale"]=Revenue['counts']*500

# Months to forecast: February through December 2021.
Xpred=list(range(2,13))
pred=pd.DataFrame({'Month': Xpred, 'Year': 2021})

# Fit sales against (Month, Year) in chronological order.
Revenue.sort_values(by=['Year','Month'],inplace=True)
reg=LinearRegression()
X = Revenue.drop(columns=['counts','Sale'])
y = Revenue['Sale'].values
reg.fit(X,y)

prediction_rev=reg.predict(pred)

# Derive the x positions from the data instead of hard-coding the number of
# historical months; the forecast starts at the last historical position.
n_past = len(Revenue)
future_start = n_past - 1
ax=sns.lineplot(x=list(range(n_past)),y=Revenue.Sale,label="Past Sale",color='#9FD1FF');
ax=sns.lineplot(x=list(range(future_start, future_start + len(prediction_rev))),y=prediction_rev,
                label="Future Sale",color='#FF8B8B');
ax.set(title='Predicting future sales till December 2021 Using Linear Regression ');

# Best Seller

### Best Seller per year

In [None]:
# Number of orders for every (book, year) pair.
Year_books = (
    db[['Book Name', 'Year']]
    .value_counts()
    .rename_axis(['Book', 'Year'])
    .reset_index(name='counts')
)

In [None]:
# Ten most-ordered books of each year.
Year2019, Year2020, Year2021 = (
    Year_books[Year_books['Year'] == year].nlargest(10, 'counts')
    for year in (2019, 2020, 2021)
)

In [None]:
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 10))

# One panel per year: (axes, year, top-10 frame, English display titles in bar order).
# NOTE(review): the 2020 list has 11 entries for at most 10 bars and repeats
# 'Blockchain'; the surplus trailing entry is dropped below — confirm which
# entry was meant to be removed.
panels = [
    (ax1, 2019, Year2019,
     ["Data Science",'Kaggle', 'R ka Taaruf','(C++)','Apna Elaaj','Shaoor','Molo Masali',
      'Kaggle for Begginers','CryptoCurrency','Blockchain']),
    (ax2, 2020, Year2020,
     ["Earn Money",'Python Programming','Product Management', 'Blockchain','Justju ka safar',
      'Artificial Intelligence','Molo Masali','(C++)',
      'Python Programming 2020','Sukkur To Florida','Blockchain']),
    (ax3, 2021, Year2021,
     ["Lucky Draw",'Earn Money','Column Nigari', 'Python Programming','Waqfa e Pareshani',
      'Data Science','Arif Kareem','Machine Learning',
      'Artificial Intelligence','Blockchain']),
]

# Draw the bars; the y label carries the year.
for ax, year, top_books, _ in panels:
    sns.barplot(x='Book', y='counts', palette='Set2', ax=ax, data=top_books)
    ax.axhline(0, color="k", clip_on=False)
    ax.set_ylabel(str(year))

# Hide the y ticks and bottom spine; the counts are written above the bars instead.
sns.despine(bottom=True)
plt.setp(f.axes, yticks=[])

for ax, _, _, english_titles in panels:
    # Write each bar's height just above it.
    for p in ax.patches:
        ax.annotate(format(p.get_height(), '1.0f'),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center',
                    xytext = (0, 9),
                    textcoords = 'offset points')

    # Use exactly one label per tick: surplus English titles are dropped and
    # missing ones fall back to the raw book name, because matplotlib can
    # reject a label list whose length differs from the number of ticks.
    raw_titles = [tick.get_text() for tick in ax.get_xticklabels()]
    labels = english_titles[:len(raw_titles)] + raw_titles[len(english_titles):]
    ax.set_xticklabels(labels, rotation=90)

plt.tight_layout(h_pad=2);
for ax, year, _, _ in panels:
    ax.set(title='Best Seller in {}'.format(year));

* In 2019, Data Science was the bestseller with 303 copies; the rest of the top ten did not come close.
* In 2020 the trend changed as the pandemic started: people were stuck at home, so more of them ordered books online. The bestseller was Earn Money Online with 2206 copies, which appears to be directly related to people losing their jobs and being stuck at home.
* For 2021 we have limited data, so we can only assume that sales have increased since the pandemic and that people's reading habits have changed. The bestseller is the Lucky Draw book, which is a gift, and the second best was Earn Money Online with 373 copies sold in the first few days.

In [None]:
# Number of orders for every (book, month, year) combination.
Month_books = (
    db[['Book Name', 'Month', 'Year']]
    .value_counts()
    .rename_axis(['Book', 'Month', 'Year'])
    .reset_index(name='counts')
)

In [None]:
# Best-selling book for each of months 10-12 of 2019.
Month_2019=Month_books.sort_values(by=['counts'],ascending=False)
Month_2019=Month_2019[Month_2019['Year']==2019]
Range=  pd.Series(range(10,13))

# Pick the top row of each month in a single pass. Months without any rows are
# skipped, and counts stay numeric instead of passing through a mixed-type
# NumPy concatenation.
in_range = Month_2019[Month_2019['Month'].isin(Range)]
top_rows = in_range.loc[in_range.groupby('Month')['counts'].idxmax()]

# Derive the month labels from the data rather than from a hand-written list.
month_abbr = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
Total=pd.DataFrame({
    'Month': [month_abbr[int(m) - 1] for m in top_rows['Month']],
    'Book': top_rows['Book'].to_numpy(),
    'Sale': top_rows['counts'].to_numpy(),
})
Total

In [None]:
# Horizontal bars of each month's bestseller in 2019, coloured by month.
ax4=sns.barplot(data=Total,y='Book',x='Sale',hue='Month',palette='Set2',orient='h'
                ,dodge=False)

# English display titles in bar order. Use exactly one label per tick: surplus
# titles are dropped and missing ones fall back to the raw book name.
english_titles = ["Kaggle for Begginers",'Apna Elaaj Khud Karay','Data Science']
raw_titles = [tick.get_text() for tick in ax4.get_yticklabels()]
ax4.set_yticklabels(english_titles[:len(raw_titles)] + raw_titles[len(english_titles):])
sns.despine(bottom=True)

# The former plt.setp(f.axes, yticks=[]) call was removed: it acted on the
# figure `f` created in a different cell, not on this plot.
ax4.set(title='Bestseller by Month in 2019 ');

In [None]:
# Best-selling book for each month of 2020.
Month_2020=Month_books.sort_values(by=['counts'],ascending=False)
Month_2020=Month_2020[Month_2020['Year']==2020]
Range=  pd.Series(range(1,13))

# Pick the top row of each month in a single pass. Months without any rows are
# skipped, and counts stay numeric instead of passing through a mixed-type
# NumPy concatenation.
in_range = Month_2020[Month_2020['Month'].isin(Range)]
top_rows = in_range.loc[in_range.groupby('Month')['counts'].idxmax()]

# Derive the month labels from the data rather than from a hand-written list.
month_abbr = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
Total=pd.DataFrame({
    'Month': [month_abbr[int(m) - 1] for m in top_rows['Month']],
    'Book': top_rows['Book'].to_numpy(),
    'Sale': top_rows['counts'].to_numpy(),
})
Total

In [None]:
# Horizontal bars of each month's bestseller in 2020, coloured by month.
ax4=sns.barplot(data=Total,y='Book',x='Sale',hue='Month',palette='Set2',orient='h'
                ,dodge=False,alpha=0.9)

# English display titles in bar order. Use exactly one label per tick: surplus
# titles are dropped and missing ones fall back to the raw book name.
english_titles = ["C++",'Data Science','Justju ka safar-1','Artificial Intelligence',
                  'Product Management','Python Programming','Earn Money From Internet']
raw_titles = [tick.get_text() for tick in ax4.get_yticklabels()]
ax4.set_yticklabels(english_titles[:len(raw_titles)] + raw_titles[len(english_titles):])
sns.despine(bottom=True)

# The former plt.setp(f.axes, yticks=[]) call was removed: it acted on the
# figure `f` created in a different cell, not on this plot.
ax4.set(title='Bestseller by Month in 2020 ');

In [None]:
# Best-selling book of January 2021.
Month_2021 = Month_books.sort_values(by=['counts'], ascending=False)
Month_2021 = Month_2021.loc[Month_2021['Year'] == 2021]
is_january = Month_2021['Month'] == 1
Total = Month_2021[is_january].nlargest(1, 'counts')
Total

# Work in progress 👨‍🔧 and if you like my work do "up vote" ☝