# Pakistan's Ecommerce Explained

This is the largest retail e-commerce orders dataset from Pakistan. It contains half a million transaction records from March 2016 to August 2018. The data was collected from various e-commerce merchants as part of a research study.
There is a dire need for such a dataset to learn about Pakistan’s emerging e-commerce potential, and I hope it will help many startups in many ways.

# Setting Up Our Environment

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
import sys
plt.rcParams['figure.figsize']=(12,5)
warnings.filterwarnings("ignore")  # silence library warnings to keep the notebook output readable
np.set_printoptions(threshold=sys.maxsize)  # never summarise printed NumPy arrays
# Show every row, every column and the full cell contents when displaying DataFrames.
# `display.max_colwidth` must be None to mean "no limit": the legacy value -1 was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import plotly.express as px
import plotly.graph_objs as go
import plotly as py
from plotly import tools
from plotly.offline import iplot
%matplotlib inline

# Loading Our Data

In [None]:
# Load the Kaggle CSV into `df`, parsing the two date columns while reading.
# low_memory=False makes pandas process the file in one pass when inferring dtypes.
df = pd.read_csv(
    "/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv",
    parse_dates=["created_at", "Working Date"],
    low_memory=False,
)
df.head()

#### Shape and Size

In [None]:
# Print the (rows, columns) tuple on one line and the total cell count on the next.
print(df.shape, df.size, sep="\n")

In [None]:
# List the column labels, then print the per-column dtype / non-null summary.
print("Columns:", df.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print() (that would emit a stray "Info: None" line).
print("Info:")
df.info()

**Dropping the last 5 columns and the rows in which all values are NA**

In [None]:
# Cut off the five trailing columns, then discard every row in which all of the
# remaining fields are missing.
df = df.iloc[:, :-5].dropna(how="all", axis=0)

In [None]:
# Show the current (rows, columns) dimensions of df.
df.shape

**Renaming the `MV` column because its name contains leading and trailing spaces**

In [None]:
# Strip the padding from the " MV " header so the column can be addressed as "MV",
# then display the resulting column labels.
df = df.rename(columns={" MV ": "MV"})
df.columns

#### Changing Data Types

In [None]:
# Cast the identifier columns to strings and the quantity / date-part columns to
# integers, one column at a time and in this order.
# NOTE: converting "MV" to float is currently disabled.
for _column, _dtype in {
    "item_id": str,
    "qty_ordered": int,
    "Year": int,
    "Month": int,
    "Customer ID": str,
}.items():
    df[_column] = df[_column].astype(_dtype)

In [None]:
# Preview the first rows of df.
df.head()

## Let's review data 

In [None]:
# Descriptive statistics of df using pandas' default column selection.
df.describe()

In [None]:
# Descriptive statistics restricted to the object-dtype (string-like) columns.
df.describe(include=['object'])

In [None]:
# Order the rows by their `created_at` value (ascending) and preview the result.
# NOTE: promoting `created_at` to the index is currently disabled.
df = df.sort_values(by="created_at")
df.head()

# Mobile and Tablet is the best selling Category

In [None]:
# Sum of grand_total for each category_name_1 value, largest total first.
cat_total = (
    df.groupby("category_name_1")["grand_total"]
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the summed grand_total per category, with vertical tick labels.
sns.set_style("whitegrid")
plt.figure(figsize=(12, 5))
ax = sns.barplot(x=cat_total.index, y=cat_total.values)
ax.set(xlabel="Category", ylabel="Sold")
ax.set_xticklabels(cat_total.index, rotation="vertical", fontsize=10)
plt.show()

# Order Status

In [None]:
# Sum of grand_total per order status, keeping the ten largest totals.
ord_stat = (
    df.groupby(["status"])["grand_total"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Bar chart of the summed grand_total for each order status in ord_stat.
plt.figure(figsize=(12,5))
ax = sns.barplot(x = ord_stat.index, y= ord_stat.values)
# ord_stat is indexed by order status and holds summed grand_total values, so
# the axes are labelled as status / total sales rather than method / order count.
ax.set_xlabel('Order Status')
ax.set_ylabel('Total Sales (sum of grand_total)')
ax.set_xticklabels(ord_stat.index, rotation='vertical', fontsize=10)
plt.show()

# Payment Methods 


In [None]:
# Sum of grand_total per payment method (largest first), drawn as a plotly bar
# chart whose bars carry their value as an abbreviated text label.
pay_met = (
    df.groupby(["payment_method"])["grand_total"]
    .sum()
    .sort_values(ascending=False)
)
fig = px.bar(pay_met, x=pay_met.index, y=pay_met.values, text="grand_total")
fig.update_traces(textposition="outside", texttemplate="%{text:.2s}")
fig.update_layout(uniformtext_mode="hide", uniformtext_minsize=8)
fig.show()

# Top 5 Payment Methods Vs Completed

In [None]:
# Keep only completed orders that were paid with one of the selected payment
# methods, then count how many such rows each method has.
selected_methods = [
    "cod",
    "Payaxis",
    "Easypay",
    "bankalfalah",
    "easypay_voucher",
    "jazz_voucher",
]
df2 = df[["payment_method", "status"]]
is_selected_method = df2["payment_method"].isin(selected_methods)
is_complete = df2["status"] == "complete"
df2 = df2[is_selected_method & is_complete]
df_2 = df2["payment_method"].value_counts()

In [None]:
# Show the first entries of the per-method counts (value_counts lists the most frequent first).
df_2.head()

In [None]:
# Plotly bar chart of the completed-order counts per payment method.
fig = px.bar(df_2, x=df_2.index, y=df_2.values).update_layout(
    uniformtext_minsize=8,
    uniformtext_mode="hide",
)
fig.show()

In [None]:
# Selecting categorical data 
df3 = df[['status','payment_method']]
# Coverting to dummies
df_dummies = pd.get_dummies(df3) 

In [None]:
# Heatmap of the pairwise correlations between all status / payment-method
# indicator columns (no cell annotations).
plt.figure(figsize=(16, 10))
sns.heatmap(data=df_dummies.corr(), annot=False)

In [None]:
# One-hot encode the filtered payment_method / status frame (df2).
df2_dummies = pd.get_dummies(df2)

In [None]:
# Annotated heatmap of the pairwise correlations between the indicator columns
# derived from the filtered frame.
plt.figure(figsize=(16, 10))
sns.heatmap(data=df2_dummies.corr(), annot=True)

# No. of Orders per Month-Year

In [None]:
# Number of rows per 'M-Y' value, ordered from the smallest count to the largest.
month_year_counts = df["M-Y"].value_counts()
df_month_year = month_year_counts.sort_values()

In [None]:
# Display the per-'M-Y' row counts.
df_month_year

In [None]:
# Tall bar chart of the row count for every 'M-Y' value, in the order held by
# df_month_year, with vertical tick labels.
plt.figure(figsize=(12, 15))
ax = sns.barplot(x=df_month_year.index, y=df_month_year.values)
ax.set(xlabel="Month_Year", ylabel="Number of Orders")
ax.set_xticklabels(labels=df_month_year.index, fontsize=10, rotation="vertical")
plt.show()

### To be Continued...

You can fork this kernel and continue your analysis.


If you find it helpful, please **upvote!!!**