#      Kindly hold on, it will take a few seconds to load the full notebook



# Data Description

This is the largest retail e-commerce orders dataset from Pakistan. It contains half a million transaction records from March 2016 to August 2018. The data was collected from various e-commerce merchants as part of a research study.

Geography: Pakistan

Time period: 03/2016 – 08/2018

Unit of analysis: E-Commerce Orders

Dataset: The dataset contains detailed information of half a million e-commerce orders in Pakistan from March 2016 to August 2018. It contains item details, shipping method, payment method like credit card, Easy-Paisa, Jazz-Cash, cash-on-delivery, product categories like fashion, mobile, electronics, appliance etc., date of order, SKU, price, quantity, total and customer ID. This is the most detailed dataset about e-commerce in Pakistan that you can find in the Public domain.

Variables: The dataset contains Item ID, Order Status (Completed, Cancelled, Refund), Date of Order, SKU, Price, Quantity, Grand Total, Category, Payment Method and Customer ID.
Size: 101 MB
File Type: CSV


**Please upvote if you find this notebook helpful! 😊 Thank you! I would also be very happy to receive feedback on my work.**

In [None]:
import numpy as np
import pandas as pd
# plotting stuff
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
colorMap = sns.light_palette("blue", as_cmap=True)
import datatable as dt
# misc
import missingno as msno
# system
import warnings
warnings.filterwarnings('ignore')
# garbage collector to keep RAM in check
import gc  
import matplotlib.gridspec as gridspec
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# Data Overview

In [None]:
# Notebook shell escape: print the line count of each CSV in the dataset folder.
!wc -l ../input/pakistans-largest-ecommerce-dataset/*.csv

In [None]:
# Read the CSV with datatable, convert it to a pandas DataFrame, and preview
# the first ten rows.
csv_path = '/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
df = dt.fread(csv_path).to_pandas()
df.head(10)

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Number of rows and columns in the dataset, shown as a (rows, columns) tuple
df.shape

In [None]:
# Show every row that has a null value in at least one column
has_null = df.isnull().any(axis=1)
df[has_null]

In [None]:
# Preview the first five rows whose item_id repeats an earlier row's item_id
is_repeat_item = df.duplicated(subset=['item_id'])
df[is_repeat_item].head(5)

There are too many NaN values

In [None]:
# Print the distinct order statuses, then how many rows carry each one.
status_col = df['status']
print("Unique Status : ")
print(status_col.unique())
print("Unique Status Value Count: ")
print(status_col.value_counts())

In [None]:
# Label rows that have no order status as "unknown".
# Both NaN and the empty string count as missing, so the result does not
# depend on how the CSV reader chose to encode empty fields.
status_missing = df['status'].isna() | df['status'].eq("")
df.loc[status_missing, 'status'] = "unknown"

In [None]:
#print(df[df['status'] == ""]["status"].value_counts())

# Order Status

In [None]:
# Histogram of how often each order status occurs.
px.histogram(
    df,
    x='status',
    title='Frequency of Order Status',
    width=800,
    height=500,
)

**There is a large number of orders (464066) whose status is unknown.**

In [None]:
# Order-status histogram with the bars coloured by category.
layout_opts = {
    'title_text': 'Order status with Respect to category name',
    'xaxis_title_text': 'Order Status',
    'yaxis_title_text': 'Count',
}
fig = px.histogram(df, x="status", color="category_name_1")
fig.update_layout(**layout_opts)
fig.show()

In [None]:
# Sum grand_total per order status and draw it as a labelled bar chart.
status_totals = df.groupby(['status'])['grand_total'].sum()
n = status_totals.reset_index()
fig = px.bar(n, x='status', y='grand_total', color="status", text='grand_total')
# Put the abbreviated total above each bar; labels smaller than size 8 are hidden.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(uniformtext_mode='hide', uniformtext_minsize=8)
fig.show()


In [None]:
# One grand_total sum per (created_at, status) pair, shown as one box per status.
per_date_status = df.groupby(['created_at', 'status'])['grand_total'].sum()
n = per_date_status.reset_index()
px.box(n, y="grand_total", color="status")

# Category Name

In [None]:
# Label rows that have no category as "unknown".
# Both NaN and the empty string count as missing, so the result does not
# depend on how the CSV reader chose to encode empty fields.
category_missing = df['category_name_1'].isna() | df['category_name_1'].eq("")
df.loc[category_missing, 'category_name_1'] = "unknown"

In [None]:
# Histogram of how often each category name occurs.
px.histogram(
    df,
    x='category_name_1',
    title='Frequency of category name',
    width=800,
    height=500,
)

In [None]:
# Sum grand_total per category and draw it as a labelled bar chart.
category_totals = df.groupby(['category_name_1'])['grand_total'].sum()
n = category_totals.reset_index()
fig = px.bar(n, x='category_name_1', y='grand_total', color='category_name_1', text='grand_total')
# Put the abbreviated total above each bar; labels smaller than size 8 are hidden.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(uniformtext_mode='hide', uniformtext_minsize=8)
fig.show()


In [None]:
# One grand_total sum per (created_at, category) pair, shown as one box per category.
per_date_category = df.groupby(['created_at', 'category_name_1'])['grand_total'].sum()
n = per_date_category.reset_index()
px.box(n, y="grand_total", color="category_name_1")

# Payment Method

In [None]:
# Histogram of how often each payment method occurs.
px.histogram(
    df,
    x='payment_method',
    title='Frequency of payment method',
    width=800,
    height=500,
)

In [None]:
# Sum grand_total per payment method and draw it as a labelled bar chart.
payment_totals = df.groupby(['payment_method'])['grand_total'].sum()
n = payment_totals.reset_index()
fig = px.bar(n, x='payment_method', y='grand_total', color="payment_method", text='grand_total')
# Put the abbreviated total above each bar; labels smaller than size 8 are hidden.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(uniformtext_mode='hide', uniformtext_minsize=8)
fig.show()


In [None]:
# Order-status histogram with the bars coloured by payment method.
layout_opts = {
    'title_text': 'Order status with Respect to payment_method',
    'xaxis_title_text': 'Order Status',
    'yaxis_title_text': 'Count',
}
fig = px.histogram(df, x="status", color="payment_method")
fig.update_layout(**layout_opts)
fig.show()




In [None]:
# Sum grand_total for every (Year, status) pair and plot it per year,
# coloured by status. The chart title typo ("Graand") is corrected.
n = df.groupby(['Year', 'status'])['grand_total'].sum().reset_index()
fig = px.bar(n, x="Year", y="grand_total", color="status", title="Grand Total w.r.t status and year")
fig.show()

In [None]:
# Sum grand_total for every (Year, payment_method) pair and plot it per year,
# coloured by payment method. The chart title typo ("Graand") is corrected.
n = df.groupby(['Year', 'payment_method'])['grand_total'].sum().reset_index()
fig = px.bar(n, x="Year", y="grand_total", color="payment_method", title="Grand Total w.r.t payment_method and year")
fig.show()

In [None]:
# Funnel-area chart of grand_total by order status.
# px.funnel_area already returns a plotly Figure, so it is used directly
# instead of being copied through go.Figure(...).
fig = px.funnel_area(names=df["status"], values=df["grand_total"])
fig.show()

In [None]:
# Inputs for the Sankey diagram: node labels plus three parallel link lists
# (source node index, target node index, link weight).
status_names = [
    'canceled', 'closed', 'cod', 'complete', 'exchange', 'fraud', 'holded',
    'order_refunded', 'paid', 'payment_review', 'pending', 'pending_paypal',
    'processing', 'received', 'refund',
]
year_names = ['2016', '2017', '2018']

# Node 0 is the root, nodes 1-3 are the years, nodes 4-6 are one 'Status' hub
# per year, and nodes 7-51 are one copy of the status list per hub.
label = (
    ['Year']
    + year_names
    + ['Status'] * len(year_names)
    + status_names * len(year_names)
)

# Links: root -> each year, each year -> its hub, each hub -> its own statuses.
source = [0, 0, 0] + [1, 2, 3]
for hub in (4, 5, 6):
    source += [hub] * len(status_names)

# Every node except the root is the target of exactly one link, in node order.
target = list(range(1, len(label)))

# The six structural links carry weight 1. The remaining weights are
# hard-coded per-status figures, one row per hub (presumably order counts
# per year and status -- verify against the data).
value = [1] * 6 + [
    # hub 4 (under '2016')
    39264, 212, 0, 74610, 4, 7, 6, 14053, 0, 0, 0, 0, 0, 2885, 2868,
    # hub 5 (under '2017')
    94981, 186, 1621, 123489, 0, 3, 0, 32541, 599, 0, 0, 7, 0, 34119, 3370,
    # hub 6 (under '2018')
    66896, 96, 1234, 35585, 0, 0, 25, 12911, 560, 57, 48, 0, 33, 40281, 1790,
]


In [None]:
# Wrap the node labels and link lists in a Sankey trace and render it.
node = {'label': label, 'pad': 50, 'thickness': 5}
link = {'source': source, 'target': target, 'value': value}
data = go.Sankey(node=node, link=link)
fig = go.Figure(data)
fig.show()

**This is the first version of this EDA. Modeling work is in progress; if you like my work, please upvote.**