<img src = "https://cdn.dnaindia.com/sites/default/files/styles/full/public/2018/03/30/666686-online-shopping-tt.jpg" width="700" height="700">

<div class="alert alert-success" role="alert">
  <h4 class="alert-heading">Hi!</h4>
  <p>This notebook is dedicated to performing Exploratory Data Analysis (EDA) on online shopping data that I found on Google Datasets.</p>
  <hr>
  <p class="mb-0">The objective here is to show an EDA that is nice, tidy and informative.</p>
</div>

In [None]:
!pip install dexplot -q
!pip install altair -q
!pip install datasist -q

<div class="alert alert-success">  
<h3><strong>Imports</strong></h3>
</div>

In [None]:
# Notebook-wide imports and one-off setup calls.
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn/plotly.
warnings.filterwarnings('ignore')

# for some basic operations
import numpy as np 
import pandas as pd 

# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import plotting
from pandas_profiling import ProfileReport
from pandas.plotting import parallel_coordinates
import datasist as ds

# for interactive visualizations

import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
# Initialise plotly's offline notebook mode so figures render inline.
init_notebook_mode(connected = True)
import plotly.figure_factory as ff
import dexplot as dxp

# for animated visualizations

import plotly_express as px
import altair as alt
# Lift Altair's default cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

# for providing path


import os
# Show what is available in the input directory (relative to the working dir).
print(os.listdir("../input"))

# for modelling

import sklearn
import imblearn

# for model explanation
import shap 
import eli5

In [None]:
# Read the raw e-commerce orders file into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ecommerce-data/ecommerce.csv',
)

In [None]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Normalise column headers to snake_case: trim whitespace, lower-case,
# turn spaces into underscores and drop parentheses.
# regex=False makes every replacement a literal one on all pandas versions;
# the default of `regex` changed in pandas 2.0 and a lone '(' or ')' is not a
# valid regular expression.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [None]:
# Print column names, dtypes, non-null counts and memory usage of the frame.
df.info()

<div class="alert alert-success">  
<h3><strong>Basic Data Crunches found</strong></h3>
</div>

In [None]:
# Summarise the dataset with datasist's `describe` helper.
# NOTE(review): the exact report contents depend on the installed datasist version.
ds.structdata.describe(df)

In [None]:
# Fill gaps with datasist's feature-engineering helpers: the categorical
# filler runs first and its result is handed to the numeric filler.
# NOTE(review): sales / profit / shipping_cost are still text at this point,
# so they are presumably not treated as numeric here - confirm.
feature_eng = ds.feature_engineering
df = feature_eng.fill_missing_num(feature_eng.fill_missing_cats(df))

<div class="alert alert-success">  
<h3><strong>Missing Values Info</strong></h3>
</div>

In [None]:
# Report missing values via datasist.
# NOTE(review): an earlier cell already filled missing values, so this
# presumably reports none - confirm that this ordering is intended.
ds.structdata.display_missing(df)

In [None]:
# The money columns are text (the `.str` accessor is used on them); strip the
# formatting characters and convert them to floats.
#
# * `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a
#   literal string by default, which would strip nothing and make
#   `astype(float)` fail on formatted values.
# * The pattern is a raw string so `\d` / `\.` are not invalid string escapes.
# NOTE(review): the pattern also removes a leading '-', so negative values
# would become positive - confirm whether the source data contains losses.
df['sales'] = df['sales'].str.replace(r'[^\d\.]', '', regex=True).astype(float)
df['profit'] = df['profit'].str.replace(r'[^\d\.]', '', regex=True).astype(float)

# Drop the row at position 535 before converting shipping_cost.
# NOTE(review): positional magic number; re-running this cell removes a
# different row each time - presumably this row has an unparsable value.
df.drop(df.index[535], inplace=True)

# '$' is a regex metacharacter, so it is removed as a literal.
df['shipping_cost'] = df['shipping_cost'].str.replace('$', '', regex=False).astype(float)

In [None]:
# Bar chart of the number of orders per shipping mode, with the count printed
# above each bar.
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# value_counts() already sorts in descending order. Build a frame with explicit
# column names: the name to_frame() gives the counts column differs between
# pandas versions ('ship_mode' before 2.0, 'count' from 2.0 on).
ship_mode_counts = (
    df['ship_mode']
    .value_counts()
    .rename_axis('ship_mode')
    .reset_index(name='count')
)

# Draw on the axes created above instead of relying on the "current axes".
sns.barplot(data=ship_mode_counts, x='ship_mode', y='count', palette="winter", ax=ax)
ax.set_title('Ship Mode Distribution')
ax.set_xlabel('Mode of Shipping')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=90)

# Annotate every bar with its height, 9 points above the bar top.
for bar in ax.patches:
    ax.annotate(format(bar.get_height(), '1.0f'),
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='center',
                xytext=(0, 9),
                textcoords='offset points')

# Remove the frame around the plot.
for side in ['top', 'left', 'right', 'bottom']:
    ax.spines[side].set_visible(False)

<div class="alert alert-info">
  <p>Wow!!!! Standard-class shipment counts are high compared to the others. This is expected behaviour: most shoppers opt for standard delivery instead of paid delivery.</p>
</div>

In [None]:
# Count plot: number of rows per product category.
category_count_opts = dict(
    data=df,
    figsize=(10, 5),
    cmap='viridis',
    title='Count for Product Category',
)
dxp.count('product_category', **category_count_opts)

<div class="alert alert-info">
<p>Hmm, the Fashion category tops the list as the most shopped one!!! Interesting😉  </p>
</div>

In [None]:
# Median sales per shipping mode, with one bar per product category.
dxp.bar(
    x='ship_mode',
    y='sales',
    data=df,
    figsize=(10, 5),
    aggfunc='median',
    split='product_category',
    title='Sales for Product Category',
)

<div class="alert alert-info">
<p>It's obvious that Standard class has the most sales compared to the others ✔🤔 </p>
</div>

In [None]:
# Mean sales per product category, with one bar per order priority.
dxp.bar(
    x='product_category',
    y='sales',
    data=df,
    figsize=(10, 5),
    aggfunc='mean',
    split='order_priority',
    title='Sales for Product Category',
)

<div class="alert alert-info">
  <p>Fashion has most sales in online shopping🐱‍🚀  </p>
</div>

In [None]:
# Bar chart of the 20 most frequently sold products, with the count printed
# above each bar.
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# value_counts() already sorts in descending order, so head(20) is the top 20.
# Explicit column names keep this independent of how the pandas version names
# the counts column produced by value_counts().
product_counts = (
    df['product']
    .value_counts()
    .head(20)
    .rename_axis('product')
    .reset_index(name='count')
)

# Draw on the axes created above instead of relying on the "current axes".
sns.barplot(data=product_counts, x='product', y='count', palette="winter_r", ax=ax)
ax.set_title('Product Sold in Online Sale')
# The x axis shows products (the previous label was copied from the
# shipping-mode chart).
ax.set_xlabel('Product')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=90)

# Annotate every bar with its height, 9 points above the bar top.
for bar in ax.patches:
    ax.annotate(format(bar.get_height(), '1.0f'),
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='center',
                xytext=(0, 9),
                textcoords='offset points')

# Remove the frame around the plot.
for side in ['top', 'left', 'right', 'bottom']:
    ax.spines[side].set_visible(False)

<div class="alert alert-info">
  <p>This seems informative: sportswear grabs a whopping share of sales 🏃‍♂️🏃‍♂️  </p>
</div>

In [None]:
# Total sales per country, largest first; chart the first 20 rows.
grpcount = df.groupby('country')['sales'].sum().reset_index()
grpcount_ = grpcount.sort_values(by='sales', ascending=False)

top_countries = grpcount_.iloc[:20]
fig = px.bar(
    top_countries,
    x='country',
    y='sales',
    title='Top 20 Countries based Sales',
)
fig.show()

<div class="alert alert-info">
  <p>The United States has the most online shoppers, followed by Australia and France 🧐</p>
</div>

In [None]:
# Animated world map of sales, one frame per value of the `months` column.
# `locationmode='country names'` is required because the `country` column holds
# country names; the default mode expects ISO-3 codes, so nothing would match.
# NOTE(review): range_color=[20, 80] clips the colour scale - confirm it suits
# the actual range of `sales`.
fig = px.choropleth(
    df,
    locations="country",
    locationmode="country names",
    color="sales",
    hover_name="country",
    animation_frame="months",
    range_color=[20, 80],
    title='Animation Plot for World Sales',
)
fig.show()

In [None]:
# Total sales per state, largest first; chart the first 20 rows with the
# value printed above each bar.
grpcount = pd.DataFrame(df.groupby('state')['sales'].sum()).reset_index()
grpcount_ = grpcount.sort_values(by='sales', ascending=False)

# `text='sales'` supplies the value that the '%{text:.2s}' template below
# formats; without it the template has nothing to render.
fig = px.bar(grpcount_.iloc[:20],
             x='state', y='sales', text='sales',
             title='Top 20 states based Sales')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

<div class="alert alert-info">
  <p>California got most of online shoppers followed by England and NY 🧐</p>
</div>

In [None]:
# Total sales per customer, largest first; chart the first 20 rows.
grpcount = df.groupby('customer_name')['sales'].sum().reset_index()
grpcount_ = grpcount.sort_values(by='sales', ascending=False)

top_customers = grpcount_.iloc[:20]
fig = px.bar(
    top_customers,
    x='customer_name',
    y='sales',
    title='Top 20 Customers',
)
fig.show()

In [None]:
# Pie chart of profit split by region (title typo 'Reigon' corrected).
fig = px.pie(
    df,
    values='profit',
    names='region',
    title='Region wise Profit Earned',
)
fig.show()

<div class="alert alert-info">
  <p>It's a good insight that the Central region has a higher profit in online product sales🐱‍🏍 </p>
</div>

In [None]:
# Sunburst of profit: countries on the inner ring, their states on the outer
# ring, coloured by product category.
sunburst_opts = dict(
    path=['country', 'state'],
    values='profit',
    color='product_category',
    hover_data=['country'],
    title='Sunburst',
)
fig = px.sunburst(df, **sunburst_opts)
fig.show()

<div class="alert alert-info">
    
  <p>Sunburst of profit earned in countries and their respective states 🤑</p>
</div>

In [None]:
# Number of orders per shipping mode.
# groupby().count() returns one count per *column*, so the wide-form bar chart
# stacked a segment for every column of the frame. size() yields a single row
# count per group, which is the quantity of interest here.
ship_mode_orders = df.groupby('ship_mode').size().reset_index(name='orders')

fig = px.bar(
    ship_mode_orders,
    x='ship_mode',
    y='orders',
    title='Number of Orders per Ship Mode',
)
fig.show()

In [None]:
# Mean sales per region, with one bar per customer segment.
dxp.bar(
    x='region',
    y='sales',
    data=df,
    aggfunc='mean',
    split='segment',
    figsize=(15, 5),
)

In [None]:
# Remove the row at position 321 before the conversions below.
# NOTE(review): positional magic number; re-running this cell drops a
# different row each time - presumably this row holds an unparsable value.
df.drop(df.index[321], inplace=True)

# Parse both date columns, then make quantity numeric.
for date_col in ('order_date', 'ship_date'):
    df[date_col] = pd.to_datetime(df[date_col])
df['quantity'] = pd.to_numeric(df['quantity'])

# Whole days elapsed between placing the order and shipping it.
df['lapsdays'] = df['ship_date'].sub(df['order_date']).dt.days

In [None]:
# Mean order-to-ship delay per shipping mode, with one bar per order priority.
dxp.bar(
    x='ship_mode',
    y='lapsdays',
    data=df,
    aggfunc='mean',
    split='order_priority',
    figsize=(15, 5),
)

In [None]:
# Pie chart of the total order-to-ship days accumulated by each segment.
# The wedge sizes come from the summed `lapsdays`. Previously value_counts()
# was applied to a frame with exactly one row per segment, so every wedge had
# size 1 and the pie was always split evenly regardless of the data.
lapsdays_by_segment = (
    df.groupby('segment')['lapsdays']
    .sum()
    .sort_values(ascending=False)
)

# Labels are taken from the data so they always match their wedge.
labels = lapsdays_by_segment.index
size = lapsdays_by_segment.values
colors = ['green', 'lightskyblue', 'lightblue']
# Keep the largest wedge in place and pull every other wedge out slightly.
explode = [0 if i == 0 else 0.1 for i in range(len(size))]

plt.rcParams['figure.figsize'] = (8, 8)
plt.pie(size, colors=colors, explode=explode, labels=labels, shadow=True, autopct='%.2f%%')
plt.title('Segment Pie to Laps Days', fontsize=20)
plt.axis('off')
plt.legend()
plt.show()

<div class="alert alert-info">
  <p>Shipments got delayed for almost all segments of customers 😪</p>
</div>

In [None]:
# Kernel density of sales for three order priorities on one figure.
# Each tuple is (order priority, whether the area under the curve is shaded).
plt.figure(figsize=(12, 8))
priority_styles = [('Medium', False), ('Critical', True), ('High', False)]
for priority, shaded in priority_styles:
    sns.kdeplot(df.loc[df.order_priority == priority, 'sales'], shade=shaded)
plt.legend(['Medium', 'Critical', 'High'])
plt.title('Sales Distribution ')
plt.show()

In [None]:
# Kernel density of profit for each of five regions on one figure.
# Each tuple is (region, whether the area under the curve is shaded).
plt.figure(figsize=(12, 8))
region_styles = [
    ('Central', True),
    ('North', False),
    ('South', True),
    ('East', False),
    ('West', True),
]
for region, shaded in region_styles:
    sns.kdeplot(df.loc[df.region == region, 'profit'], shade=shaded)

plt.legend(['Central', 'North', 'South', 'East', 'West'])
plt.title('Profit Distribution ')
plt.show()

In [None]:
# Line plot of mean profit per customer segment.
dxp.line(
    x='segment',
    y='profit',
    data=df,
    aggfunc='mean',
    orientation='v',
    figsize=(15, 5),
    title='Line Plot for Profit Earned Segment wise',
)

In [None]:
# Line plot of mean quantity per shipping mode, one line per segment.
dxp.line(
    x='ship_mode',
    y='quantity',
    data=df,
    aggfunc='mean',
    split='segment',
    figsize=(15, 5),
    title='Line Plot for Quantity vs Ship mode and Segment',
)

In [None]:
# Side-by-side boxen plots of profit (left) and sales (right) per shipping
# mode, split by customer segment.
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while keywords work on older versions too.
f, ax = plt.subplots(1, 2, figsize=(20, 10))

sns.boxenplot(x="ship_mode", y="profit", hue="segment", data=df, ax=ax[0])
ax[0].set_title('Segment and Profit vs Ship mode')

sns.boxenplot(x="ship_mode", y="sales", hue="segment", data=df, ax=ax[1])
# Title typo 'Ship model' corrected to match the left-hand panel.
ax[1].set_title('Segment and Sales vs Ship mode')

plt.show()


In [None]:
# Scatter of sales per month: colour encodes product category, marker size
# encodes quantity. relplot returns a grid object, not a single Axes.
relplot_opts = dict(
    x="months",
    y="sales",
    hue="product_category",
    size="quantity",
    data=df,
    palette='winter',
    sizes=(100, 300),
    alpha=0.5,
    aspect=2,
)
axes = sns.relplot(**relplot_opts)
axes.set(xlabel='Months', title="Months wise Sales")

# Trailing semicolon suppresses the returned object's repr in the cell output.
axes.set_xticklabels();

In [None]:
# Profit against sales, coloured by product category, with one column of
# plots for each of the Consumer and Corporate segments.
dxp.scatter(
    x='sales',
    y='profit',
    data=df,
    split='product_category',
    col='segment',
    col_order=['Consumer', 'Corporate'],
    title='Scatter for Profit and Sales',
)

In [None]:
# Bar chart of the 10 cities with the most orders, with the count printed
# above each bar.
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# value_counts() already sorts in descending order, so head(10) is the top 10.
# Explicit column names keep this independent of how the pandas version names
# the counts column produced by value_counts().
city_counts = (
    df['city']
    .value_counts()
    .head(10)
    .rename_axis('city')
    .reset_index(name='count')
)

# Draw on the axes created above instead of relying on the "current axes".
sns.barplot(data=city_counts, x='city', y='count', ax=ax)
ax.set_title('Top Cities with Most Online Customers')
ax.set_xlabel('City')
ax.set_ylabel('Counts')
ax.tick_params(axis='x', labelrotation=90)

# Annotate every bar with its height, 9 points above the bar top.
for bar in ax.patches:
    ax.annotate(format(bar.get_height(), '1.0f'),
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='center',
                xytext=(0, 9),
                textcoords='offset points')

# Remove the frame around the plot.
for side in ['top', 'left', 'right', 'bottom']:
    ax.spines[side].set_visible(False)

<div class="alert alert-warning">
  <h3>It's fun deriving insights from this data. Great learning so far! Will continue to update this space. Thanks!!!! 🤗😀</h3>
</div>