# Order Trend Analysis

## Purpose
This analysis looks for insight on:
- **What is the trend of sales over time?**
- **Is there a growing trend in the e-commerce business?**
- **On which day of the week do users tend to place orders / purchase items?**
- **At what hour of the day do users tend to place orders / purchase items?**


## Background
In e-commerce, understanding the sales trend is one of the most important aspects of the business. Because selling is one of the main business processes an e-commerce platform offers, we should know how its trend behaves, whether it is growing or following some other pattern. The insight gained from understanding the trend and when people are most likely to purchase (peak season) can help increase the number of sales in the future.

## In General
Sebelum itu, saya ingin membahas terlebih dahulu trend peningkatan order secara general dari e-commerce yang sedang berkembang. Umumnya peningkatan order untuk ecommerce yang sedang berkembang tentunya akan naik dengan perlahan. Namun terkadang, terdapat masa - masa tertentu yang sering dikenal sebagai “Peak Season” dimana tingkat order menaik secara tiba - tiba untuk interval waktu tertentu dan kembali normal. Mengenal lebih jauh terkait pola - pola ini dapat mempersiapkan ecommerce untuk menanggapi demand yang naik turun.


## Sanity Check

In [None]:
def single_countplot(df, ax, x=None, y=None, top=None, order=True, hue=False, palette='plasma',
                     width=0.75, sub_width=0.3, sub_size=12):
    """
    Draw a seaborn count plot on an axis and annotate each bar with its count and share

    Input:
        df: DataFrame that holds the column to count
        ax: matplotlib axis that receives the plot
        x / y: name of the column to count; x gives vertical bars, y gives horizontal bars
        top: if given, only the `top` most frequent categories are plotted
        order: if True the bars are sorted by descending frequency
        hue: column used to split every bar (False or None disables the split)
        palette: colour palette forwarded to seaborn
        width, sub_width, sub_size: not used by the body; kept so existing calls keep working
    Returns:
        None, the plot is drawn on `ax`
    """
    import math  # function-scope import: keeps the helper independent of the notebook's import cell

    # Percentages are relative to the full frame, measured before any `top` filtering
    ncount = len(df)
    col = x if x else y

    # Keep only the rows that belong to the `top` most frequent categories
    if top is not None:
        top_categories = df[col].value_counts().iloc[:top].index
        df = df[df[col].isin(top_categories)]

    # Build the seaborn call once; `order` and `hue` are only passed when requested
    plot_kwargs = dict(x=x, y=y, data=df, palette=palette, ax=ax)
    if order:
        plot_kwargs['order'] = df[col].value_counts().index
    if hue is not None and hue is not False:
        plot_kwargs['hue'] = hue
    sns.countplot(**plot_kwargs)

    format_spines(ax, right_border=False)

    # Annotate every bar with "<count>" and "<share of all rows>%"
    vertical = bool(x)
    for patch in ax.patches:
        (left, bottom), (right, upper) = patch.get_bbox().get_points()
        if vertical:
            # A bar whose extent is NaN (e.g. an empty category/hue combination in
            # some seaborn versions) has no count to print and would break int()
            if math.isnan(upper):
                continue
            ax.annotate('{}\n{:.1f}%'.format(int(upper), 100. * upper / ncount),
                        ((left + right) / 2, upper), ha='center', va='bottom')
        else:
            if math.isnan(right):
                continue
            ax.annotate('{} ({:.1f}%)'.format(int(right), 100. * right / ncount),
                        (right, (bottom + upper) / 2), va='center')

def format_spines(ax, right_border=True):
    """
    Apply the notebook's light frame styling to a matplotlib axis

    Input:
        ax: axis whose spines and background are restyled
        right_border: True paints the right spine light grey, False paints it white
    Returns:
        None, the axis is modified in place
    """
    grey, white = '#CCCCCC', '#FFFFFF'

    # Bottom and left spines are always light grey
    for side in ('bottom', 'left'):
        ax.spines[side].set_color(grey)

    # The top spine is hidden; the right one is either grey or painted white
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_color(grey if right_border else white)

    # White plotting background
    ax.patch.set_facecolor(white)

In [None]:
import json

def load_config(file_path: str = "./config.json"):
    """
    Read a JSON configuration file and return its parsed content

    Input:
        file_path: path of the JSON file to read
    Returns:
        The object produced by json.load for that file
    """
    # Decode explicitly as UTF-8 so the result does not depend on the machine's locale
    with open(file_path, encoding="utf-8") as config_file:
        return json.load(config_file)

# Read the connection settings once; a key that is absent from the file yields None
config = load_config("../config.json")
DBNAME, HOSTNAME, USER, PASS, SCHEMA = map(
    config.get, ("DBNAME", "HOSTNAME", "USER", "PASS", "SCHEMA")
)

In [None]:
# Basic 
import sys
import numpy as np
import scipy as sp
import pandas as pd

# SQL Engine
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# Profiling process
from tqdm import tqdm

# Warning problems in notebook
import warnings
warnings.filterwarnings('ignore')

# Visualization
import bamboolib as bam
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from matplotlib.gridspec import GridSpec

# Reporting result
import sweetviz as sv
from dataprep.eda import create_report

In [None]:
def handle_missing_mart(df):
    """
    Fill the missing values of the mart DataFrame in place

    Input:
        df: DataFrame with the columns 'product_category' and 'total_payment_value'
    Returns:
        The same DataFrame, with missing categories set to "Other" and missing
        payment values set to a fixed fallback amount
    """
    # Assign the filled column back: a chained `df[col].fillna(..., inplace=True)`
    # works on an intermediate object and does not update `df` under pandas Copy-on-Write
    df['product_category'] = df['product_category'].fillna("Other")

    # NOTE(review): hard-coded fallback amount; the origin of these constants is not
    # visible in this notebook - confirm before reusing on other data
    payment_val = (44990.0 + 2830.0) * 3
    df.loc[df['total_payment_value'].isna(), 'total_payment_value'] = payment_val

    return df

In [None]:
# Load data

# Create an engine instance for the PostgreSQL database named in the config
# (psycopg2 driver); pool_recycle=3600 is passed through to SQLAlchemy.
# NOTE(review): USER and PASS are interpolated into the URL as-is - presumably they
# must not contain URL-reserved characters such as '@' or '/'; confirm.
alchemyEngine = create_engine(
    f'postgresql+psycopg2://{USER}:{PASS}@{HOSTNAME}/{DBNAME}', pool_recycle=3600)

# Connect to PostgreSQL server (the connection is not closed in this cell)
conn = alchemyEngine.connect()

# Keep the configured schema name under a lowercase alias
schema = SCHEMA

In [None]:
# Init needed data

# One row per order: fct_order_items is grouped by order_id and MIN(...) collapses
# the item-level rows of an order into a single value per column.
# Date and time attributes come from dim_date / dim_time, the user name from the
# current version of dim_user; all three are LEFT joins, so their columns may be NULL.
# NOTE(review): the schema name "staging" is hard-coded inside the query text.
QUERY = """
select 
	foi.order_id ,
	MIN(u.user_name) as user_name,
	MIN(foi.total_payment_value) as order_value,
	MIN(dd."date") as order_date,
	MIN(dd.day_of_year) as order_day_of_year,
	MIN(dd.day_of_month) as order_day_of_month,
	MIN(dd.day_of_quarter) as order_day_of_quarter,
	MIN(dd.day_name) as order_day_name,
	MIN(dd.day_of_week) as order_day_of_week,
	MIN(dd.month_actual) as order_month,
	MIN(dd.month_name) as order_month_name,
	MIN(dd.week_of_month) as order_week_of_month,
	MIN(dd."year") as order_year,
	case when MIN(dd."isWeekend"::int) = 1 then true else false end as order_is_weekend ,
	MIN(dt."hour") as order_time
from staging.fct_order_items foi
left outer join staging.dim_date dd on foi.order_date = dd.date_id 
left outer join staging.dim_time dt on foi.order_time = dt.time_id 
left outer join (
	select 
		du.user_key ,
		du.user_name
	from staging.dim_user du 
	where du.is_current_version=true
) u on foi.user_key = u.user_key
group by foi.order_id;
"""

# Init dataframe: run the query on the open connection and load the result
df = pd.read_sql_query(QUERY, conn)
# Notebook display of the loaded orders
df

## Transaction Value by Year

In [None]:
# Per-year totals: summed order value and number of orders
data = (
    df.groupby(['order_year'])
    .agg(
        value=('order_value', 'sum'),
        num_of_transaction=('order_id', 'count'),
    )
    .reset_index()
)

In [None]:
# Notebook display of the per-year aggregate
data

In [None]:
# Bar chart of the summed order value for each year
fig = px.bar(
    data,
    x='order_year',
    y='value',
    template='simple_white',
    title='Order Value per Year',
)
# Years are treated as discrete labels, hence the categorical x axis
fig.update_xaxes(type='category', title_text='Year')
# To sort the bars by value instead, uncomment below
# fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title_text='Total Purchase Value')
fig

In [None]:
# Distribution of single-order values, one box per year
fig = px.box(
    df,
    x="order_year",
    y="order_value",
    template='ggplot2',
    title='Box Plot of transactions over the year',
)
fig.update_xaxes(title_text='Year').update_yaxes(title_text='Total Value')
fig.show()

In [None]:
# Bar chart of the number of orders for each year
fig = px.bar(
    data,
    x='order_year',
    y='num_of_transaction',
    template='ggplot2',
    title='Order Count per Year',
)
# Years are treated as discrete labels, hence the categorical x axis
fig.update_xaxes(type='category', title_text='Year')
# To sort the bars by count instead, uncomment below
# fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title_text='Total Purchase Count')
fig

In [None]:
# Summed order value and number of orders per (year, day of year)
order_grouped_year = (
    df.groupby(['order_year', 'order_day_of_year'])
    .agg(
        value=('order_value', 'sum'),
        num_of_transaction=('order_id', 'count'),
    )
    .reset_index()
)

---

The data provided only covers Sept 2016 - Sept 2018, which is why the number of orders in 2016 is so low. 2017 is the only year for which we have full-year data. Because the other years are incomplete, we are unable to draw any significant conclusion from this yearly comparison.


---

In [None]:
# Year-month key (e.g. '201711') of every order, used to count orders per month
df['order_year_month'] = [order_day.strftime('%Y%m') for order_day in df['order_date']]
df

In [None]:
# Per-date number of orders (freq_order) and summed order value (total_value)
trend_date = (
    df.groupby('order_date')
    .agg(
        freq_order=('order_id', 'count'),
        total_value=('order_value', 'sum'),
    )
    .reset_index()
)
trend_date

In [None]:
# Daily number of orders, with a range slider and quick-zoom buttons on the date axis
fig = px.line(trend_date, x='order_date', y='freq_order', title="Order Frequency over the date", template='ggplot2')
fig.update_xaxes(
    title_text='Date',
    rangeslider_visible=True,
    rangeselector={
        'buttons': [
            {'count': 1, 'label': "1m", 'step': "month", 'stepmode': "backward"},
            {'count': 6, 'label': "6m", 'step': "month", 'stepmode': "backward"},
            {'count': 1, 'label': "YTD", 'step': "year", 'stepmode': "todate"},
            {'count': 1, 'label': "1y", 'step': "year", 'stepmode': "backward"},
            {'step': "all"},
        ]
    },
)
fig.update_yaxes(title_text='Total Purchase Count')
fig.show()

---

Some insights we can get:
- There is a sales spike on 24 November 2017. According to public sources, this date was "Black Friday", which might be the cause of the huge spike.
- Sales are weak after Dec 20, possibly because of the end-of-year holidays (?)
- The data seems incomplete for 2016 and for 2018; only 2017 has full-year data that we can analyse.

---

In [None]:
# Daily summed order value, with a range slider and quick-zoom buttons on the date axis
fig = px.line(trend_date, x='order_date', y='total_value', title="Order Total Value over the date", template='ggplot2')
fig.update_xaxes(
    title_text='Date',
    rangeslider_visible=True,
    rangeselector={
        'buttons': [
            {'count': 1, 'label': "1m", 'step': "month", 'stepmode': "backward"},
            {'count': 6, 'label': "6m", 'step': "month", 'stepmode': "backward"},
            {'count': 1, 'label': "YTD", 'step': "year", 'stepmode': "todate"},
            {'count': 1, 'label': "1y", 'step': "year", 'stepmode': "backward"},
            {'step': "all"},
        ]
    },
)
fig.update_yaxes(title_text='Total Purchase Value')
fig.show()

In [None]:
fig = plt.figure(constrained_layout=True, figsize=(13, 10))

# Axis definition: one wide panel on the top row, two panels on the bottom row
gs = GridSpec(2, 2, figure=fig)
ax1 = fig.add_subplot(gs[0, :])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1])

# Line chart - number of orders per year-month
sns.lineplot(
    data=df['order_year_month'].value_counts().sort_index(),
    ax=ax1,
    color='darkslateblue',
    linewidth=2,
)
# Call-outs at hand-picked data coordinates of the line chart
ax1.annotate(
    'Highest orders \nreceived', (13, 7500), xytext=(-75, -25),
    textcoords='offset points', bbox=dict(boxstyle="round4", fc="w", pad=.8),
    arrowprops=dict(arrowstyle='-|>', fc='w'), color='dimgrey', ha='center',
)
ax1.annotate(
    'Noise on data \n(huge decrease)', (23, 0), xytext=(48, 25),
    textcoords='offset points', bbox=dict(boxstyle="round4", fc="w", pad=.5),
    arrowprops=dict(arrowstyle='-|>', fc='w'), color='dimgrey', ha='center',
)
format_spines(ax1, right_border=False)
plt.setp(ax1.get_xticklabels(), rotation=45)
ax1.set_title('Evolution of Total Orders in Brazilian E-Commerce', size=14, color='dimgrey')
ax1.set(xlabel="Date", ylabel="Order Count")

# Bar chart - total of orders by day of week
single_countplot(df, x='order_day_of_week', ax=ax2, order=False, palette='YlGnBu')
weekday_label = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax2.set_xticklabels(weekday_label)
ax2.set_title('Total Orders by Day of Week', size=14, color='dimgrey', pad=20)
ax2.set(xlabel="Day of Week", ylabel="Total Order")

# Bar chart - total of orders by month
# (original note: replace this with a time-of-day version - afternoon, evening, etc.)
day_color_list = ['darkslateblue', 'deepskyblue', 'darkorange', 'purple']
single_countplot(df, x='order_month', ax=ax3, order=False, palette=day_color_list)
ax3.set_title('Total Orders by Month', size=14, color='dimgrey', pad=20)

plt.tight_layout()
plt.show()

By the chart above we can conclude:

E-commerce in Brazil really has a growing trend over time. We can see some seasonality with peaks in specific months, but in general it is clear that customers are more prone to buy things online than before.
Monday is the preferred day for Brazilian customers, and they tend to buy more in the afternoon.

Notes: There is a sharp decrease between August 2018 and Sept 2018, maybe because that is where the data is cut off.
Meanwhile the 2016 data is not comparable, maybe because the e-commerce had only just started?

If we want to compare the growth, it might be between 2017 & 2018 data (January - August)

---

Compare 2017 & 2018 

---

In [None]:
# Creating figure
fig = plt.figure(constrained_layout=True, figsize=(13, 5))

# Axis definition: narrow text panel on the left, bar chart on the right
gs = GridSpec(1, 3, figure=fig)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1:])

# Annotation - growth of e-commerce orders between 2017 and 2018 (January to August only)
df_compare = df.query('order_year in (2017, 2018) & order_month <= 8')
year_orders = df_compare['order_year'].value_counts()
# Year-over-year growth in percent: (orders_2018 / orders_2017 - 1) * 100
growth = int(round(100 * (year_orders[2018] / year_orders[2017] - 1), 0))
ax1.text(0.00, 0.73, f'{year_orders[2017]}', fontsize=40, color='mediumseagreen', ha='center')
ax1.text(0.00, 0.64, 'orders registered in 2017\nbetween January and August', fontsize=10, ha='center')
ax1.text(0.00, 0.40, f'{year_orders[2018]}', fontsize=60, color='darkslateblue', ha='center')
ax1.text(0.00, 0.31, 'orders registered in 2018\nbetween January and August', fontsize=10, ha='center')
# Print exactly one sign: the magnitude is formatted separately from the sign
signal = '+' if growth >= 0 else '-'
ax1.text(0.00, 0.20, f'{signal}{abs(growth)}%', fontsize=14, ha='center', color='white', style='italic', weight='bold',
         bbox=dict(facecolor='navy', alpha=0.5, pad=10, boxstyle='round, pad=.7'))
ax1.axis('off')

# Bar chart - Comparison between monthly sales between 2017 and 2018
single_countplot(df_compare, x='order_month', hue='order_year', ax=ax2, order=False,
                 palette='YlGnBu')
month_label = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']
ax2.set_xticklabels(month_label)
ax2.set_title('Total Orders Comparison Between 2017 and 2018 (January to August)', size=12, pad=20)
ax2.set_xlabel('Month')
ax2.set_ylabel('Order count')

# Address the bar-chart axis explicitly instead of relying on the "current" axis
ax2.legend(loc='lower right')
plt.show()

---

Based on data above, we could see that the ecommerce is growing!!

---

In [None]:
# Per-hour number of orders (freq_order) and summed order value (total_value)
trend_hour = (
    df.groupby('order_time')
    .agg(
        freq_order=('order_id', 'count'),
        total_value=('order_value', 'sum'),
    )
    .reset_index()
)
trend_hour

In [None]:
# Two stacked bar charts: order count per hour (top) and order value per hour (bottom)
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=("Order Frequency over the hour", "Order Total Value over the hour"))

# Figure 1 : number of orders per hour
figure1 = px.bar(trend_hour, x='order_time', y='freq_order', template='ggplot2', title='Order Frequency over the hour')

# Figure 2 : summed order value per hour
figure2 = px.bar(trend_hour, x='order_time', y='total_value', template='ggplot2', title='Order Total Value over the hour')

# Break each Express figure down into its traces so they can be placed in the subplot grid
figure1_traces = list(figure1.data)
figure2_traces = list(figure2.data)

# add_trace with row/col replaces the deprecated append_trace
for bar_trace in figure1_traces:
    fig.add_trace(bar_trace, row=1, col=1)
for bar_trace in figure2_traces:
    fig.add_trace(bar_trace, row=2, col=1)

# Hours are shown as discrete categories on both x axes
fig.update_xaxes(title_text='Hour', type='category', row=1, col=1)
fig.update_xaxes(title_text='Hour', type='category', row=2, col=1)
fig.update_yaxes(title_text='Total Purchase Count', row=1, col=1)
fig.update_yaxes(title_text='Total Purchase Value', row=2, col=1)

fig.update_layout(title_text="Order Hour Trend", height=800)

fig.show()


---

We can see that most people purchase on the e-commerce between 11:00 and 21:00.

---

In [None]:
# Three-letter weekday label derived from the day name; vectorised string
# operations replace the row-by-row apply
df['weekday'] = df['order_day_name'].str.replace('day', '', regex=False).str[:3]
df['weekday'] = pd.Categorical(
    df['weekday'],
    categories=['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'],
    ordered=True,
)

# Number of orders per (weekday, hour). observed=False keeps every weekday category,
# so weekday/hour combinations without orders count as 0 instead of going missing
day_hour = (
    df.groupby(['weekday', 'order_time'], observed=False)
    .agg(freq=('order_id', 'count'))
    .reset_index()
)

# pivot takes keyword arguments only on pandas >= 2.0
day_hour = day_hour.pivot(index='weekday', columns='order_time', values='freq')
plt.figure(figsize=(15, 8))
ax = sns.heatmap(day_hour, annot=True, fmt="d", cmap="OrRd")
ax.set_xlabel("Hour")
ax.set_ylabel("Day")
ax.set_title("Heatmap Transaction over the hour by day", size=20)

## Insight

## Recommendation