In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
import datetime as dt

In [3]:
import cx_Oracle as orc
import os
from sqlalchemy import types, create_engine

### Connect to Oracle database AWE - UKSR1_ANONYMIZED to fetch required data

In [4]:
# Build the Oracle DSN and open the connection.
# Connection details are read from environment variables so real credentials
# never have to be stored in the notebook; the original placeholder values are
# kept as fallbacks when a variable is not set.
dsn = orc.makedsn(os.environ.get('ORACLE_HOST', 'hostname'),
                  os.environ.get('ORACLE_PORT', 'port'),
                  service_name=os.environ.get('ORACLE_SERVICE_NAME', 'servicename'))
# Connect to DSN
conn = orc.connect(user=os.environ.get('ORACLE_USER', 'username'),
                   password=os.environ.get('ORACLE_PASSWORD', 'password'),
                   dsn=dsn)

In [5]:
# Query returning one row per order from the anonymized order table.
# EMAIL is exposed as CUSTOMER_ID, ORDER_DATE is passed through TRUNC(), and
# only rows with ORDER_DATE on or after '01.09.16' (format DD.MM.YY) are read.
sql_query = """SELECT 
  EMAIL AS CUSTOMER_ID,
  ORDER_NUMBER,
  DELIVERY_WAY,
  TOT_ORDER_VALUE,
  TRUNC(ORDER_DATE) AS ORDER_DATE
FROM
  "PRD_ODI_AWE"."UKSR1_ANONYMIZED"
WHERE
  ORDER_DATE >= TO_DATE('01.09.16', 'DD.MM.YY')"""

In [6]:
original_df = pd.read_sql(sql_query, conn)

In [7]:
df1 = original_df.copy()
df1.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE
0,1DDBBB23CF7FBFC6E6C01417D760BFD5F70BC942,1028888157,PARCEL,37.35,2019-04-26
1,C20FAEFB67F04E93E1E62520073815351CCD6803,1027897152,PARCEL,90.41,2019-04-26
2,5A52B990712FBBB3AF1147EAAE119521D992692F,1029271275,PARCEL,83.88,2019-04-28
3,B9766B56E1D8502843A6C160787BF2F47BDCB95A,1029297717,PARCEL,83.86,2019-04-29
4,CADEA044A42B1499BE2D7FDF2B172D963CE06D0B,1029326497,TRUCK,365.94,2019-04-29


In [None]:
df1.info()

In [9]:
df1.isnull().sum()

CUSTOMER_ID         3114
ORDER_NUMBER           0
DELIVERY_WAY       50840
TOT_ORDER_VALUE        0
ORDER_DATE             0
dtype: int64

### Remove data with null customer IDs

In [10]:
df1 = df1.dropna(subset=['CUSTOMER_ID'])

### Remove repeat orders placed within 30 days of the customer's previous order

In [11]:
# For customers buying again "within the next 30 days", only the first purchase (with its sales value) is considered
df2 = df1.copy()

In [12]:
# Order each customer's purchases chronologically. ORDER_DATE only carries the
# day, so ORDER_NUMBER is used as a tie-breaker: without it, which of several
# same-day orders counts as the "first" one depends on the row order returned
# by the database.
# NOTE(review): assumes ORDER_NUMBER increases with order time - confirm.
df2 = df2.sort_values(['CUSTOMER_ID', 'ORDER_DATE', 'ORDER_NUMBER'])

In [13]:
# Carry the previous row's customer and order date onto every row. The two
# plain copies that used to precede these lines were overwritten immediately
# and have been removed.
df2['CUSTOMER_ID_COPY'] = df2['CUSTOMER_ID'].shift(1)
df2['ORDER_DATE_COPY'] = df2['ORDER_DATE'].shift(1)

In [14]:
# Days elapsed since the previous row's order, kept only where the previous
# row belongs to the same customer; every other row keeps the sentinel 5000.
is_same_customer = df2['CUSTOMER_ID'].eq(df2['CUSTOMER_ID_COPY'])
days_since_previous = df2['ORDER_DATE'].sub(df2['ORDER_DATE_COPY']).dt.days
df2['DATE_DIFF'] = 5000
df2.loc[is_same_customer, 'DATE_DIFF'] = days_since_previous

In [15]:
df2 = df2[df2['DATE_DIFF'] > 30]

In [16]:
df2 = df2.drop(['CUSTOMER_ID_COPY', 'ORDER_DATE_COPY', 'DATE_DIFF'], axis=1)

In [None]:
# Label each order with its fiscal year (1 September up to, but excluding,
# the next 1 September). Orders outside the listed ranges get a missing value
# instead of the integer 0, so the column no longer mixes numbers and strings.
fiscal_years = {
    'FY17': ('2016-09-01', '2017-09-01'),
    'FY18': ('2017-09-01', '2018-09-01'),
    'FY19': ('2018-09-01', '2019-09-01'),
    'FY20': ('2019-09-01', '2020-09-01'),
}
df2['FY'] = pd.Series(np.nan, index=df2.index, dtype=object)
for fy_label, (fy_start, fy_end) in fiscal_years.items():
    in_fiscal_year = (df2['ORDER_DATE'] >= fy_start) & (df2['ORDER_DATE'] < fy_end)
    df2.loc[in_fiscal_year, 'FY'] = fy_label

### Cohort analysis - creating cohorts based on order period

In [17]:
df3 = df2.copy()
df3.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE
3565529,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10
2757213,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24
3877881,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27
1147355,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17
4242710,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24


In [None]:
df3['DELIVERY_WAY'].value_counts()

In [None]:
df3.info()

In [20]:
# Create an order period column (YYYY-MM) based on the order date.
# The vectorised .dt accessor replaces a Python lambda called once per row.
df3['ORDER_PERIOD'] = df3['ORDER_DATE'].dt.strftime('%Y-%m')
df3.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,ORDER_PERIOD
3565529,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,2018-04
2757213,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,2018-07
3877881,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,2018-11
1147355,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,2019-02
4242710,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,2017-09


In [21]:
# Cohort group of a customer = order period (YYYY-MM) of their earliest order.
# transform('min') broadcasts that earliest date onto each of the customer's rows.
first_order_date = df3.groupby('CUSTOMER_ID')['ORDER_DATE'].transform('min')
df3['COHORT_GROUP'] = first_order_date.dt.strftime('%Y-%m')

# Replace the row labels with a fresh 0..n-1 range.
df3.reset_index(drop=True, inplace=True)
df3.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,ORDER_PERIOD,COHORT_GROUP
0,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,2018-04,2018-04
1,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,2018-07,2018-04
2,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,2018-11,2018-04
3,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,2019-02,2018-04
4,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,2017-09,2017-09


In [30]:
df4 = df3.copy()

In [31]:
# Start both delivery-way flag columns as copies of DELIVERY_WAY.
for flag_column in ('DELIVERY_WAY_TRUCK', 'DELIVERY_WAY_PARCEL'):
    df4[flag_column] = df4['DELIVERY_WAY'].copy()
df4.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,ORDER_PERIOD,COHORT_GROUP,DELIVERY_WAY_TRUCK,DELIVERY_WAY_PARCEL
0,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,2018-04,2018-04,TRUCK,TRUCK
1,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,2018-07,2018-04,PARCEL,PARCEL
2,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,2018-11,2018-04,PARCEL,PARCEL
3,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,2019-02,2018-04,PARCEL,PARCEL
4,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,2017-09,2017-09,PARCEL,PARCEL


In [32]:
# Blank out every value that does not belong to the column's delivery way:
# the truck column keeps 'TRUCK' and 'MIXED ORDER', the parcel column keeps
# only 'PARCEL'.
not_truck = ~df4['DELIVERY_WAY_TRUCK'].isin(['TRUCK', 'MIXED ORDER'])
not_parcel = ~df4['DELIVERY_WAY_PARCEL'].isin(['PARCEL'])
df4.loc[not_truck, 'DELIVERY_WAY_TRUCK'] = np.nan
df4.loc[not_parcel, 'DELIVERY_WAY_PARCEL'] = np.nan
df4.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,ORDER_PERIOD,COHORT_GROUP,DELIVERY_WAY_TRUCK,DELIVERY_WAY_PARCEL
0,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,2018-04,2018-04,TRUCK,
1,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,2018-07,2018-04,,PARCEL
2,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,2018-11,2018-04,,PARCEL
3,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,2019-02,2018-04,,PARCEL
4,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,2017-09,2017-09,,PARCEL


In [33]:
df4['AVG_ORDER_VALUE'] = df4['TOT_ORDER_VALUE'].copy()

In [None]:
grouped = df4.groupby(['COHORT_GROUP', 'ORDER_PERIOD'])

# Per cohort group and order period: unique customers, unique orders, number
# of truck / parcel orders (non-missing flags), total sales and average order
# value. Aggregations are given by name: passing NumPy / pd.Series callables
# to agg() is deprecated in recent pandas, and the named versions use the
# built-in grouped implementations. Note that 'mean' skips missing values,
# whereas np.average would have propagated them.
cohorts = grouped.agg({'CUSTOMER_ID': 'nunique',
                       'ORDER_NUMBER': 'nunique',
                       'DELIVERY_WAY_TRUCK': 'count',
                       'DELIVERY_WAY_PARCEL': 'count',
                       'TOT_ORDER_VALUE': 'sum',
                       'AVG_ORDER_VALUE': 'mean'})

# Rename the columns first
cohorts.rename(columns={'CUSTOMER_ID': 'TOTAL_CUSTOMERS',
                        'ORDER_NUMBER': 'TOTAL_ORDERS'}, inplace=True)
cohorts.head()

In [None]:
def cohort_period(df):
    """Add a 1-based COHORT_PERIOD column to one cohort group's rows.

    When the frame's index has 'COHORT_GROUP' and 'ORDER_PERIOD' levels
    holding 'YYYY-MM' labels, the period is the number of calendar months
    between the two, plus one. A cohort with no orders in some month
    therefore keeps the correct numbering for later months instead of having
    them shifted forward. Without those index levels the rows are numbered by
    position (1, 2, 3, ...), as before.

    The column is written onto ``df`` itself and ``df`` is returned.
    """
    level_names = list(df.index.names)
    if 'COHORT_GROUP' in level_names and 'ORDER_PERIOD' in level_names:
        cohort_month = pd.PeriodIndex(df.index.get_level_values('COHORT_GROUP'), freq='M')
        order_month = pd.PeriodIndex(df.index.get_level_values('ORDER_PERIOD'), freq='M')
        months_elapsed = ((order_month.year - cohort_month.year) * 12
                          + (order_month.month - cohort_month.month))
        # np.asarray: assign by position, not by index alignment.
        df['COHORT_PERIOD'] = np.asarray(months_elapsed) + 1
    else:
        df['COHORT_PERIOD'] = np.arange(len(df)) + 1
    return df

# Number the periods within each cohort group. group_keys=False keeps the
# original (COHORT_GROUP, ORDER_PERIOD) index: with the default, pandas >= 2
# prepends the group key as an extra index level, and a later reset_index()
# then fails because COHORT_GROUP would be inserted twice.
cohorts = cohorts.groupby(level=0, group_keys=False).apply(cohort_period)
cohorts.head()

## Customer retention - numbers per cohort group

In [None]:
# Re-key the cohort table by (COHORT_GROUP, COHORT_PERIOD).
cohorts = cohorts.reset_index().set_index(['COHORT_GROUP', 'COHORT_PERIOD'])

# Size of each cohort group = customer count in the group's first row.
customers_per_period = cohorts['TOTAL_CUSTOMERS']
cohort_group_size = customers_per_period.groupby(level='COHORT_GROUP').first()
cohort_group_size.head()

In [None]:
cust_retention_abs = cohorts['TOTAL_CUSTOMERS'].unstack(0)
cust_retention_abs.head()

In [None]:
# One column per cohort group, one row per cohort period, each column divided
# by that cohort's size.
customers_wide = cohorts['TOTAL_CUSTOMERS'].unstack('COHORT_GROUP')
cust_retention = customers_wide.div(cohort_group_size, axis='columns')
cust_retention.head()

In [None]:
cust_retention_abs[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

In [None]:
cust_retention[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

In [None]:
# Retention curves for four cohorts, drawn through the Axes returned by
# DataFrame.plot instead of the pyplot state machine.
ax = cust_retention[['2018-10', '2018-11', '2018-12', '2019-01']].plot(figsize=(16,8))
ax.set_title('Cohorts: Customer Retention')
ax.set_xticks(np.arange(1, 12.1, 1))
ax.set_xlim(1, 12)
ax.set_ylabel('% of Cohort Purchasing');

In [None]:
cust_retention[['2018-10', '2018-11', '2018-12', '2019-01']].plot(figsize=(16,8))
plt.title('Cohorts: Customer Retention')
plt.xticks(np.arange(1, 12.1, 1))
plt.xlim(1, 12)
plt.ylim(0, 0.05)
plt.ylabel('% of Cohort Purchasing');

In [None]:
cust_retention[['2016-10']].plot(figsize=(16,8))
plt.title('Cohorts: Customer Retention')
plt.xticks(np.arange(1, 40.1, 1))
plt.xlim(1, 40)
plt.ylabel('% of Cohort Purchasing');

In [None]:
sns.set(style='white')

# Cohorts as rows, cohort periods as columns; cells without data are masked.
retention_by_cohort = cust_retention[['2018-10', '2018-11', '2018-12', '2019-01']].T
plt.figure(figsize=(24, 6))
plt.title('Cohorts: Customer Retention')
sns.heatmap(retention_by_cohort, mask=retention_by_cohort.isnull(), annot=True, fmt='.0%');

In [None]:
cust_retention2 = cust_retention[['2016-10', '2016-11', '2016-12', '2017-01', '2017-10', '2017-11', '2017-12', '2018-01', '2018-10', '2018-11', '2018-12', '2019-01']]
cust_retention2.head()

In [None]:
cust_retention2.info()

In [None]:
# Turn COHORT_PERIOD from the index into a regular column. The guard keeps the
# cell re-runnable: a second reset_index() would add a stray 'index' column.
if 'COHORT_PERIOD' not in cust_retention2.columns:
    cust_retention2 = cust_retention2.reset_index()

# Keep the first eight cohort periods. The explicit copy makes this an
# independent frame, so adding rows to it later never writes into a slice of
# cust_retention2.
cust_retention3 = cust_retention2[cust_retention2['COHORT_PERIOD']<=8].copy()
cust_retention3.head(15)

In [None]:
# Append a 'SummedUp' row holding each column's total minus 1.
# An existing 'SummedUp' row is left out of the total, so re-running the cell
# no longer folds the previous result into the new sum.
# Note that the COHORT_PERIOD column is summed (and reduced by 1) as well.
period_rows = cust_retention3.drop(index='SummedUp', errors='ignore')
cust_retention3.loc['SummedUp'] = period_rows.sum() - 1
cust_retention3.head(10)

In [None]:
# Keep the first-period row (COHORT_PERIOD < 2) plus any row whose
# COHORT_PERIOD exceeds 9. Only periods <= 8 were kept earlier, so the second
# condition can only match the 'SummedUp' row, whose COHORT_PERIOD cell holds
# the summed period numbers.
# NOTE(review): relies on that summed value being greater than 9 - confirm.
cust_retention4 = cust_retention3[(cust_retention3['COHORT_PERIOD']<2) | (cust_retention3['COHORT_PERIOD']>9)]

In [None]:
cust_retention4 = cust_retention4[['COHORT_PERIOD', '2016-10', '2016-11', '2016-12', '2017-01', '2017-10', '2017-11', '2017-12', '2018-01', '2018-10', '2018-11', '2018-12', '2019-01']]
cust_retention4.head()

In [None]:
cust_retention4.T.head()

In [None]:
cust_retention4 = cust_retention4.drop('COHORT_PERIOD', axis=1)

In [202]:
cust_retention5 = cust_retention4.T
cust_retention5.head()

In [None]:
cust_retention5.rename(columns={0: 'TOTAL_CUSTOMERS',
                        'SummedUp': 'REPEAT_CUSTOMERS'}, inplace=True)

cust_retention5.head(15)

## Customer retention - sales per cohort group

In [None]:
# Make sure the cohort table is keyed by (COHORT_GROUP, COHORT_PERIOD).
cohorts = cohorts.reset_index().set_index(['COHORT_GROUP', 'COHORT_PERIOD'])

# Sales value found in the first row of every cohort group.
sales_per_period = cohorts['TOT_ORDER_VALUE']
cohort_group_size2 = sales_per_period.groupby(level='COHORT_GROUP').first()
cohort_group_size2.head()

In [None]:
sales_retention_abs = cohorts['TOT_ORDER_VALUE'].unstack(0)
sales_retention_abs.head()

In [None]:
# Sales per cohort period, one column per cohort group, divided by the
# cohort's first-row sales value.
sales_wide = cohorts['TOT_ORDER_VALUE'].unstack('COHORT_GROUP')
sales_retention = sales_wide.div(cohort_group_size2, axis='columns')
sales_retention.head()

In [None]:
sales_retention_abs[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

In [None]:
sales_retention[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

In [None]:
# Sales retention curves for four cohorts, configured through the Axes object.
ax = sales_retention[['2018-10', '2018-11', '2018-12', '2019-01']].plot(figsize=(16,8))
ax.set_title('Cohorts: Sales Retention')
ax.set_xticks(np.arange(1, 12.1, 1))
ax.set_xlim(1, 12)
ax.set_ylabel('% of Cohort Purchasing');

In [None]:
sales_retention[['2016-12', '2017-12', '2018-12']].plot(figsize=(16,8))
plt.title('Cohorts: Sales Retention')
plt.xticks(np.arange(1, 40.1, 1))
plt.xlim(2, 40)
plt.ylim(0, 0.1)
plt.ylabel('% of Cohort Purchasing');

In [None]:
sns.set(style='white')

plt.figure(figsize=(24, 6))
plt.title('Cohorts: Sales Retention')
sns.heatmap(sales_retention[['2018-10', '2018-11', '2018-12', '2019-01']].T, mask=sales_retention[['2018-10', '2018-11', '2018-12', '2019-01']].T.isnull(), annot=True, fmt='.0%');

In [None]:
df_sales = sales_retention_abs[['2016-10', '2016-11', '2016-12', '2017-01', '2017-10', '2017-11', '2017-12', '2018-01', '2018-10', '2018-11', '2018-12', '2019-01']]
df_sales.head()

In [None]:
# Turn COHORT_PERIOD from the index into a regular column. The guard keeps the
# cell re-runnable: a second reset_index() would add a stray 'index' column.
if 'COHORT_PERIOD' not in df_sales.columns:
    df_sales = df_sales.reset_index()

# First eight cohort periods, as an independent copy so rows can be added to
# it without writing into a slice of df_sales.
df_sales2 = df_sales[df_sales['COHORT_PERIOD']<=8].copy()
df_sales2.head(15)

In [None]:
# Append a 'SummedUp' row with the column totals over cohort periods 2-8.
# Rows are picked by their COHORT_PERIOD value instead of by position, so the
# total stays correct when a period is missing from the frame; the 'SummedUp'
# row itself is never part of the selection.
repeat_periods = df_sales2['COHORT_PERIOD'].between(2, 8)
df_sales2.loc['SummedUp'] = df_sales2[repeat_periods].sum()
df_sales2.head(10)

In [162]:
df_sales3 = df_sales2[(df_sales2['COHORT_PERIOD']<2) | (df_sales2['COHORT_PERIOD']>9)]

In [163]:
df_sales3 = df_sales3.drop('COHORT_PERIOD', axis=1)

In [164]:
df_sales4 = df_sales3.T

In [None]:
# Name the two columns after what they hold: this frame is built from order
# values, so the customer-count labels used for the retention table do not
# apply here.
df_sales4.rename(columns={0: 'FIRST_PERIOD_SALES',
                          'SummedUp': 'REPEAT_SALES'}, inplace=True)

df_sales4.head(15)

# Delivery way pattern - Truck vs. Parcel 

In [None]:
# Make sure the cohort table is keyed by (COHORT_GROUP, COHORT_PERIOD).
cohorts = cohorts.reset_index().set_index(['COHORT_GROUP', 'COHORT_PERIOD'])

# Truck order count found in the first row of every cohort group.
truck_orders_per_period = cohorts['DELIVERY_WAY_TRUCK']
cohort_group_size3 = truck_orders_per_period.groupby(level='COHORT_GROUP').first()
cohort_group_size3.head()

In [None]:
truck_retention_abs = cohorts['DELIVERY_WAY_TRUCK'].unstack(0)
truck_retention_abs.head()

In [None]:
# Truck orders per cohort period relative to the cohort's first-row count.
# A first-row count of 0 is treated as missing, so the affected cohort shows
# NaN instead of an infinite ratio.
truck_base = cohort_group_size3.replace(0, np.nan)
truck_retention = cohorts['DELIVERY_WAY_TRUCK'].unstack(0).divide(truck_base, axis=1)
truck_retention.head()

In [None]:
truck_retention_abs[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

In [None]:
truck_retention[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

In [None]:
# Make sure the cohort table is keyed by (COHORT_GROUP, COHORT_PERIOD).
cohorts = cohorts.reset_index().set_index(['COHORT_GROUP', 'COHORT_PERIOD'])

# Parcel order count found in the first row of every cohort group.
parcel_orders_per_period = cohorts['DELIVERY_WAY_PARCEL']
cohort_group_size4 = parcel_orders_per_period.groupby(level='COHORT_GROUP').first()
cohort_group_size4.head()

In [None]:
parcel_retention_abs = cohorts['DELIVERY_WAY_PARCEL'].unstack(0)
parcel_retention_abs.head()

In [None]:
# Parcel orders per cohort period relative to the cohort's first-row count.
# A first-row count of 0 is treated as missing, so the affected cohort shows
# NaN instead of an infinite ratio.
parcel_base = cohort_group_size4.replace(0, np.nan)
parcel_retention = cohorts['DELIVERY_WAY_PARCEL'].unstack(0).divide(parcel_base, axis=1)
parcel_retention.head()

In [None]:
parcel_retention_abs[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

In [None]:
parcel_retention[['2018-10', '2018-11', '2018-12', '2019-01']].head(15)

## Delivery way pattern - Truck vs. Parcel - II

In [90]:
df20 = df1.copy()
df20.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE
0,1DDBBB23CF7FBFC6E6C01417D760BFD5F70BC942,1028888157,PARCEL,37.35,2019-04-26
1,C20FAEFB67F04E93E1E62520073815351CCD6803,1027897152,PARCEL,90.41,2019-04-26
2,5A52B990712FBBB3AF1147EAAE119521D992692F,1029271275,PARCEL,83.88,2019-04-28
3,B9766B56E1D8502843A6C160787BF2F47BDCB95A,1029297717,PARCEL,83.86,2019-04-29
4,CADEA044A42B1499BE2D7FDF2B172D963CE06D0B,1029326497,TRUCK,365.94,2019-04-29


In [91]:
# Order each customer's purchases chronologically. ORDER_DATE only carries the
# day, so ORDER_NUMBER is used as a tie-breaker: without it, which of several
# same-day orders counts as the "first" one depends on the row order returned
# by the database.
# NOTE(review): assumes ORDER_NUMBER increases with order time - confirm.
df20 = df20.sort_values(['CUSTOMER_ID', 'ORDER_DATE', 'ORDER_NUMBER'])

In [92]:
# Carry the previous row's customer and order date onto every row. The two
# plain copies that used to precede these lines were overwritten immediately
# and have been removed.
df20['CUSTOMER_ID_COPY'] = df20['CUSTOMER_ID'].shift(1)
df20['ORDER_DATE_COPY'] = df20['ORDER_DATE'].shift(1)

In [93]:
# Days elapsed since the previous row's order, kept only where the previous
# row belongs to the same customer; every other row keeps the sentinel 5000.
prev_is_same_customer = df20['CUSTOMER_ID'].eq(df20['CUSTOMER_ID_COPY'])
gap_in_days = df20['ORDER_DATE'].sub(df20['ORDER_DATE_COPY']).dt.days
df20['DATE_DIFF'] = 5000
df20.loc[prev_is_same_customer, 'DATE_DIFF'] = gap_in_days

In [94]:
df20 = df20[df20['DATE_DIFF'] > 30]

In [95]:
df20.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,CUSTOMER_ID_COPY,ORDER_DATE_COPY,DATE_DIFF
3565529,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,,NaT,5000.0
2757213,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,2018-04-10,105.0
3877881,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,2018-07-24,126.0
1147355,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,2018-11-27,82.0
4242710,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,2019-02-17,5000.0


In [171]:
df21 = df20.drop(['CUSTOMER_ID_COPY', 'ORDER_DATE_COPY'], axis=1)

In [172]:
# Replace the 5000 sentinel with a missing value.
is_sentinel = df21['DATE_DIFF'].eq(5000)
df21['DATE_DIFF'] = df21['DATE_DIFF'].mask(is_sentinel)

In [173]:
df21['REPEAT_TIME'] = df21.groupby('CUSTOMER_ID').cumcount() + 1

In [None]:
df21['REPEAT_TIME'].value_counts()

In [175]:
df21['REPEAT_CUST'] = 'Y'

In [176]:
df21.loc[df21['REPEAT_TIME']==1, 'REPEAT_CUST'] = 'N'

In [177]:
df21['REPEAT_CUST_PROSPECTIVE']= df21.duplicated(subset=['CUSTOMER_ID'], keep=False)

In [178]:
df22 = df21.copy()
df22.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,DATE_DIFF,REPEAT_TIME,REPEAT_CUST,REPEAT_CUST_PROSPECTIVE
3565529,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,,1,N,True
2757213,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,105.0,2,Y,True
3877881,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,126.0,3,Y,True
1147355,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,82.0,4,Y,True
4242710,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,,1,N,False


In [None]:
df22['DELIVERY_WAY'].value_counts()

In [None]:
# Share of each DELIVERY_WAY value. The title now names what is plotted
# (it previously read 'Order Status').
df22['DELIVERY_WAY'].value_counts(normalize=True).plot.bar(title='Delivery way share (raw categories)')

In [None]:
df22.loc[df22['DELIVERY_WAY']=='MIXED ORDER', 'DELIVERY_WAY'] = 'TRUCK'
df22['DELIVERY_WAY'].value_counts(normalize=True).plot.bar(title='Delivery way share')

In [None]:
# Share of all orders that are a customer's first order, split by delivery way.
total_orders = len(df22)
is_first_order = df22['REPEAT_TIME'] == 1
first_parcel_count = int((is_first_order & (df22['DELIVERY_WAY'] == 'PARCEL')).sum())
first_truck_count = int((is_first_order & (df22['DELIVERY_WAY'] == 'TRUCK')).sum())
p1 = round((first_parcel_count / total_orders) * 100, 2)
t1 = round((first_truck_count / total_orders) * 100, 2)
print(f"Perct. of parcel orders placed by first time customers: {p1}%")
print(f"Perct. of truck orders placed by first time customers: {t1}%")

In [None]:
# Share of all orders that are repeat orders (REPEAT_TIME > 1), split by
# delivery way.
order_total = len(df22)
is_repeat_order = df22['REPEAT_TIME'] > 1
repeat_parcel_count = int((is_repeat_order & (df22['DELIVERY_WAY'] == 'PARCEL')).sum())
repeat_truck_count = int((is_repeat_order & (df22['DELIVERY_WAY'] == 'TRUCK')).sum())
p2 = round((repeat_parcel_count / order_total) * 100, 2)
t2 = round((repeat_truck_count / order_total) * 100, 2)
print(f"Perct. of parcel orders placed by repeat customers: {p2}%")
print(f"Perct. of truck orders placed by repeat customers: {t2}%")

In [None]:
print("Perct. of parcel orders by customers making repeat purchase in comparison to first time purchase: " + str(round((p2/p1)*100, 2)) + '%')
print("Perct. of truck orders by customers making repeat purchase in comparison to first time purchase: " + str(round((t2/t1)*100, 2)) + '%')

In [None]:
# All orders of customers whose FIRST order (REPEAT_TIME == 1) went by parcel.
# The earlier row-level filter kept every customer's repeat orders, including
# those of customers who started with a truck order, so the shares computed
# from this frame were not restricted to parcel-first customers.
first_order_way = df22.loc[df22['REPEAT_TIME'] == 1].set_index('CUSTOMER_ID')['DELIVERY_WAY']
customer_first_way = df22['CUSTOMER_ID'].map(first_order_way)
df_parcel_first = df22[customer_first_way == 'PARCEL']
df_parcel_first.head()

In [None]:
# All orders of customers whose FIRST order (REPEAT_TIME == 1) went by truck.
# The earlier row-level filter kept every customer's repeat orders, including
# those of customers who started with a parcel order, so the shares computed
# from this frame were not restricted to truck-first customers.
first_order_way = df22.loc[df22['REPEAT_TIME'] == 1].set_index('CUSTOMER_ID')['DELIVERY_WAY']
customer_first_way = df22['CUSTOMER_ID'].map(first_order_way)
df_truck_first = df22[customer_first_way == 'TRUCK']
df_truck_first.head()

In [None]:
print("Perct. of truck orders where first order is placed by parcel: " + str(round((len(df_parcel_first[df_parcel_first['DELIVERY_WAY']=='TRUCK']) / len(df_parcel_first))*100, 2)) + '%')
print("Perct. of parcel orders where first order is placed by truck: " + str(round((len(df_truck_first[df_truck_first['DELIVERY_WAY']=='PARCEL']) / len(df_truck_first))*100, 2)) + '%')

In [None]:
print("Perct. of truck orders in repeat purchase where first order is placed by parcel: " + str(round((len(df_parcel_first[df_parcel_first['DELIVERY_WAY']=='TRUCK']) / len(df_parcel_first[df_parcel_first['REPEAT_TIME']>1]))*100, 2)) + '%')
print("Perct. of parcel orders in repeat purchase where first order is placed by truck: " + str(round((len(df_truck_first[df_truck_first['DELIVERY_WAY']=='PARCEL']) / len(df_truck_first[df_truck_first['REPEAT_TIME']>1]))*100, 2)) + '%')

# Average Order Value (AOV)

In [179]:
df30 = df21.copy()
df30.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,DATE_DIFF,REPEAT_TIME,REPEAT_CUST,REPEAT_CUST_PROSPECTIVE
3565529,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,,1,N,True
2757213,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,105.0,2,Y,True
3877881,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,126.0,3,Y,True
1147355,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,82.0,4,Y,True
4242710,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,,1,N,False


In [None]:
# Average order value for first time purchase made by the customers
df31 = df30[df30['REPEAT_CUST']=='N']
aov1 = df31['TOT_ORDER_VALUE'].mean()
print("Average order value for first time purchase made by the customers: " + str(round(aov1, 2)) + '€')

In [None]:
# Average order value for repeat purchase made by the customers
df32 = df30[df30['REPEAT_CUST']=='Y']
aov2 = df32['TOT_ORDER_VALUE'].mean()
print("Average order value for repeat purchase made by the customers: " + str(round(aov2, 2)) + '€')

In [None]:
# Average order value over rows not flagged as REPEAT_CUST_PROSPECTIVE
# (the one-time customers).
is_one_time_customer = ~df30['REPEAT_CUST_PROSPECTIVE']
df33 = df30[is_one_time_customer]
aov3 = df33['TOT_ORDER_VALUE'].mean()
aov3_text = str(round(aov3, 2))
print("Average order value for one-time customers: " + aov3_text + '€')

In [None]:
# Average order value for repeat customers
df34 = df30[df30['REPEAT_CUST_PROSPECTIVE']==True]
aov4 = df34['TOT_ORDER_VALUE'].mean()
print("Average order value for repeat customers: " + str(round(aov4, 2)) + '€')

## Average time difference between repeat purchases

In [187]:
df40 = df21.copy()
df40.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,DATE_DIFF,REPEAT_TIME,REPEAT_CUST,REPEAT_CUST_PROSPECTIVE
3565529,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,948422457,TRUCK,216.0,2018-04-10,,1,N,True
2757213,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,105.0,2,Y,True
3877881,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,126.0,3,Y,True
1147355,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,82.0,4,Y,True
4242710,0000072CE05DB4ADC5F347E6BFCAF2D2D6812823,890878046,PARCEL,31.9,2017-09-24,,1,N,False


In [188]:
df41 = df40.dropna(subset=['DATE_DIFF'])
df41.head()

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,DELIVERY_WAY,TOT_ORDER_VALUE,ORDER_DATE,DATE_DIFF,REPEAT_TIME,REPEAT_CUST,REPEAT_CUST_PROSPECTIVE
2757213,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,984927381,PARCEL,53.88,2018-07-24,105.0,2,Y,True
3877881,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1000631129,PARCEL,8.89,2018-11-27,126.0,3,Y,True
1147355,000005FBDE326C41E4E704C8B358BF34E8EDFEA6,1014957404,PARCEL,72.86,2019-02-17,82.0,4,Y,True
4452703,0000079504A6804D92750DE675806B393B0E0B58,852444743,TRUCK,869.99,2017-05-18,89.0,2,Y,True
452948,000014CD9D292F6218080879889666A16E1EDA9F,1001214583,PARCEL,103.89,2018-11-22,442.0,2,Y,True


In [None]:
# Mean number of days in DATE_DIFF, reported as a whole number (truncated).
tdf = df41['DATE_DIFF'].mean()
whole_days = int(tdf)
print('Average time difference between repeat purchases: {} days'.format(whole_days))