## Lei Customer Analysis 

In [1]:
# General Dependencies

import numpy as np
import pandas as pd
import matplotlib as mpl
import pylab as pl

# Visualizations

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime as dt

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

In [2]:
# Load the prepared order-level dataset and report its (rows, columns) shape.
df_data = pd.read_csv('data/data_ready.csv')
print(f'Dataframe dimensions: {df_data.shape}')

Dataframe dimensions: (1781, 18)


In [3]:
# Discard rows in which every column is missing, then report the resulting shape.
df_data = df_data.dropna(axis=0, how='all')
print(f'Dataframe dimensions: {df_data.shape}')

Dataframe dimensions: (1752, 18)


In [4]:
# Preview the first ten rows to sanity-check the columns and values.
df_data.head(10)

Unnamed: 0,Stock ID,Cost of Goods,Unit Price,Shipping Cost,Customer ID,Vendor,Order Status,Product Category,Brand,Product Description,Product Base Margin,Order Date,Profit,Quantity,Sales,Order ID,Month,Year
0,171003,76.0,99.0,15.0,VVIP1017,DIRECT,finished,Clothing,Madewell,Madewell Trevi Drapey Blazer Suit Black 2 Last...,8%,2017-01-01,8.0,1.0,99.0,2017013,1.0,2017.0
1,171004,87.0,120.0,22.5,L1631,DIRECT,finished,Shoes,UGG,UGG Australia – Joey Leather & Genuine Shearli...,9%,2017-01-01,10.5,1.0,120.0,2017014,1.0,2017.0
2,171005,87.0,120.0,22.5,L1631,DIRECT,finished,Shoes,UGG,Ugg red short paragraph 7,9%,2017-01-01,10.5,1.0,120.0,2017015,1.0,2017.0
3,171006,113.0,193.0,15.0,H1270,DIRECT,finished,Clothing,Icebreaker,Set of icebreaker ladies zipper blouse grey s ...,34%,2017-01-01,65.0,1.0,193.0,2017016,1.0,2017.0
4,171007,33.0,107.0,15.0,L1609,DIRECT,finished,Clothing,Anthropologies,Anthropological powder purple sweater s,55%,2017-01-01,59.0,1.0,107.0,2017017,1.0,2017.0
5,171008,33.0,107.0,15.0,L1367,DIRECT,finished,Clothing,Anthropologies,Anthropological powder purple sweater m short ...,55%,2017-01-01,59.0,1.0,107.0,2017018,1.0,2017.0
6,171009,33.0,107.0,15.0,VVIP1017,DIRECT,finished,Clothing,Anthropologies,Anthropological powder purple sweater s,55%,2017-01-01,59.0,1.0,107.0,2017019,1.0,2017.0
7,171010,36.0,64.0,7.5,L1126,DIRECT,finished,Clothing,Tory Burch,Tory sports white top s number,32%,2017-01-01,20.5,1.0,64.0,20170110,1.0,2017.0
8,171011,33.0,107.0,15.0,L1288,DIRECT,finished,Clothing,Anthropologies,Anthropological powder purple sweater s,55%,2017-01-01,59.0,1.0,107.0,20170111,1.0,2017.0
9,171012,31.0,80.0,7.5,L1126,DIRECT,finished,Clothing,Scotch Soda,Scotch soda camel hat,52%,2017-01-01,41.5,1.0,80.0,20170112,1.0,2017.0


## Building a customer table

Let's learn more about Lei's customers.

In [5]:
# Frequency: how many times each customer has purchased from Lei, obtained by
# counting the 'Order ID' entries recorded for every 'Customer ID'.

df_orders_customer = (
    df_data
    .groupby('Customer ID', as_index=False)['Order ID']
    .count()
    .rename(columns={'Order ID': 'Number Orders'})
)

In [6]:
# Monetary: how much money each customer has spent over time, i.e. the sum of
# 'Sales' across all of the customer's orders.

df_total_spend = (
    df_data
    .groupby('Customer ID', as_index=False)['Sales']
    .sum()
    .rename(columns={'Sales': 'Total Spent'})
)

In [7]:
# Total number of items each customer bought: the sum of 'Quantity' over all
# of the customer's orders.

df_total_items = df_data.groupby('Customer ID', as_index=False)['Quantity'].sum()
df_total_items.columns = [
    'Customer ID',
    'Total Items',
]

In [8]:
# When was each customer's first order, and how many days lie between that order
# and the most recent order date in the dataset (which stands in for "today")?

df_data['Order Date'] = pd.to_datetime(df_data['Order Date'])
dt_today = df_data['Order Date'].max()

df_earliest_order = df_data.groupby(by='Customer ID', as_index=False)['Order Date'].min()
df_earliest_order.columns = ['Customer ID', 'Earliest Order']

# `.astype('timedelta64[D]')` is rejected by pandas >= 2.0 (day resolution is no
# longer a supported timedelta unit), so extract whole days with `.dt.days`.
# The float64 cast keeps the dtype that the old conversion produced.
# The `1 +` counts the reference day itself, so a first order on `dt_today` is 1 day.
df_earliest_order['Days as customer'] = 1 + (
    dt_today - df_earliest_order['Earliest Order']
).dt.days.astype('float64')

df_earliest_order.head()

Unnamed: 0,Customer ID,Earliest Order,Days as customer
0,DL1004,2018-02-01,29.0
1,DL1005,2017-03-01,366.0
2,DL1006,2017-06-01,274.0
3,DL1007,2017-01-01,425.0
4,DL1009,2017-05-01,305.0


In [9]:
# Recency: when was each customer's last order, and how many days have passed
# between that order and `dt_today` (the latest order date in the dataset)?

df_last_order = df_data.groupby(by='Customer ID', as_index=False)['Order Date'].max()
df_last_order.columns = ['Customer ID', 'Last Purchase']

# `.astype('timedelta64[D]')` is rejected by pandas >= 2.0 (day resolution is no
# longer a supported timedelta unit), so extract whole days with `.dt.days`.
# The float64 cast keeps the dtype that the old conversion produced.
# The `1 +` means a purchase made on `dt_today` counts as 1 day without purchase.
df_last_order['Days without purchase'] = 1 + (
    dt_today - df_last_order['Last Purchase']
).dt.days.astype('float64')
df_last_order.head()

Unnamed: 0,Customer ID,Last Purchase,Days without purchase
0,DL1004,2018-03-01,1.0
1,DL1005,2018-03-01,1.0
2,DL1006,2017-06-01,274.0
3,DL1007,2018-03-01,1.0
4,DL1009,2017-10-01,152.0


In [10]:
# Assemble one row per customer by joining every per-customer summary table
# on 'Customer ID'.

df_purchases = df_earliest_order.merge(df_last_order, on='Customer ID')
df_customers = (
    df_orders_customer
    .merge(df_total_spend, on='Customer ID')
    .merge(df_total_items, on='Customer ID')
    .merge(df_purchases, on='Customer ID')
)

# 'Customer ID' is kept as a regular column rather than being used as the index.
df_customers.head(10)

Unnamed: 0,Customer ID,Number Orders,Total Spent,Total Items,Earliest Order,Days as customer,Last Purchase,Days without purchase
0,DL1004,4,903.0,4.0,2018-02-01,29.0,2018-03-01,1.0
1,DL1005,9,2581.0,9.0,2017-03-01,366.0,2018-03-01,1.0
2,DL1006,5,200.0,5.0,2017-06-01,274.0,2017-06-01,274.0
3,DL1007,28,5824.0,28.0,2017-01-01,425.0,2018-03-01,1.0
4,DL1009,4,932.0,4.0,2017-05-01,305.0,2017-10-01,152.0
5,DL1011,1,398.0,1.0,2018-02-01,29.0,2018-02-01,29.0
6,DL1012,4,3598.0,15.0,2017-08-01,213.0,2018-03-01,1.0
7,DL1013,6,1131.0,6.0,2017-10-01,152.0,2018-01-01,60.0
8,DL1014,3,1165.0,3.0,2018-02-01,29.0,2018-03-01,1.0
9,DL1015,3,963.0,3.0,2018-01-01,60.0,2018-02-01,29.0


In [11]:
# Keep only the derived day counts; the raw first/last order dates are not
# needed for the RFM scoring.
date_columns = ['Earliest Order', 'Last Purchase']
df_rfm = df_customers.drop(columns=date_columns)

df_rfm.head()

Unnamed: 0,Customer ID,Number Orders,Total Spent,Total Items,Days as customer,Days without purchase
0,DL1004,4,903.0,4.0,29.0,1.0
1,DL1005,9,2581.0,9.0,366.0,1.0
2,DL1006,5,200.0,5.0,274.0,274.0
3,DL1007,28,5824.0,28.0,425.0,1.0
4,DL1009,4,932.0,4.0,305.0,152.0


## RFM Analysis

RFM (Recency, Frequency, Monetary) analysis is a customer segmentation technique that uses past purchase behavior to divide customers into groups. RFM helps divide customers into various categories or clusters to identify customers who are more likely to respond to promotions and also for future personalization services.

* RECENCY (R): Days since last purchase => df_customers['Days without purchase']
* FREQUENCY (F): Total number of purchases => df_customers['Number Orders']
* MONETARY VALUE (M): Total money this customer spent  => df_customers['Total Spent']

In [12]:
# Quartile cut-offs (25%, 50%, 75%) of every numeric customer metric; these are
# the thresholds used by the scoring functions.
# numeric_only=True is passed explicitly: since pandas 2.0 the default is False,
# and the string-valued 'Customer ID' column would make the call fail.
quantiles = df_rfm.quantile(q=[0.25, 0.5, 0.75], numeric_only=True)
quantiles

Unnamed: 0,Number Orders,Total Spent,Total Items,Days as customer,Days without purchase
0.25,1.0,146.0,1.0,213.0,60.0
0.5,2.0,336.0,2.0,305.0,244.0
0.75,3.0,733.0,3.0,366.0,305.0


In [13]:
# Show the quartile cut-offs as a nested dict of the form {column: {quantile: value}}.
quantiles.to_dict()

{'Days as customer': {0.25: 213.0, 0.5: 305.0, 0.75: 366.0},
 'Days without purchase': {0.25: 60.0, 0.5: 244.0, 0.75: 305.0},
 'Number Orders': {0.25: 1.0, 0.5: 2.0, 0.75: 3.0},
 'Total Items': {0.25: 1.0, 0.5: 2.0, 0.75: 3.0},
 'Total Spent': {0.25: 146.0, 0.5: 336.0, 0.75: 733.0}}

In [14]:
def loyal_customer(row):
    """Score a customer's recency from 4 (most recent buyer) down to 1.

    `row` must provide 'Days without purchase'. The value is compared against
    the 25%, 50% and 75% cut-offs stored in the module-level `quantiles` under
    the same column name; fewer days without a purchase yields a higher score.
    """
    idle_days = row['Days without purchase']
    cutoffs = quantiles['Days without purchase']
    for score, level in ((4, 0.25), (3, 0.50), (2, 0.75)):
        if idle_days <= cutoffs[level]:
            return score
    # Above the 75% cut-off: the longest-inactive quarter of customers.
    return 1

In [15]:
# DORMANT CUSTOMER 
def dormant_customer(row):
    """Flag long-standing customers who have stopped buying.

    Returns 'Yes' when both conditions hold, otherwise 'No':
      * 'Days as customer' is at or above the median of that column, and
      * 'Days without purchase' is at or above the median of *that* column.

    The thresholds are read from the module-level `quantiles`.

    NOTE(review): the inactivity check previously compared
    'Days without purchase' against the median of 'Days as customer'. Since a
    customer's days without purchase can never exceed their days as customer,
    that made the first condition redundant, which looks like a copy-paste
    slip. Confirm the intended threshold with the analysis owner.
    """
    long_standing = row['Days as customer'] >= quantiles['Days as customer'][0.50]
    inactive = row['Days without purchase'] >= quantiles['Days without purchase'][0.50]
    if long_standing and inactive:
        return 'Yes'
    return 'No'

In [16]:
# MONETARY VALUE (M): Total money this customer spent => df_customers['Total Spent']

def big_spender(row):
    """Score a customer's total spend from 1 (lowest quarter) to 4 (highest).

    `row` must provide 'Total Spent'. The value is compared against the 25%,
    50% and 75% cut-offs stored in the module-level `quantiles` under the
    same column name; higher spend yields a higher score.
    """
    spent = row['Total Spent']
    bounds = quantiles['Total Spent']
    if spent <= bounds[0.25]:
        return 1
    if spent <= bounds[0.50]:
        return 2
    return 3 if spent <= bounds[0.75] else 4

In [17]:
# FREQUENCY VALUE (F):  Total number of purchases => df_customers['Number Orders']

def many_orders(row):
    """Score a customer's order count from 1 (fewest orders) to 4 (most).

    `row` must provide 'Number Orders'. The value is compared against the
    25%, 50% and 75% cut-offs stored in the module-level `quantiles` under
    the same column name; more orders yields a higher score.
    """
    order_count = row['Number Orders']
    cutoffs = quantiles['Number Orders']
    ranked_levels = ((1, 0.25), (2, 0.50), (3, 0.75))
    # First cut-off the count fits under decides the score; 4 if it exceeds all.
    return next(
        (score for score, level in ranked_levels if order_count <= cutoffs[level]),
        4,
    )

In [19]:

# Apply each row-wise scoring function and store its result in its own column.
segment_scorers = {
    'Loyal Customer': loyal_customer,
    'Many Orders': many_orders,
    'Big Spender': big_spender,
    'Dormant Customer': dormant_customer,
}
for column_name, scorer in segment_scorers.items():
    df_rfm[column_name] = df_rfm.apply(scorer, axis=1)

# The RFM score is the three 1-4 scores concatenated as text, e.g. '444'.
recency_code = df_rfm['Loyal Customer'].map(str)
frequency_code = df_rfm['Many Orders'].map(str)
monetary_code = df_rfm['Big Spender'].map(str)
df_rfm['RFMScore'] = recency_code + frequency_code + monetary_code

# Persist the scored customer table (without the index) as a CSV file.
df_rfm.to_csv('data/complete_customers.csv', index=False)

df_rfm.head()

Unnamed: 0,Customer ID,Number Orders,Total Spent,Total Items,Days as customer,Days without purchase,Loyal Customer,Many Orders,Big Spender,Dormant Customer,RFMScore
0,DL1004,4,903.0,4.0,29.0,1.0,4,4,4,No,444
1,DL1005,9,2581.0,9.0,366.0,1.0,4,4,4,No,444
2,DL1006,5,200.0,5.0,274.0,274.0,2,4,2,No,242
3,DL1007,28,5824.0,28.0,425.0,1.0,4,4,4,No,444
4,DL1009,4,932.0,4.0,305.0,152.0,3,4,4,No,344


## How many Customers do we have in each segment?


In [20]:
# Count the customers that fall into each named segment.
rfm_codes = df_rfm['RFMScore']

# Top score on recency, frequency and monetary value.
print("Best Customers: ", int((rfm_codes == '444').sum()))
print('Loyal Customers: ', int((df_rfm['Loyal Customer'] == 4).sum()))
print("Big Spenders: ", int((df_rfm['Big Spender'] == 4).sum()))

# Recency score 2 combined with frequency/monetary scores of 4/4 or 3/3.
almost_lost = int(rfm_codes.isin(['244', '233']).sum())
print('Almost Lost: ', almost_lost)

# Recency score 1 combined with frequency/monetary scores of 4/4 or 3/3.
lost_customers = int(rfm_codes.isin(['144', '133']).sum())
print('Lost Customers: ',lost_customers)

# Lowest score on all three dimensions.
print('Lost Cheap Customers: ', int((rfm_codes == '111').sum()))

Best Customers:  53
Loyal Customers:  182
Big Spenders:  145
Almost Lost:  20
Lost Customers:  6
Lost Cheap Customers:  45
