## Import Libraries

In [209]:
import pandas as pd
from matplotlib import pyplot as plt
import datetime as dt
import numpy as np
import math

## Load dataset

In [175]:
# Read the assessment export, then print its shape, column labels and dtypes.
df = pd.read_csv('SampleAssessment.csv')
for overview in (df.shape, df.columns.tolist(), df.dtypes):
    print(overview)

(10000, 11)
['customer_id', 'First Time', 'Recent Time', '# of Orders', '# of Orders in last 7 days', '# of Orders in last 4 weeks', 'Amount', 'Amount in last 7 days', 'Amount in last 4 weeks', 'Avg_DistanceFromResturant', 'Avg_DeliveryTime']
customer_id                      int64
First Time                      object
Recent Time                     object
# of Orders                      int64
# of Orders in last 7 days     float64
# of Orders in last 4 weeks    float64
Amount                           int64
Amount in last 7 days            int64
Amount in last 4 weeks           int64
Avg_DistanceFromResturant      float64
Avg_DeliveryTime                 int64
dtype: object


## Convert object columns to datetime

In [176]:
# Parse both timestamp columns in one column-wise pass, then show the dtypes.
cols = ['First Time', 'Recent Time']
df[cols] = df[cols].apply(pd.to_datetime)
df.dtypes

customer_id                             int64
First Time                     datetime64[ns]
Recent Time                    datetime64[ns]
# of Orders                             int64
# of Orders in last 7 days            float64
# of Orders in last 4 weeks           float64
Amount                                  int64
Amount in last 7 days                   int64
Amount in last 4 weeks                  int64
Avg_DistanceFromResturant             float64
Avg_DeliveryTime                        int64
dtype: object

## count of unique vs null values

In [177]:
# Per-column count of distinct (non-null) values and of nulls.
# Built in one shot from vectorised reductions instead of growing an empty
# frame row by row, which is slow and leaves the counts in object columns.
summary_df = pd.DataFrame({
    'num_unique': df.nunique(),
    'num_nulls': df.isna().sum(),
})
summary_df

Unnamed: 0,num_unique,num_nulls
customer_id,10000,0
First Time,9818,0
Recent Time,9759,0
# of Orders,118,0
# of Orders in last 7 days,14,8077
# of Orders in last 4 weeks,39,5659
Amount,3910,0
Amount in last 7 days,981,0
Amount in last 4 weeks,1916,0
Avg_DistanceFromResturant,66,0


### searching for null values

In [178]:
# Number of missing values in each column.
df.isna().sum()

customer_id                       0
First Time                        0
Recent Time                       0
# of Orders                       0
# of Orders in last 7 days     8077
# of Orders in last 4 weeks    5659
Amount                            0
Amount in last 7 days             0
Amount in last 4 weeks            0
Avg_DistanceFromResturant         0
Avg_DeliveryTime                  0
dtype: int64

### Only the below columns have null values, replace nulls with zeros
* '# of Orders in last 7 days'
* '# of Orders in last 4 weeks'

In [179]:
# A missing recent-order count is replaced with 0.
cols = ['# of Orders in last 7 days', '# of Orders in last 4 weeks']
# Assign the filled columns back rather than calling fillna(inplace=True) on
# df[column]: the in-place form operates on a temporary selection, which
# warns in pandas 2.x and is a silent no-op under Copy-on-Write.
df[cols] = df[cols].fillna(0)
# Sanity check: both prints should report zero remaining null rows.
print(df[df['# of Orders in last 7 days'].isna()].shape)
print(df[df['# of Orders in last 4 weeks'].isna()].shape)

(0, 11)
(0, 11)


## Adding average order value column

In [180]:
# Average spend per order: total amount divided by the number of orders.
df['average_cost'] = df['Amount'].div(df['# of Orders'])

## Adding a time difference column

In [181]:
# Span between a customer's first and most recent order timestamps.
df['time_diff'] = df['Recent Time'].sub(df['First Time'])

In [182]:
# Spot-check five randomly chosen rows, including the derived columns.
df.sample(n=5)

Unnamed: 0,customer_id,First Time,Recent Time,# of Orders,# of Orders in last 7 days,# of Orders in last 4 weeks,Amount,Amount in last 7 days,Amount in last 4 weeks,Avg_DistanceFromResturant,Avg_DeliveryTime,average_cost,time_diff
494,192338,2015-07-07 10:05:00,2015-12-12 05:14:00,28,0.0,2.0,14574,0,812,3.5,36,520.5,157 days 19:09:00
7223,537629,2015-08-06 15:06:00,2015-08-07 12:56:00,1,0.0,0.0,346,0,0,1.7,30,346.0,0 days 21:50:00
6806,408372,2015-07-25 02:16:00,2015-07-28 10:31:00,1,0.0,0.0,421,0,0,4.1,19,421.0,3 days 08:15:00
7963,1054883,2015-12-07 10:42:00,2015-12-11 20:48:00,1,1.0,1.0,508,508,508,2.5,31,508.0,4 days 10:06:00
7614,979246,2015-12-04 05:15:00,2015-12-06 18:15:00,1,0.0,1.0,53,0,53,1.3,29,53.0,2 days 13:00:00


In [183]:
# Show the column labels, which now include average_cost and time_diff.
df.columns

Index(['customer_id', 'First Time', 'Recent Time', '# of Orders',
       '# of Orders in last 7 days', '# of Orders in last 4 weeks', 'Amount',
       'Amount in last 7 days', 'Amount in last 4 weeks',
       'Avg_DistanceFromResturant', 'Avg_DeliveryTime', 'average_cost',
       'time_diff'],
      dtype='object')

In [184]:
# Earliest and latest timestamp for each of the two order-time columns.
for time_col in ('First Time', 'Recent Time'):
    print(df[time_col].min())
    print(df[time_col].max())

2015-06-05 17:21:00
2016-01-07 12:42:00
2015-06-08 22:07:00
2016-01-13 05:54:00


## Generating RMF (Recency, Monetary, Frequency) metrics

In [185]:
# Rename total order count and total spend to the names used by the scoring cells.
rename_dict = {"# of Orders": "Frequency", "Amount": "Monetory"}
df = df.rename(columns=rename_dict)

# Capture the reference instant once so every row is measured against the same
# "now" (the previous per-row utcnow() call drifted during the apply and
# datetime.utcnow() is deprecated). The value is a naive UTC timestamp to match
# the naive 'Recent Time' column.
# NOTE(review): Recency is still relative to the wall clock, so its values grow
# on every re-run; consider a fixed snapshot date for reproducible output.
reference_time = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)

# Whole days elapsed since the customer's most recent order.
df['Recency'] = (reference_time - df['Recent Time']).dt.days

In [186]:
# Show the dtypes after the rename and the addition of the Recency column.
df.dtypes

customer_id                              int64
First Time                      datetime64[ns]
Recent Time                     datetime64[ns]
Frequency                                int64
# of Orders in last 7 days             float64
# of Orders in last 4 weeks            float64
Monetory                                 int64
Amount in last 7 days                    int64
Amount in last 4 weeks                   int64
Avg_DistanceFromResturant              float64
Avg_DeliveryTime                         int64
average_cost                           float64
time_diff                      timedelta64[ns]
Recency                                  int64
dtype: object

## Creating quantiles

In [187]:
# Quantile levels that bound the five scoring buckets.
quantile_baskets = [0.2, 0.4, 0.6, 0.8, 1]
# {column: {quantile level: cut-off value}} for the numeric columns only.
# numeric_only=True is spelled out because pandas 2.0 changed the default to
# False; without it the datetime/timedelta columns (and the string RMF column
# if this cell is re-run later) are fed into the computation.
df_quantiles_dict = df.quantile(quantile_baskets, numeric_only=True).to_dict()

In [188]:
# Dump the quantile cut-offs as {column: {quantile level: value}}.
print(df_quantiles_dict)

{'customer_id': {0.2: 267988.4, 0.4: 532701.8, 0.6: 802909.6, 0.8: 1075403.8, 1.0: 1355445.0}, 'Frequency': {0.2: 1.0, 0.4: 2.0, 0.6: 3.0, 0.8: 9.0, 1.0: 212.0}, '# of Orders in last 7 days': {0.2: 0.0, 0.4: 0.0, 0.6: 0.0, 0.8: 0.0, 1.0: 14.0}, '# of Orders in last 4 weeks': {0.2: 0.0, 0.4: 0.0, 0.6: 1.0, 0.8: 2.0, 1.0: 46.0}, 'Monetory': {0.2: 218.0, 0.4: 495.0, 0.6: 1036.0, 0.8: 2689.2000000000007, 1.0: 138808.0}, 'Amount in last 7 days': {0.2: 0.0, 0.4: 0.0, 0.6: 0.0, 0.8: 0.0, 1.0: 10150.0}, 'Amount in last 4 weeks': {0.2: 0.0, 0.4: 0.0, 0.6: 66.0, 0.8: 567.0, 1.0: 26853.0}, 'Avg_DistanceFromResturant': {0.2: 1.5, 0.4: 2.1, 0.6: 2.6, 0.8: 3.2, 1.0: 5.9}, 'Avg_DeliveryTime': {0.2: 24.0, 0.4: 32.0, 0.6: 41.0, 0.8: 50.0, 1.0: 83.0}, 'average_cost': {0.2: 111.0, 0.4: 221.65882352941185, 0.6: 348.4571428571428, 0.8: 535.3632183908047, 1.0: 9906.5}, 'Recency': {0.2: 1013.0, 0.4: 1031.0, 0.6: 1059.0, 0.8: 1102.0, 1.0: 1195.0}}


In [189]:
def r_score(curr_value, quantile_dict):
    """Score a value by the quantile bucket it falls in; small values score 1.

    The bucket is derived from ``quantile_dict`` itself (sorted by quantile
    level) rather than from a module-level list, so the function works with
    any set of quantile levels.

    Args:
        curr_value: The value to score (e.g. days since the last order).
        quantile_dict: Mapping of quantile level -> cut-off value, e.g.
            ``{0.2: 1013.0, 0.4: 1031.0, ..., 1.0: 1195.0}``.

    Returns:
        The 1-based position of the first cut-off, in ascending quantile
        order, that ``curr_value`` does not exceed. Values above every
        cut-off are clamped to the last bucket. ``None`` is returned for NaN
        input or an empty ``quantile_dict``.
    """
    if curr_value != curr_value:  # NaN is the only value unequal to itself
        return None
    cutoffs = [quantile_dict[level] for level in sorted(quantile_dict)]
    for bucket, cutoff in enumerate(cutoffs, start=1):
        if curr_value <= cutoff:
            return bucket
    # Larger than the top cut-off: clamp to the last bucket instead of
    # silently falling through and returning None.
    return len(cutoffs) if cutoffs else None
    
def f_m_score(curr_value, quantile_dict):
    """Score a value by the quantile bucket it falls in; large values score 1.

    This is the mirror image of a smallest-is-1 ranking: the top bucket gets
    score 1 and the bottom bucket gets ``len(quantile_dict)``. The bucket is
    derived from ``quantile_dict`` itself (sorted by quantile level) rather
    than from a module-level list.

    Args:
        curr_value: The value to score (e.g. order count or total spend).
        quantile_dict: Mapping of quantile level -> cut-off value, e.g.
            ``{0.2: 1.0, 0.4: 2.0, ..., 1.0: 212.0}``.

    Returns:
        ``len(quantile_dict)`` minus the 0-based position of the first
        cut-off, in ascending quantile order, that ``curr_value`` does not
        exceed. Values above every cut-off are clamped to score 1. ``None``
        is returned for NaN input or an empty ``quantile_dict``.
    """
    if curr_value != curr_value:  # NaN is the only value unequal to itself
        return None
    cutoffs = [quantile_dict[level] for level in sorted(quantile_dict)]
    for position, cutoff in enumerate(cutoffs):
        if curr_value <= cutoff:
            return len(cutoffs) - position
    # Larger than the top cut-off: clamp to the top score instead of
    # silently falling through and returning None.
    return 1 if cutoffs else None

In [190]:
# (score column, source metric column, scoring function), in the order the
# score columns are added to the frame.
score_plan = (
    ('R', 'Recency', r_score),
    ('M', 'Monetory', f_m_score),
    ('F', 'Frequency', f_m_score),
)
for score_col, metric_col, scorer in score_plan:
    metric_cutoffs = df_quantiles_dict[metric_col]
    df[score_col] = df[metric_col].apply(scorer, args=(metric_cutoffs,))

# Concatenate the three scores into a single segment code such as '111'.
df['RMF'] = df['R'].map(str) + df['M'].map(str) + df['F'].map(str)

In [191]:
# Inspect the (quantile level, cut-off) pairs used when scoring Recency.
df_quantiles_dict['Recency'].items()

dict_items([(0.2, 1013.0), (0.4, 1031.0), (0.6, 1059.0), (0.8, 1102.0), (1.0, 1195.0)])

In [192]:
# Spot-check five randomly chosen rows, now including the R, M, F and RMF columns.
df.sample(n=5)

Unnamed: 0,customer_id,First Time,Recent Time,Frequency,# of Orders in last 7 days,# of Orders in last 4 weeks,Monetory,Amount in last 7 days,Amount in last 4 weeks,Avg_DistanceFromResturant,Avg_DeliveryTime,average_cost,time_diff,Recency,R,M,F,RMF
607,1164911,2015-07-04 02:38:00,2015-11-24 03:03:00,25,0.0,1.0,1769,0,55,2.2,49,70.76,143 days 00:25:00,1027,2,2,1,221
3620,1160659,2015-07-09 05:12:00,2015-09-09 00:57:00,4,0.0,0.0,1225,0,0,3.5,56,306.25,61 days 19:45:00,1103,5,2,2,522
973,79426,2015-07-13 10:31:00,2015-10-06 00:32:00,18,0.0,0.0,4212,0,0,2.3,49,234.0,84 days 14:01:00,1076,4,1,1,411
3972,404938,2015-07-18 04:45:00,2015-08-29 09:52:00,4,0.0,0.0,845,0,0,1.2,19,211.25,42 days 05:07:00,1114,5,3,2,532
4583,731799,2015-09-25 23:51:00,2015-12-13 16:58:00,3,1.0,1.0,4141,2013,2013,2.9,38,1380.333333,78 days 17:07:00,1008,1,1,3,113


In [196]:
# Shape of the subset whose RMF code is '111' (score 1 on all three metrics).
df.loc[df['RMF'].eq('111')].shape

(647, 18)

In [218]:
window = 7


def buyer_propensity(mean, window):
    """Evaluate the Poisson probability mass function at ``window``.

    Computes ``exp(-mean) * mean**window / window!``. The factorial is taken
    from the ``window`` argument, so the result is correct for any window
    (previously the denominator was a module-level constant fixed at 7!,
    silently ignoring the argument).

    Args:
        mean: Poisson rate; a scalar or a NumPy array / pandas Series.
            NOTE(review): callers pass "days between first and last order /
            number of orders" here -- confirm that is the intended rate.
        window: Non-negative integer at which the mass function is evaluated.

    Returns:
        The probability mass, with the same shape as ``mean``.
    """
    return np.exp(-mean) * np.power(mean, window) / math.factorial(window)


# Whole days between first and most recent order, divided by the order count.
df['mean'] = df['time_diff'].dt.days / df['Frequency']
# Use bracket access: `df.mean` resolves to the DataFrame.mean *method*, not
# the 'mean' column, so `df.mean.apply(...)` raises AttributeError.
df['7_days_propensity'] = df['mean'].apply(lambda x: buyer_propensity(x, window))

In [220]:
# Shape of the subset whose 7-day propensity exceeds 0.1.
df.loc[df['7_days_propensity'].gt(0.1)].shape

(1637, 20)

In [211]:
# Scratch check of math.factorial (4! = 24); the result is not stored or reused.
math.factorial(4)

24