In [1]:
import pandas as pd
import numpy as np
from random import randrange
import seaborn as sb
from datetime import date,timedelta
%pylab inline 

Populating the interactive namespace from numpy and matplotlib


In [18]:
def random_date(start, end):
    """
    This function returns a random datetime between two datetime
    objects, drawn from the half-open interval [start, end).

    The random offset has whole-second resolution; any sub-second part of
    ``end - start`` is ignored.

    :param
    start: datetime-like lower bound (inclusive)
    end: datetime-like upper bound (exclusive)

    :return:
    ``start`` plus a random whole number of seconds. ``start`` itself is
    returned when the interval spans less than one second.

    :raises ValueError: if ``end`` is earlier than ``start``
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    if int_delta < 0:
        raise ValueError('end must not be earlier than start')
    if int_delta == 0:
        # randrange(0) raises ValueError; an empty span has exactly one
        # candidate value, so return it instead of failing.
        return start
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)


def create_random_data_set(timeframe, num_clients, n, num_products, avg_sale):
    """
    create_random_data_set simulates a sample transaction set to play with.

    :param
    timeframe:    pair (start, end) of values accepted by ``pd.to_datetime``;
                  purchase dates are drawn between them with ``random_date``
    num_clients:  client ids are drawn uniformly from 0 .. num_clients - 1
    n:            number of transactions (rows) to generate
    num_products: product ids are drawn uniformly from 0 .. num_products - 1
    avg_sale:     scale of the sale amount; amounts are uniform on
                  [0, avg_sale), so this is an upper bound and the mean
                  is avg_sale / 2

    :return:
    frame_out: pandas DataFrame with columns sales, date, client_id,
               product_id and client_name, sorted by date
    """
    # Parse the bounds once instead of re-parsing both strings for every row.
    start = pd.to_datetime(timeframe[0])
    end = pd.to_datetime(timeframe[1])

    frame_out = pd.DataFrame(index=range(0, n))
    frame_out['sales'] = np.random.rand(n) * avg_sale
    frame_out['date'] = [random_date(start, end) for _ in range(n)]
    # One vectorized draw per column rather than n single-value calls.
    frame_out['client_id'] = np.random.randint(0, num_clients, size=n)
    frame_out['product_id'] = np.random.randint(0, num_products, size=n)
    frame_out['client_name'] = 'Generic name'
    return frame_out.sort_values('date')


def assign_segment(frame_in):
    """
    assign_segment attaches a segment label to every client in frame_in.

    Each distinct client_id receives one label drawn at random from
    segment_0 .. segment_8; the label is then joined back onto the input rows.

    :param
    frame_in: pandas DataFrame containing a client_id column

    :return:
    pandas DataFrame with the columns of frame_in plus a segment column
    """
    labels = ['segment_' + str(idx) for idx in range(9)]
    clients = frame_in['client_id'].unique()

    lookup = pd.DataFrame(list(clients), columns=['client_id'])
    lookup['segment'] = np.random.choice(labels, len(clients))

    return pd.merge(frame_in, lookup, on='client_id')


def run_RFM_analysis(frame, n_groups, alpha):
    """
    run_RFM_analysis runs the whole pipeline: score the clients, assign a
    segment to each of them, then attach the per-client sales totals.

    :param
    frame:    pandas DataFrame of transactions, forwarded to create_scores
              and create_other_vars
    n_groups: number of quantile groups, forwarded to create_scores
    alpha:    metric weights, forwarded to create_scores

    :return:
    pandas DataFrame: the segmented scores inner-joined one-to-one with the
    per-client variables on client_id
    """
    segmented = assign_segment(create_scores(frame, n_groups, alpha))
    extras = create_other_vars(frame)

    return pd.merge(
        segmented,
        extras,
        on='client_id',
        how='inner',
        validate='1:1',
    )

def create_other_vars(frame_in):
    """
    create_other_vars computes the total sales of every client.

    :param
    frame_in: pandas DataFrame with at least client_id and sales columns

    :return:
    other_vars: pandas DataFrame with one row per client_id and its summed sales
    """
    # Select 'sales' before aggregating: summing every column would also try
    # to sum the date column, which pandas >= 2.0 rejects with a TypeError.
    other_vars = (
        frame_in.groupby('client_id')['sales']
        .sum()
        .to_frame(name='sales')
        .reset_index()
    )

    return other_vars

def create_scores(frame_in, groups, weights, today=None):
    """
    create_scores creates RFM (plus age) metrics and quantile scores per client.

    Raw metrics, one row per client_id:
      * recency   - whole 30-day months between the last purchase and today
      * age       - whole 30-day months between the first purchase and today
      * frequency - age divided by the client's number of transactions
      * monetary  - sum of sales

    Every metric is cut into ``groups`` quantile bins labelled 1..groups
    (a larger raw value gets a larger label) and the four labels are combined
    into a weighted score.

    :param
    frame_in: pandas DataFrame with client_id, date and sales columns
    groups:   number of quantile bins per metric
    weights:  sequence whose first four entries weight recency, frequency,
              monetary and age, in that order
    today:    optional reference date (anything ``pd.to_datetime`` accepts);
              defaults to the current date. Pass a fixed value to make the
              result reproducible.

    :return:
    scores: pandas DataFrame sorted by score (descending) with columns
            client_id, first_purchase, last_purchase, months_since_last,
            recency, frequency, monetary, age, recency_value,
            frequency_value, monetary_value, age_value, score, group, tenure

    :raises ValueError: from ``pd.qcut`` when the quantile edges of a metric
                        are not unique (duplicates='raise')
    """
    today = pd.to_datetime(date.today() if today is None else today)
    by_client = frame_in.groupby('client_id')

    def months_until_today(moments):
        # Whole 30-day months, truncated toward zero exactly like int(days / 30).
        return ((today - moments).dt.days / 30).astype('int64')

    # Aggregate only the column that is needed; reducing every column at once
    # fails on pandas >= 2.0 when the frame holds datetime or string columns.
    first_date = by_client['date'].min().to_frame(name='first_purchase')
    last_date = by_client['date'].max().to_frame(name='last_purchase')

    recency = months_until_today(last_date['last_purchase']).rename('recency')
    age = months_until_today(first_date['first_purchase']).rename('age')
    # Same quantity as recency, kept under its own column name for callers.
    time_since_last = recency.to_frame(name='months_since_last')
    monetary = by_client['sales'].sum().rename('monetary')
    frequency = (age / by_client.size()).rename('frequency')

    metrics = pd.concat([recency, frequency, monetary, age], axis=1)

    # NOTE(review): labels grow with the raw value, so clients whose last
    # purchase is long ago receive a HIGH recency score - confirm this
    # direction is intended.
    labels = list(range(1, groups + 1))
    scores = metrics.apply(
        lambda col: pd.qcut(col, q=groups, labels=labels, duplicates='raise').astype(int),
        axis=0)

    scores = pd.concat(
        [first_date, last_date, time_since_last, scores, metrics.add_suffix('_value')],
        axis=1)
    scores['score'] = (scores['recency'] * weights[0]
                       + scores['frequency'] * weights[1]
                       + scores['monetary'] * weights[2]
                       + scores['age'] * weights[3])
    scores['group'] = (scores['recency'].map(str) + scores['frequency'].map(str)
                       + scores['monetary'].map(str) + scores['age'].map(str))
    scores['tenure'] = age

    # reset_index turns the client_id index into a regular column.
    scores = scores.sort_values(by=['score'], ascending=False).reset_index()

    return scores


### 0.2 Parameters

In [19]:
# --- Sample data set --------------------------------------------------------
timeframe = ('2016-01-01', '2019-01-01')  # (start, end) bounds of the simulated purchase dates
num_clients = 100
num_products = 3
num_transactions = 100
avg_sale = 1000  # USD

# --- RFM analysis -----------------------------------------------------------
n_groups = 5  # quantile groups per metric
alpha = [
    0.3,  # recency
    0.3,  # frequency
    0.2,  # monetary
    0.2,  # age
]

### 1. Data creation 

This section simulates a sample transaction data set.

In [20]:
# Simulated transactions built from the parameters above.
data = create_random_data_set(
    timeframe=timeframe,
    num_clients=num_clients,
    n=num_transactions,
    num_products=num_products,
    avg_sale=avg_sale,
)
data.head()

Unnamed: 0,sales,date,client_id,product_id,client_name
46,23.141692,2016-01-11 01:34:41,34,0,Generic name
70,669.370624,2016-01-16 08:04:54,84,0,Generic name
21,500.984333,2016-01-17 01:32:47,11,1,Generic name
87,755.12779,2016-02-08 09:09:43,93,1,Generic name
84,400.154531,2016-02-09 17:04:39,90,1,Generic name


### 2. Performing RFM analysis

In [21]:
# Score every client on recency, frequency, monetary value and age.
scores = create_scores(frame_in=data, groups=n_groups, weights=alpha)
scores.head()

Unnamed: 0,client_id,first_purchase,last_purchase,months_since_last,recency,frequency,monetary,age,recency_value,frequency_value,monetary_value,age_value,score,group,tenure
0,74,2016-05-26 08:36:20,2016-05-26 08:36:20,40,5,5,4,5,40,40.0,990.218135,40,4.8,5545,40
1,81,2016-08-27 02:45:28,2016-08-27 02:45:28,37,5,5,4,4,37,37.0,912.502498,37,4.6,5544,37
2,7,2016-03-12 07:53:34,2016-03-12 07:53:34,43,5,5,3,5,43,43.0,757.887279,43,4.6,5535,43
3,84,2016-01-16 08:04:54,2016-01-16 08:04:54,45,5,5,3,5,45,45.0,669.370624,45,4.6,5535,45
4,60,2016-03-28 09:13:06,2016-10-14 04:30:02,36,5,3,5,5,36,21.0,1597.970382,42,4.4,5355,42


In [22]:
# Re-binds `scores` to its segmented version, so this cell is not idempotent:
# re-run the scoring cell before executing it again, otherwise the merge adds
# a second, suffixed segment column.
scores = assign_segment(frame_in=scores)
scores.head()

Unnamed: 0,client_id,first_purchase,last_purchase,months_since_last,recency,frequency,monetary,age,recency_value,frequency_value,monetary_value,age_value,score,group,tenure,segment
0,74,2016-05-26 08:36:20,2016-05-26 08:36:20,40,5,5,4,5,40,40.0,990.218135,40,4.8,5545,40,segment_4
1,81,2016-08-27 02:45:28,2016-08-27 02:45:28,37,5,5,4,4,37,37.0,912.502498,37,4.6,5544,37,segment_7
2,7,2016-03-12 07:53:34,2016-03-12 07:53:34,43,5,5,3,5,43,43.0,757.887279,43,4.6,5535,43,segment_8
3,84,2016-01-16 08:04:54,2016-01-16 08:04:54,45,5,5,3,5,45,45.0,669.370624,45,4.6,5535,45,segment_4
4,60,2016-03-28 09:13:06,2016-10-14 04:30:02,36,5,3,5,5,36,21.0,1597.970382,42,4.4,5355,42,segment_2


In [23]:
# Total sales per client.
other_vars = create_other_vars(frame_in=data)
other_vars.head()

Unnamed: 0,client_id,sales
0,0,679.828091
1,2,1164.89478
2,4,991.28937
3,5,1525.157485
4,7,757.887279


### 50. All in one run

In [24]:
# Scoring, segmentation and sales totals in a single call.
out_table = run_RFM_analysis(frame=data, n_groups=n_groups, alpha=alpha)
out_table.head()

Unnamed: 0,client_id,first_purchase,last_purchase,months_since_last,recency,frequency,monetary,age,recency_value,frequency_value,monetary_value,age_value,score,group,tenure,segment,sales
0,74,2016-05-26 08:36:20,2016-05-26 08:36:20,40,5,5,4,5,40,40.0,990.218135,40,4.8,5545,40,segment_4,990.218135
1,81,2016-08-27 02:45:28,2016-08-27 02:45:28,37,5,5,4,4,37,37.0,912.502498,37,4.6,5544,37,segment_6,912.502498
2,7,2016-03-12 07:53:34,2016-03-12 07:53:34,43,5,5,3,5,43,43.0,757.887279,43,4.6,5535,43,segment_2,757.887279
3,84,2016-01-16 08:04:54,2016-01-16 08:04:54,45,5,5,3,5,45,45.0,669.370624,45,4.6,5535,45,segment_8,669.370624
4,60,2016-03-28 09:13:06,2016-10-14 04:30:02,36,5,3,5,5,36,21.0,1597.970382,42,4.4,5355,42,segment_3,1597.970382


In [25]:
# Compact view: only the columns of interest.
out_table.loc[
    :,
    ['score', 'client_id', 'tenure', 'last_purchase', 'months_since_last', 'sales', 'segment'],
].head()

Unnamed: 0,score,client_id,tenure,last_purchase,months_since_last,sales,segment
0,4.8,74,40,2016-05-26 08:36:20,40,990.218135,segment_4
1,4.6,81,37,2016-08-27 02:45:28,37,912.502498,segment_6
2,4.6,7,43,2016-03-12 07:53:34,43,757.887279,segment_2
3,4.6,84,45,2016-01-16 08:04:54,45,669.370624,segment_8
4,4.4,60,42,2016-10-14 04:30:02,36,1597.970382,segment_3


### 80. For Dash

In [26]:
# Median raw RFM values per segment. The three metric columns are selected
# before aggregating so that the non-numeric columns of `scores` (e.g. the
# string `group` column) are never passed to median(), which pandas >= 2.0
# rejects with a TypeError.
table_3 = (
    scores.groupby('segment')[['recency_value', 'frequency_value', 'monetary_value']]
    .median()
    .reset_index()
)
table_3

Unnamed: 0,segment,recency_value,frequency_value,monetary_value
0,segment_0,25.0,25.0,762.172202
1,segment_1,25.0,15.0,700.438169
2,segment_2,31.0,25.25,762.090154
3,segment_3,25.0,24.0,557.116777
4,segment_4,26.5,20.75,754.544804
5,segment_5,14.0,14.5,715.142867
6,segment_6,20.5,10.0,1114.365281
7,segment_7,27.0,27.0,912.502498
8,segment_8,27.0,27.0,683.464691


In [42]:
# Share of total sales and share of clients per segment. Only
# `monetary_value` is summed: summing every column of `scores` would include
# the datetime columns, which pandas >= 2.0 rejects with a TypeError.
health = (
    scores.groupby('segment')['monetary_value'].sum() / scores['monetary_value'].sum()
).to_frame('sales_share')
# value_counts is indexed by segment, so the assignment aligns on the index.
health['size'] = scores['segment'].value_counts(normalize=True)
health.reset_index()

Unnamed: 0,segment,sales_share,size
0,segment_0,0.076027,0.061538
1,segment_1,0.15527,0.169231
2,segment_2,0.104164,0.092308
3,segment_3,0.126428,0.169231
4,segment_4,0.076767,0.092308
5,segment_5,0.104906,0.092308
6,segment_6,0.167492,0.123077
7,segment_7,0.11242,0.107692
8,segment_8,0.076526,0.092308


### 99. TODO 

In [155]:
# Remaining work: cut extreme values.