In [35]:
import pandas as pd
import numpy as np
from random import randrange
import seaborn as sb
from datetime import date,timedelta
%pylab inline 

Populating the interactive namespace from numpy and matplotlib


In [36]:
def random_date(start, end):
    """
    Return a random datetime between two datetime objects.

    The result is ``start`` plus a whole number of seconds drawn uniformly
    from ``[0, span)``, where ``span`` is the length of ``end - start`` in
    whole seconds. ``end`` itself is therefore never returned.

    :param
    start: lower bound; must support ``end - start`` and ``start + timedelta``
    end: upper bound; must not be earlier than ``start``

    :return:
    ``start`` shifted forward by the random number of seconds

    :raises ValueError: if ``end`` is earlier than ``start``
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    if int_delta < 0:
        raise ValueError('end must not be earlier than start')
    # randrange(0) raises ValueError, so a window shorter than one second
    # collapses to an offset of zero instead of failing.
    random_second = randrange(int_delta) if int_delta else 0
    return start + timedelta(seconds=random_second)


def create_random_data_set(timeframe, num_clients, n, num_products, avg_sale):
    """
    Simulate a sample transactions data set to play with.

    :param
    timeframe: pair (start, end) of values accepted by ``pd.to_datetime``;
               every transaction date is drawn between them with random_date
    num_clients: client ids are drawn from ``0 .. num_clients - 1``
    n: number of transactions (rows) to generate
    num_products: product ids are drawn from ``0 .. num_products - 1``
    avg_sale: scale of the sale amount. NOTE: amounts are computed as
              ``np.random.rand(n) * avg_sale``, i.e. uniform on
              ``[0, avg_sale)``, so this acts as an upper bound rather
              than as the mean.

    :return:
    frame_out: pandas DataFrame with columns sales, date, client_id,
               product_id and client_name, sorted by date (the original
               row labels are kept, so the index is no longer ordered)
    """
    # Parse the bounds once instead of on every single draw.
    start = pd.to_datetime(timeframe[0])
    end = pd.to_datetime(timeframe[1])

    frame_out = pd.DataFrame(index=range(n))
    # 1-D draw: a column holds one value per row, no (n, 1) matrix needed.
    frame_out['sales'] = np.random.rand(n) * avg_sale
    frame_out['date'] = [random_date(start, end) for _ in range(n)]
    frame_out['client_id'] = np.random.randint(0, num_clients, size=n)
    frame_out['product_id'] = np.random.randint(0, num_products, size=n)
    frame_out['client_name'] = 'Generic name'
    return frame_out.sort_values('date')


def assign_segment(frame_in):
    """
    Attach a segment label to every row of frame_in.

    Each distinct client_id receives one label picked at random (via
    ``np.random.choice``) from 'segment_0' .. 'segment_8'; the label is then
    joined back onto the input on client_id.

    :param
    frame_in: Pandas DataFrame with a client_id column

    :return:
    frame_in with an extra 'segment' column (result of ``pd.merge`` with the
    default join type)
    """
    client_ids = frame_in['client_id'].unique()
    labels = ['segment_' + str(number) for number in range(9)]

    lookup = pd.DataFrame(list(client_ids), columns=['client_id'])
    # One independent random draw per distinct client.
    lookup['segment'] = np.random.choice(labels, len(client_ids))

    return pd.merge(frame_in, lookup, on='client_id')


def run_RFM_analysis(frame, n_groups, alpha):
    """
    Run the whole RFM pipeline in one call.

    The frame is scored with create_scores, the scores are tagged with a
    segment by assign_segment, and the per-client totals produced by
    create_other_vars are joined on.

    :param
    frame:    Pandas DataFrame with core client info; it is handed to
              create_scores and create_other_vars, which read the
              client_id, sales and date columns
    n_groups: forwarded to create_scores as ``groups``
    alpha:    forwarded to create_scores as ``weights``

    :return:
    DataFrame with one row per client: the segmented scores inner-joined
    with the extra variables on client_id (validated as one-to-one)
    """
    segmented_scores = assign_segment(create_scores(frame, n_groups, alpha))
    client_totals = create_other_vars(frame)

    return segmented_scores.merge(client_totals, on='client_id', how='inner', validate='1:1')

def create_other_vars(frame_in):
    """
    Build the additional per-client variables (currently total sales).

    :param
    frame_in: Pandas DataFrame with client_id and sales columns

    :return:
    other_vars: DataFrame with columns client_id and sales, one row per
                client, where sales is the sum over the client's rows
    """
    # Pick the column before aggregating: summing the whole frame would also
    # try to add up date/text columns, which recent pandas versions reject.
    other_vars = frame_in.groupby('client_id')['sales'].sum().to_frame(name='sales')

    return other_vars.reset_index()

def create_scores(frame_in, groups, weights, reference_date=None):
    """
    Create RFM scores from sales data (frame_in).

    Per client the following raw metrics are computed, with "months" meaning
    whole 30-day periods counted back from the reference date:
      recency   - months since the last purchase
      age       - months since the first purchase
      monetary  - sum of sales
      frequency - age divided by the number of rows of the client
    Each metric is then cut into ``groups`` quantile bins labelled
    1..groups (``pd.qcut``); larger raw values receive larger labels.

    NOTE(review): because of that, clients whose last purchase lies further
    back get a *higher* recency score - confirm this is the intended
    direction.

    :param
    frame_in:   Pandas DataFrame with client_id, date and sales columns
    groups:     number of quantile bins per metric
    weights:    four weights applied to the recency, frequency, monetary
                and age scores (in that order) to build 'score'
    reference_date: optional value accepted by ``pd.to_datetime`` used as
                "today"; defaults to the current date, which makes results
                depend on the day the function is run

    :return:
    scores: DataFrame with one row per client, sorted by score (descending),
            with columns client_id, first_purchase, last_purchase,
            months_since_last, the four scores, the four '<metric>_value'
            raw metrics, score, group (the four scores concatenated as
            text) and tenure (same as age_value)

    :raises ValueError: from ``pd.qcut`` when the bin edges of a metric are
            not unique (duplicates='raise')
    """
    if reference_date is None:
        today = pd.to_datetime(date.today())
    else:
        today = pd.to_datetime(reference_date)

    def months_before_today(dates):
        # Column-wise equivalent of int(days / 30): truncates toward zero.
        return ((today - dates).dt.days / 30).astype(int)

    # Select the column before aggregating so unrelated columns (dates,
    # names) are never summed or compared.
    by_client = frame_in.groupby('client_id')
    first_date = by_client['date'].min().to_frame(name='first_purchase')
    last_date = by_client['date'].max().to_frame(name='last_purchase')

    months_since_last = months_before_today(last_date['last_purchase'])
    months_since_first = months_before_today(first_date['first_purchase'])

    time_since_last = months_since_last.to_frame(name='months_since_last')
    recency = months_since_last.to_frame(name='recency')
    age = months_since_first.to_frame(name='age')
    monetary = by_client['sales'].sum().to_frame(name='monetary')
    frequency = (months_since_first / by_client.size()).to_frame(name='frequency')

    metrics = pd.concat([recency, frequency, monetary, age], axis=1)
    bin_labels = list(range(1, groups + 1))
    scores = metrics.apply(
        lambda col: pd.qcut(col, q=groups, labels=bin_labels, duplicates='raise').astype(int),
        axis=0)

    scores = pd.concat(
        [first_date, last_date, time_since_last, scores, metrics.add_suffix('_value')],
        axis=1)
    scores['score'] = (scores['recency'] * weights[0] + scores['frequency'] * weights[1]
                       + scores['monetary'] * weights[2] + scores['age'] * weights[3])
    scores['group'] = (scores['recency'].map(str) + scores['frequency'].map(str)
                       + scores['monetary'].map(str) + scores['age'].map(str))
    scores['tenure'] = age['age']

    # reset_index turns the client_id index into a regular column.
    scores = scores.sort_values(by=['score'], ascending=False).reset_index()

    return scores


### 0.2 Parameters

In [37]:
# --- Sample data set ---
timeframe = ('2016-01-01', '2019-01-01')  # (start, end) of the simulated dates
num_clients = 100
num_products = 3
num_transactions = 100  # rows to generate (the `n` of create_random_data_set)
avg_sale = 1000  # USD

# --- RFM analysis ---
n_groups = 5  # quantile bins per metric
alpha = [0.3, 0.3, 0.2, 0.2]  # weights: recency, frequency, monetary, age

### 1. Data creation 

This section simulates a sample data set

In [38]:
# Build the simulated transactions frame from the parameters above.
data = create_random_data_set(
    timeframe=timeframe,
    num_clients=num_clients,
    n=num_transactions,
    num_products=num_products,
    avg_sale=avg_sale,
)
data.head()

Unnamed: 0,sales,date,client_id,product_id,client_name
19,199.886394,2016-01-01 20:30:22,80,0,Generic name
22,520.64823,2016-01-12 19:24:20,40,0,Generic name
95,427.975994,2016-01-14 04:05:40,98,2,Generic name
60,729.749636,2016-01-31 02:55:20,19,0,Generic name
58,611.746859,2016-02-12 13:48:50,9,0,Generic name


### 2. Performing RFM analysis

In [39]:
# Step 1: per-client RFM metrics and quantile scores.
scores = create_scores(frame_in=data, groups=n_groups, weights=alpha)
scores.head()

Unnamed: 0,client_id,first_purchase,last_purchase,months_since_last,recency,frequency,monetary,age,recency_value,frequency_value,monetary_value,age_value,score,group,tenure
0,19,2016-01-31 02:55:20,2016-01-31 02:55:20,44,5,5,4,5,44,44.0,729.749636,44,4.8,5545,44
1,81,2016-11-04 02:09:27,2016-11-04 02:09:27,35,5,5,4,4,35,35.0,785.687669,35,4.6,5544,35
2,40,2016-01-12 19:24:20,2016-01-12 19:24:20,45,5,5,3,5,45,45.0,520.64823,45,4.6,5535,45
3,27,2016-06-02 23:16:05,2016-06-02 23:16:05,40,5,5,3,4,40,40.0,706.499091,40,4.4,5534,40
4,36,2016-05-23 18:14:38,2016-05-23 18:14:38,41,5,5,2,5,41,41.0,342.721225,41,4.4,5525,41


In [40]:
# Step 2: tag every client with a segment.
# `scores` is rebound here, so re-running this cell on its own would feed an
# already segmented frame back into assign_segment.
scores = assign_segment(frame_in=scores)
scores.head()

Unnamed: 0,client_id,first_purchase,last_purchase,months_since_last,recency,frequency,monetary,age,recency_value,frequency_value,monetary_value,age_value,score,group,tenure,segment
0,19,2016-01-31 02:55:20,2016-01-31 02:55:20,44,5,5,4,5,44,44.0,729.749636,44,4.8,5545,44,segment_0
1,81,2016-11-04 02:09:27,2016-11-04 02:09:27,35,5,5,4,4,35,35.0,785.687669,35,4.6,5544,35,segment_1
2,40,2016-01-12 19:24:20,2016-01-12 19:24:20,45,5,5,3,5,45,45.0,520.64823,45,4.6,5535,45,segment_4
3,27,2016-06-02 23:16:05,2016-06-02 23:16:05,40,5,5,3,4,40,40.0,706.499091,40,4.4,5534,40,segment_5
4,36,2016-05-23 18:14:38,2016-05-23 18:14:38,41,5,5,2,5,41,41.0,342.721225,41,4.4,5525,41,segment_3


In [41]:
# Step 3: additional per-client variables (total sales).
other_vars = create_other_vars(frame_in=data)
other_vars.head()

Unnamed: 0,client_id,sales
0,0,1190.464816
1,1,828.68895
2,2,2471.773022
3,3,155.917252
4,4,713.827518


### 50. All in one run

In [42]:
# Same three steps as above, executed through the single entry point.
out_table = run_RFM_analysis(frame=data, n_groups=n_groups, alpha=alpha)
out_table.head()

Unnamed: 0,client_id,first_purchase,last_purchase,months_since_last,recency,frequency,monetary,age,recency_value,frequency_value,monetary_value,age_value,score,group,tenure,segment,sales
0,19,2016-01-31 02:55:20,2016-01-31 02:55:20,44,5,5,4,5,44,44.0,729.749636,44,4.8,5545,44,segment_1,729.749636
1,81,2016-11-04 02:09:27,2016-11-04 02:09:27,35,5,5,4,4,35,35.0,785.687669,35,4.6,5544,35,segment_2,785.687669
2,40,2016-01-12 19:24:20,2016-01-12 19:24:20,45,5,5,3,5,45,45.0,520.64823,45,4.6,5535,45,segment_2,520.64823
3,27,2016-06-02 23:16:05,2016-06-02 23:16:05,40,5,5,3,4,40,40.0,706.499091,40,4.4,5534,40,segment_3,706.499091
4,36,2016-05-23 18:14:38,2016-05-23 18:14:38,41,5,5,2,5,41,41.0,342.721225,41,4.4,5525,41,segment_5,342.721225


In [43]:
# Compact view of the combined table: one row per client, key columns only.
out_table.loc[
    :,
    ['score', 'client_id', 'tenure', 'last_purchase', 'months_since_last', 'sales', 'segment'],
].head()

Unnamed: 0,score,client_id,tenure,last_purchase,months_since_last,sales,segment
0,4.8,19,44,2016-01-31 02:55:20,44,729.749636,segment_1
1,4.6,81,35,2016-11-04 02:09:27,35,785.687669,segment_2
2,4.6,40,45,2016-01-12 19:24:20,45,520.64823,segment_2
3,4.4,27,40,2016-06-02 23:16:05,40,706.499091,segment_3
4,4.4,36,41,2016-05-23 18:14:38,41,342.721225,segment_5


### 80. For Dash

In [44]:
# Median raw RFM metrics per segment.
# The columns are selected before aggregating: taking the median of the whole
# frame would also include the date and text columns, which recent pandas
# versions refuse to aggregate.
table_3 = (
    scores
    .groupby('segment')[['recency_value', 'frequency_value', 'monetary_value']]
    .median()
    .reset_index()
)
table_3

Unnamed: 0,segment,recency_value,frequency_value,monetary_value
0,segment_0,24.0,20.25,500.61166
1,segment_1,18.0,16.5,570.637207
2,segment_2,20.0,18.0,675.685319
3,segment_3,24.0,20.0,368.71327
4,segment_4,20.0,21.0,667.898623
5,segment_5,22.0,22.0,706.499091
6,segment_6,12.0,12.0,779.664473
7,segment_7,11.5,9.0,836.3871
8,segment_8,31.0,31.0,410.535589


**Health chart**

In [45]:
# Share of total monetary value and share of clients held by each segment.
# 'monetary_value' is selected before summing so that date/text columns are
# never part of the aggregation (recent pandas versions raise on them).
health = (
    scores.groupby('segment')['monetary_value'].sum() / scores['monetary_value'].sum()
).to_frame('sales_share')
# value_counts is indexed by segment, so it aligns with `health` on assignment.
health['size'] = scores['segment'].value_counts(normalize=True)
health.reset_index()

Unnamed: 0,segment,sales_share,size
0,segment_0,0.160781,0.15625
1,segment_1,0.102845,0.140625
2,segment_2,0.129798,0.109375
3,segment_3,0.123991,0.140625
4,segment_4,0.12876,0.125
5,segment_5,0.195732,0.171875
6,segment_6,0.047691,0.046875
7,segment_7,0.035658,0.03125
8,segment_8,0.074743,0.078125


**Detailed lime**

In [46]:
# Put every client's raw metrics next to the medians of its segment
# (suffixes: *_client for the client, *_segment for the segment median) and
# store the client-minus-segment differences.
# `scores` is rebound here, so its columns differ before and after this cell.
scores = scores.merge(table_3, on='segment', how='left', suffixes=('_client', '_segment'))
scores = scores.assign(**{
    metric + '_diff': scores[metric + '_value_client'] - scores[metric + '_value_segment']
    for metric in ('recency', 'frequency', 'monetary')
})
scores.head()

Unnamed: 0,client_id,first_purchase,last_purchase,months_since_last,recency,frequency,monetary,age,recency_value_client,frequency_value_client,...,score,group,tenure,segment,recency_value_segment,frequency_value_segment,monetary_value_segment,recency_diff,frequency_diff,monetary_diff
0,19,2016-01-31 02:55:20,2016-01-31 02:55:20,44,5,5,4,5,44,44.0,...,4.8,5545,44,segment_0,24.0,20.25,500.61166,20.0,23.75,229.137976
1,81,2016-11-04 02:09:27,2016-11-04 02:09:27,35,5,5,4,4,35,35.0,...,4.6,5544,35,segment_1,18.0,16.5,570.637207,17.0,18.5,215.050463
2,40,2016-01-12 19:24:20,2016-01-12 19:24:20,45,5,5,3,5,45,45.0,...,4.6,5535,45,segment_4,20.0,21.0,667.898623,25.0,24.0,-147.250393
3,27,2016-06-02 23:16:05,2016-06-02 23:16:05,40,5,5,3,4,40,40.0,...,4.4,5534,40,segment_5,22.0,22.0,706.499091,18.0,18.0,0.0
4,36,2016-05-23 18:14:38,2016-05-23 18:14:38,41,5,5,2,5,41,41.0,...,4.4,5525,41,segment_3,24.0,20.0,368.71327,17.0,21.0,-25.992045


Unnamed: 0,factor,importance
0,recency_diff,20.0
1,frequency_diff,23.75
2,monetary_diff,229.137976


In [None]:
# Long-format factor/importance table for the first client (row label 0):
# one row per *_diff column. pd.melt needs the frame to reshape; calling it
# without arguments raises a TypeError.
pd.melt(
    scores.loc[[0], ['recency_diff', 'frequency_diff', 'monetary_diff']],
    var_name='factor',
    value_name='importance',
)

### 99. TODO 

In [155]:
# Remaining: cut extreme values (outliers)