<img src='business.jpg' />

# the probability transition matrix

With the probability transition matrix we can simulate how the segments will change in the future and maybe realize that we are dangerously reducing active customers in favor of inactive ones and that we need to acquire a slightly bigger number of each kind of customer to avoid a decrease in profits.<br>
In any case with this matrix we can simulate several years ahead and estimate how many customers will still be active. We can also estimate their value by multiplying the average value of each segment by the number of customers of the same segment in a specific year (year 0, year +1, year +2, etc.).

## Loading the required Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [2]:
data = pd.read_csv('purchases.txt', sep='\t', encoding='utf-8',header=None,  
                       usecols=[0, 1, 2], names=["customers_id", "purchase_amount", "date_of_purchase"], parse_dates=['date_of_purchase'])
data.head()

Unnamed: 0,customers_id,purchase_amount,date_of_purchase
0,760,25.0,2009-11-06
1,860,50.0,2012-09-28
2,1200,100.0,2005-10-25
3,1420,50.0,2009-07-09
4,1940,70.0,2013-01-25


In [3]:
data['year_of_purchase'] = data['date_of_purchase'].dt.year

# converting to datetime
time1 = pd.Timestamp('2016-01-01')

# And then we're going to compute something a bit specific. 
#We're going to compute the number of days that lapse between 
#January 1st, 2016 and the date of purchase in the data. 
data['days_since'] = (time1 - data['date_of_purchase']).dt.days

data.head()

Unnamed: 0,customers_id,purchase_amount,date_of_purchase,year_of_purchase,days_since
0,760,25.0,2009-11-06,2009,2247
1,860,50.0,2012-09-28,2012,1190
2,1200,100.0,2005-10-25,2005,3720
3,1420,50.0,2009-07-09,2009,2367
4,1940,70.0,2013-01-25,2013,1071


# Data Preparation

We are going to analysis the Customers based on below 3 factors:
- R (Recency): Number of days since last purchase
- F (Frequency): Number of tracsactions
- M (Monetary): Total amount of transactions (revenue contributed)

Criarei os clientes de 2015 simulando que estou usando funções de sql.

In [4]:
customers_2015 = pysqldf("select customers_id, min(days_since) as 'recêncy', max(days_since) as 'first_purchase', count(*) as 'frequency', avg(purchase_amount) as 'amount' from data group by 1")
customers_2015

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount
0,10,3829,3829,1,30.000000
1,80,343,3751,7,71.428571
2,90,758,3783,10,115.800000
3,120,1401,1401,1,20.000000
4,130,2970,3710,2,50.000000
...,...,...,...,...,...
18412,263820,1,1,1,10.000000
18413,263870,135,135,1,50.000000
18414,263880,34,34,1,20.000000
18415,263890,5,5,1,54.000000


In [5]:
# More complex 4-segment solution using which
def recency_lab(customers_2015) :
    
    if customers_2015["recêncy"] > 365*3 :
        return "inactive"
    elif (customers_2015["recêncy"] <= 365*3) & (customers_2015["recêncy"] >= 365*2 ):
        return "cold"
    elif (customers_2015["recêncy"] <= 365*2) & (customers_2015["recêncy"] > 365*1 ):
        return "warm"
    else:
        return 'active'
customers_2015["segment"] = customers_2015.apply(lambda customers_2015:recency_lab(customers_2015),
                                      axis = 1)
customers_2015.head()

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
0,10,3829,3829,1,30.0,inactive
1,80,343,3751,7,71.428571,active
2,90,758,3783,10,115.8,cold
3,120,1401,1401,1,20.0,inactive
4,130,2970,3710,2,50.0,inactive


In [6]:
def f(customers_2015):
    if customers_2015["recêncy"] > 365*3 :
        return "inactive"
    elif (customers_2015["recêncy"] <= 365*3) & (customers_2015["recêncy"] >= 365*2 ):
        return "cold"
    elif (customers_2015['segment'] == 'warm') & (customers_2015["first_purchase"] <= 365*2 ):
        return "new warm"
    elif (customers_2015['segment'] == 'warm') & (customers_2015["amount"] < 100 ):
        return "warm low value"
    elif (customers_2015['segment'] == 'warm') & (customers_2015["amount"] >= 100 ):
        return "warm high value"
    elif (customers_2015['segment'] == 'active') & (customers_2015["first_purchase"] <= 365 ):
        return "new active"
    elif (customers_2015['segment'] == 'active') & (customers_2015["amount"] < 100 ):
        return "active low value"
    else:
        
        return "active high value"
   

customers_2015['segment'] = customers_2015.apply(f, axis=1)
customers_2015.head()

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
0,10,3829,3829,1,30.0,inactive
1,80,343,3751,7,71.428571,active low value
2,90,758,3783,10,115.8,cold
3,120,1401,1401,1,20.0,inactive
4,130,2970,3710,2,50.0,inactive


Using the same technique as above I will create the 2014 customers.

In [7]:
# Compute recency, frequency, and average purchase amount
customers_2014 = pysqldf("select customers_id, min(days_since) - 365 as 'recêncy', max(days_since) - 365 as 'first_purchase', count(*) as 'frequency', avg(purchase_amount) as 'amount' from data where days_since > 365 group by 1")
customers_2014

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount
0,10,3464,3464,1,30.0
1,80,302,3386,6,70.0
2,90,393,3418,10,115.8
3,120,1036,1036,1,20.0
4,130,2605,3345,2,50.0
...,...,...,...,...,...
16900,235200,21,21,1,100.0
16901,235210,89,89,1,500.0
16902,236310,6,6,1,30.0
16903,236660,390,684,2,75.0


In [8]:
# More complex 4-segment solution using which
def recency_lab(customers_2014) :
    
    if customers_2014["recêncy"] > 365*3 :
        return "inactive"
    elif (customers_2014["recêncy"] <= 365*3) & (customers_2014["recêncy"] >= 365*2 ):
        return "cold"
    elif (customers_2014["recêncy"] <= 365*2) & (customers_2014["recêncy"] > 365*1 ):
        return "warm"
    else:
        return 'active'
customers_2014["segment"] = customers_2014.apply(lambda customers_2014:recency_lab(customers_2014),
                                      axis = 1)
customers_2014.head()

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
0,10,3464,3464,1,30.0,inactive
1,80,302,3386,6,70.0,active
2,90,393,3418,10,115.8,warm
3,120,1036,1036,1,20.0,cold
4,130,2605,3345,2,50.0,inactive


In [9]:
# Complete segment solution using which, and exploiting previous test as input
def f(customers_2014):
    if customers_2014["recêncy"] > 365*3 :
        return "inactive"
    elif (customers_2014["recêncy"] <= 365*3) & (customers_2014["recêncy"] >= 365*2 ):
        return "cold"
    elif (customers_2014['segment'] == 'warm') & (customers_2014["first_purchase"] <= 365*2 ):
        return "new warm"
    elif (customers_2014['segment'] == 'warm') & (customers_2014["amount"] < 100 ):
        return "warm low value"
    elif (customers_2014['segment'] == 'warm') & (customers_2014["amount"] >= 100 ):
        return "warm high value"
    elif (customers_2014['segment'] == 'active') & (customers_2014["first_purchase"] <= 365 ):
        return "new active"
    elif (customers_2014['segment'] == 'active') & (customers_2014["amount"] < 100 ):
        return "active low value"
    else:
        
        return "active high value"
   

customers_2014['segment'] = customers_2014.apply(f, axis=1)
customers_2014.head()

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
0,10,3464,3464,1,30.0,inactive
1,80,302,3386,6,70.0,active low value
2,90,393,3418,10,115.8,warm high value
3,120,1036,1036,1,20.0,cold
4,130,2605,3345,2,50.0,inactive


## COMPUTE TRANSITION MATRIX

First I’ll merge the 2015 and 2014 data

In [10]:
# Merge 2015 customers and 2014 customers
new_data = pd.merge_ordered(customers_2014, customers_2015, on='customers_id', how="outer")
new_data.head()

Unnamed: 0,customers_id,recêncy_x,first_purchase_x,frequency_x,amount_x,segment_x,recêncy_y,first_purchase_y,frequency_y,amount_y,segment_y
0,10,3464.0,3464.0,1.0,30.0,inactive,3829,3829,1,30.0,inactive
1,80,302.0,3386.0,6.0,70.0,active low value,343,3751,7,71.428571,active low value
2,90,393.0,3418.0,10.0,115.8,warm high value,758,3783,10,115.8,cold
3,120,1036.0,1036.0,1.0,20.0,cold,1401,1401,1,20.0,inactive
4,130,2605.0,3345.0,2.0,50.0,inactive,2970,3710,2,50.0,inactive


In [11]:
lista_order = ['inactive ',' cold ','warm high value ','warm low value','new warm ','active high value','active low value','new active ']

In [12]:
transition = pd.crosstab(new_data.segment_x, new_data.segment_y)
transition

segment_y,active high value,active low value,cold,inactive,new warm,warm high value,warm low value
segment_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
active high value,354,2,0,0,0,119,0
active low value,22,2088,0,0,0,0,901
cold,22,200,0,1931,0,0,0
inactive,35,250,0,7227,0,0,0
new active,89,410,0,0,938,0,0
new warm,15,96,1139,0,0,0,0
warm high value,35,1,75,0,0,0,0
warm low value,1,266,689,0,0,0,0


In [15]:
transition.loc[lista_order,lista_order]

segment_y,inactive,cold,warm high value,warm low value,new warm,active high value,active low value,new active
segment_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
inactive,,,,,,,,
cold,,,,,,,,
warm high value,,,,,,,,
warm low value,,,,0.0,,1.0,266.0,
new warm,,,,,,,,
active high value,,,,0.0,,354.0,2.0,
active low value,,,,901.0,,22.0,2088.0,
new active,,,,,,,,
