In [364]:
import pandas as pd
import numpy as np
import datetime as dt

In [365]:
data = pd.read_csv('purchases.txt', sep='\t', encoding='utf-8',header=None,  
                       usecols=[0, 1, 2], names=["customers_id", "purchase_amount", "date_of_purchase"], parse_dates=['date_of_purchase'])
data.head()

Unnamed: 0,customers_id,purchase_amount,date_of_purchase
0,760,25.0,2009-11-06
1,860,50.0,2012-09-28
2,1200,100.0,2005-10-25
3,1420,50.0,2009-07-09
4,1940,70.0,2013-01-25


In [366]:
data['year_of_purchase'] = data['date_of_purchase'].dt.year

# converting to datetime
time1 = pd.Timestamp('2016-01-01')

# And then we're going to compute something a bit specific. 
#We're going to compute the number of days that lapse between 
#January 1st, 2016 and the date of purchase in the data. 
data['days_since'] = (time1 - data['date_of_purchase']).dt.days

data.head()

Unnamed: 0,customers_id,purchase_amount,date_of_purchase,year_of_purchase,days_since
0,760,25.0,2009-11-06,2009,2247
1,860,50.0,2012-09-28,2012,1190
2,1200,100.0,2005-10-25,2005,3720
3,1420,50.0,2009-07-09,2009,2367
4,1940,70.0,2013-01-25,2013,1071


# Data Preparation

We are going to analysis the Customers based on below 3 factors:
- R (Recency): Number of days since last purchase
- F (Frequency): Number of tracsactions
- M (Monetary): Total amount of transactions (revenue contributed)

In [367]:
# Importing library pandassql
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [368]:
customers_2015 = pysqldf("select customers_id, min(days_since) as 'recêncy', max(days_since) as 'first_purchase', count(*) as 'frequency', avg(purchase_amount) as 'amount' from data group by 1")
customers_2015

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount
0,10,3829,3829,1,30.000000
1,80,343,3751,7,71.428571
2,90,758,3783,10,115.800000
3,120,1401,1401,1,20.000000
4,130,2970,3710,2,50.000000
...,...,...,...,...,...
18412,263820,1,1,1,10.000000
18413,263870,135,135,1,50.000000
18414,263880,34,34,1,20.000000
18415,263890,5,5,1,54.000000


In [369]:
customers_2015.to_csv('customers_2015.csv')

# MANAGERIAL SEGMENTATION

In [370]:
# More complex 4-segment solution using which
def recency_lab(customers_2015) :
    
    if customers_2015["recêncy"] > 365*3 :
        return "inactive"
    elif (customers_2015["recêncy"] <= 365*3) & (customers_2015["recêncy"] >= 365*2 ):
        return "cold"
    elif (customers_2015["recêncy"] <= 365*2) & (customers_2015["recêncy"] > 365*1 ):
        return "warm"
    else:
        return 'active'
customers_2015["segment"] = customers_2015.apply(lambda customers_2015:recency_lab(customers_2015),
                                      axis = 1)
customers_2015.head()

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
0,10,3829,3829,1,30.0,inactive
1,80,343,3751,7,71.428571,active
2,90,758,3783,10,115.8,cold
3,120,1401,1401,1,20.0,inactive
4,130,2970,3710,2,50.0,inactive


In [371]:
# Quantity of NA and Inactive
customers_2015['segment'].value_counts()

inactive    9158
active      5398
warm        1958
cold        1903
Name: segment, dtype: int64

def f(customers_2015):
    if customers_2015["recêncy"] > 365*3 :
        return "inactive"
    elif (customers_2015["recêncy"] <= 365*3) & (customers_2015["recêncy"] >= 365*2 ):
        return "cold"
    elif (customers_2015[customers_2015['segment'] == 'warm']) & (customers_2015["first_purchase"] <= 365*2 ):
        return "new warm"
    elif (customers_2015[customers_2015['segment'] == 'warm']) & (customers_2015["amount"] < 100 ):
        return "warm low value"
    elif (customers_2015[customers_2015['segment'] == 'warm']) & (customers_2015["amount"] >= 100 ):
        return "warm high value"
    elif (customers_2015[customers_2015['segment'] == 'active']) & (customers_2015["first_purchase"] <= 365 ):
        return "new active"
    elif (customers_2015[customers_2015['segment'] == 'active']) & (customers_2015["amount"] < 100 ):
        return "active low value"
    else:
        
        return "active high value"
   

customers_2015['segment'] = customers_2015.apply(f, axis=1)
customers_2015.head()

In [372]:
active = customers_2015[customers_2015['segment'] == 'active']
active.head()

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
1,80,343,3751,7,71.428571,active
17,480,21,3678,12,60.833333,active
29,830,321,3739,7,48.571429,active
30,850,24,3416,10,28.5,active
31,860,237,4008,10,54.0,active


In [373]:
def f(active):
    if active["recêncy"] > 365*3 :
        return "inactive"
    elif (active["recêncy"] <= 365*3) & (active["recêncy"] >= 365*2 ):
        return "cold"
    
    elif active["first_purchase"] <= 365 :
        return "new active"
    elif active["amount"] < 100 :
        return "active low value"
    else:
        return "active high value"
   
active['segment'] = active.apply(f, axis=1)
active.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
1,80,343,3751,7,71.428571,active low value
17,480,21,3678,12,60.833333,active low value
29,830,321,3739,7,48.571429,active low value
30,850,24,3416,10,28.5,active low value
31,860,237,4008,10,54.0,active low value


In [374]:
# Quantity of NA and Inactive
active['segment'].value_counts()

active low value     3313
new active           1512
active high value     573
Name: segment, dtype: int64

In [375]:
warm = customers_2015[customers_2015['segment'] == 'warm']
warm.head()

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
9,240,463,3815,4,16.25,warm
44,1290,367,3689,5,53.0,warm
47,1380,616,3717,4,62.5,warm
48,1410,368,3653,5,270.0,warm
55,1620,588,3745,2,25.0,warm


In [376]:
# Quantity of NA and Inactive
warm['segment'].value_counts()

warm    1958
Name: segment, dtype: int64

In [377]:
def g(warm):
    if warm["recêncy"] > 365*3 :
        return "inactive"
    elif (warm["recêncy"] <= 365*3) & (warm["recêncy"] >= 365*2 ):
        return "cold"
    
    elif warm["first_purchase"] <= 365*2 :
        return "new warm "
    elif warm["amount"] < 100 :
        return "warm low value"
    else:
        return "warm high value"
   
warm['segment'] = warm.apply(g, axis=1)
warm.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
9,240,463,3815,4,16.25,warm low value
44,1290,367,3689,5,53.0,warm low value
47,1380,616,3717,4,62.5,warm low value
48,1410,368,3653,5,270.0,warm high value
55,1620,588,3745,2,25.0,warm low value


In [378]:
# Quantity of NA and Inactive
warm['segment'].value_counts()

new warm           938
warm low value     901
warm high value    119
Name: segment, dtype: int64

In [379]:
subgroup_active_warm = pd.concat([customers_2015,active,warm])
subgroup_active_warm

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,segment
0,10,3829,3829,1,30.000000,inactive
1,80,343,3751,7,71.428571,active
2,90,758,3783,10,115.800000,cold
3,120,1401,1401,1,20.000000,inactive
4,130,2970,3710,2,50.000000,inactive
...,...,...,...,...,...,...
16956,235060,368,368,1,50.000000,new warm
16966,235200,386,386,1,100.000000,new warm
16967,235210,454,454,1,500.000000,new warm
17031,236310,371,371,1,30.000000,new warm


In [380]:
subgroup_active_warm = subgroup_active_warm[subgroup_active_warm.segment != "active"]
subgroup_active_warm = subgroup_active_warm[subgroup_active_warm.segment != "warm"]

In [381]:
customers_2015 = subgroup_active_warm.copy()
customers_2015['segment'].value_counts()

inactive             9158
active low value     3313
cold                 1903
new active           1512
new warm              938
warm low value        901
active high value     573
warm high value       119
Name: segment, dtype: int64