<img src='predict.jpg' />

In [45]:
# Import Libraries
import pandas as pd
import numpy as np
import datetime as dt
# Importing library pandassql
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query string `q` through pandasql's `sqldf`.

    The second argument handed to `sqldf` is this module's `globals()`,
    so the query is evaluated against the notebook's global namespace.
    Returns whatever `sqldf` returns.
    """
    return sqldf(q, globals())

In [46]:
# Load the purchases file: tab-separated, no header row.
# Only the first three columns are read; they are named here, and
# `date_of_purchase` is parsed as a date while reading.
data = pd.read_csv(
    'purchases.txt',
    sep='\t',
    encoding='utf-8',
    header=None,
    usecols=[0, 1, 2],
    names=["customers_id", "purchase_amount", "date_of_purchase"],
    parse_dates=['date_of_purchase'],
)

# Preview the first rows
data.head()

Unnamed: 0,customers_id,purchase_amount,date_of_purchase
0,760,25.0,2009-11-06
1,860,50.0,2012-09-28
2,1200,100.0,2005-10-25
3,1420,50.0,2009-07-09
4,1940,70.0,2013-01-25


In [47]:
# Reference date against which the age of every purchase is measured
time1 = pd.Timestamp('2016-01-01')

# Derive two columns from `date_of_purchase` in a single step:
#   year_of_purchase - calendar year of the purchase
#   days_since       - whole days elapsed between the purchase and
#                      the reference date (January 1st, 2016)
data = data.assign(
    year_of_purchase=data['date_of_purchase'].dt.year,
    days_since=(time1 - data['date_of_purchase']).dt.days,
)

# Preview the first rows with the new columns
data.head()

Unnamed: 0,customers_id,purchase_amount,date_of_purchase,year_of_purchase,days_since
0,760,25.0,2009-11-06,2009,2247
1,860,50.0,2012-09-28,2012,1190
2,1200,100.0,2005-10-25,2005,3720
3,1420,50.0,2009-07-09,2009,2367
4,1940,70.0,2013-01-25,2013,1071


# Data Preparation

We are going to analyze the customers based on the 3 factors below:
- R (Recency): Number of days since last purchase
- F (Frequency): Number of transactions
- M (Monetary): Total amount of transactions (revenue contributed)

In [48]:
# Compute RFM variables as of a year ago.
# Only rows with days_since > 365 are kept, and 365 is subtracted from the
# min/max of days_since so recency and first purchase are expressed relative
# to one year before the reference date. Results are grouped by the first
# selected column (customers_id).
# The adjacent string literals below are concatenated into one query string.
customers_2014 = pysqldf(
    "select customers_id, "
    "min(days_since) - 365 as 'recêncy', "
    "max(days_since) - 365 as 'first_purchase', "
    "count(*) as 'frequency', "
    "avg(purchase_amount) as 'amount', "
    "max(purchase_amount) as 'max_amount' "
    "from data "
    "where days_since > 365 "
    "group by 1"
)
customers_2014

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,max_amount
0,10,3464,3464,1,30.0,30.0
1,80,302,3386,6,70.0,80.0
2,90,393,3418,10,115.8,153.0
3,120,1036,1036,1,20.0,20.0
4,130,2605,3345,2,50.0,60.0
...,...,...,...,...,...,...
16900,235200,21,21,1,100.0,100.0
16901,235210,89,89,1,500.0,500.0
16902,236310,6,6,1,30.0,30.0
16903,236660,390,684,2,75.0,100.0


In [49]:
# Compute revenues generated by customers in 2015:
# sum of purchase_amount per customers_id over rows whose
# year_of_purchase equals 2015.
# The adjacent string literals below are concatenated into one query string.
revenue_2015 = pysqldf(
    "select customers_id, "
    "sum(purchase_amount) as 'revenue_2015' "
    "from data "
    "where year_of_purchase = 2015 "
    "group by 1"
)
revenue_2015

Unnamed: 0,customers_id,revenue_2015
0,80,80.0
1,480,45.0
2,830,50.0
3,850,60.0
4,860,60.0
...,...,...
5393,263820,10.0
5394,263870,50.0
5395,263880,20.0
5396,263890,54.0


In [57]:
# Build the in-sample table in one chain:
#   1. left-join the 2015 revenue onto the 2014 customers, so every 2014
#      customer is kept even without a matching 2015 revenue row
#   2. missing revenue (NaN after the left join) becomes 0
#   3. active_2015 is 1 when revenue_2015 is above zero, otherwise 0
in_sample = (
    customers_2014
    .merge(revenue_2015, on='customers_id', how="left")
    .assign(revenue_2015=lambda frame: frame['revenue_2015'].fillna(0))
    .assign(active_2015=lambda frame: frame['revenue_2015'].gt(0).astype('int64'))
)

# Results
in_sample

Unnamed: 0,customers_id,recêncy,first_purchase,frequency,amount,max_amount,revenue_2015,active_2015
0,10,3464,3464,1,30.0,30.0,0.0,0
1,80,302,3386,6,70.0,80.0,80.0,1
2,90,393,3418,10,115.8,153.0,0.0,0
3,120,1036,1036,1,20.0,20.0,0.0,0
4,130,2605,3345,2,50.0,60.0,0.0,0
...,...,...,...,...,...,...,...,...
16900,235200,21,21,1,100.0,100.0,0.0,0
16901,235210,89,89,1,500.0,500.0,0.0,0
16902,236310,6,6,1,30.0,30.0,0.0,0
16903,236660,390,684,2,75.0,100.0,100.0,1
