In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scoring Customers

## Prepare Data For Scoring

In [90]:
def load_sales_data(path='purchases.txt'):
    """Read the purchase log into a DataFrame.

    The file is read as tab-separated text without a header row; its three
    columns are named ``customer_id``, ``purchase_amount`` and
    ``date_of_purchase`` (the last one parsed as dates).

    Parameters
    ----------
    path : str or path-like, optional
        Location of the purchase file. Defaults to ``'purchases.txt'`` so
        existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The three columns above plus ``year_of_purchase``, the calendar year
        taken from ``date_of_purchase``.
    """
    column_names = ['customer_id', 'purchase_amount', 'date_of_purchase']
    df = pd.read_csv(
        path,
        header=None,
        sep='\t',
        names=column_names,
        parse_dates=['date_of_purchase'],
    )
    df['year_of_purchase'] = df.date_of_purchase.dt.year
    return df

def retro(sales_data, retrospective_date):
    """Return the purchases made strictly before ``retrospective_date``.

    The result is a copy of the matching rows (the input frame is not
    modified) with an extra ``days_since`` column: the whole number of days
    between each purchase and the retrospective date.
    """
    cutoff = pd.to_datetime(retrospective_date)
    made_before_cutoff = sales_data.date_of_purchase < cutoff
    history = sales_data[made_before_cutoff].copy()
    elapsed = cutoff - history.date_of_purchase
    history['days_since'] = elapsed.dt.days
    return history

def rfm(sales_df,year=None):
    """Aggregate purchases into one row of RFM-style features per customer.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Needs the columns ``customer_id``, ``purchase_amount`` and
        ``days_since``.
    year : optional
        Label for this snapshot. When it is not ``None`` it is written to a
        constant ``year`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_id`` with, in order:
        ``recency`` (smallest ``days_since``), ``first_purchase`` (largest
        ``days_since``), ``frequency`` (number of purchases), ``avg_amount``
        and ``max_amount`` (mean and maximum ``purchase_amount``), followed
        by ``year`` when a label was given.
    """
    per_customer = sales_df.groupby(by='customer_id')
    days = per_customer.days_since
    amounts = per_customer.purchase_amount
    features = pd.DataFrame({
        'recency' : days.min(),
        'first_purchase' : days.max(),
        'frequency' : days.count(),
        'avg_amount' : amounts.mean(),
        'max_amount' : amounts.max()},
        columns= ['recency','first_purchase','frequency','avg_amount','max_amount'])
    # Test against None rather than truthiness so a falsy label such as 0
    # still produces the 'year' column.
    if year is not None:
        features['year'] = year
    return features

def rfm_panel(sales_df):
    """Stack one RFM snapshot per purchase year into a single panel.

    For every distinct year found in ``date_of_purchase`` a snapshot is taken
    at the start of the following year: purchases before that date are
    selected with ``retro`` and summarised with ``rfm``, labelled with the
    purchase year. The snapshots are concatenated and returned indexed by
    ``(customer_id, year)``, sorted by that index.
    """
    purchase_years = np.sort(sales_df.date_of_purchase.dt.year.unique())
    # '%Y' turns each "year + 1" into a timestamp for that year.
    cutoffs = pd.to_datetime(purchase_years + 1, format='%Y')

    snapshots = []
    for cutoff in cutoffs:
        history = retro(sales_df, cutoff)
        snapshots.append(rfm(history, cutoff.year - 1))

    panel = pd.concat(snapshots)
    panel = panel.set_index('year', append=True)
    return panel.sort_index()

In [51]:
# Load the purchase records and preview the first six rows.
sales_df = load_sales_data()
sales_df.head(6)

Unnamed: 0,customer_id,purchase_amount,date_of_purchase,year_of_purchase
0,760,25.0,2009-11-06,2009
1,860,50.0,2012-09-28,2012
2,1200,100.0,2005-10-25,2005
3,1420,50.0,2009-07-09,2009
4,1940,70.0,2013-01-25,2013
5,1960,40.0,2013-10-29,2013


In [91]:
# Build the (customer_id, year) RFM panel and preview the first 22 rows.
customers = rfm_panel(sales_df)
customers.head(22)

Unnamed: 0_level_0,Unnamed: 1_level_0,recency,first_purchase,frequency,avg_amount,max_amount
customer_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,2005,177,177,1,30.0,30.0
10,2006,542,542,1,30.0,30.0
10,2007,907,907,1,30.0,30.0
10,2008,1273,1273,1,30.0,30.0
10,2009,1638,1638,1,30.0,30.0
10,2010,2003,2003,1,30.0,30.0
10,2011,2368,2368,1,30.0,30.0
10,2012,2734,2734,1,30.0,30.0
10,2013,3099,3099,1,30.0,30.0
10,2014,3464,3464,1,30.0,30.0


In [196]:
# Total revenue per customer per calendar year, indexed like `customers`.
g = sales_df.groupby(by=['customer_id','year_of_purchase'])
revenues = pd.DataFrame({'revenue' :g.purchase_amount.sum()})
revenues.index = revenues.index.set_names(['customer_id','year'])

# Remove a 'revenue' column left by an earlier run so the join below does not
# collide with it when this cell is re-executed.
if 'revenue' in customers.columns:
    customers = customers.drop('revenue',axis=1)
customers = customers.join(revenues)

# Customer-years without any purchase have no row in `revenues`; count them as
# zero revenue. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on `customers.revenue` acts on a temporary object under
# pandas Copy-on-Write and would leave `customers` unchanged.
customers['revenue'] = customers['revenue'].fillna(0)

# Revenue of the following panel row for the same customer; NaN on each
# customer's last row, where no following year is available.
customers['next_year_revenue'] = customers.groupby(level='customer_id').revenue.shift(-1)

# 1 = bought next year, 0 = did not, NaN = next year not observed.
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0.)
customers['next_year_active'] = np.nan
customers.loc[customers.next_year_revenue > 0,'next_year_active'] = 1
customers.loc[customers.next_year_revenue == 0,'next_year_active'] = 0
customers.head(22)

Unnamed: 0_level_0,Unnamed: 1_level_0,recency,first_purchase,frequency,avg_amount,max_amount,next_year_revenue,next_year_active,revenue
customer_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10,2005,177,177,1,30.0,30.0,0.0,0.0,30.0
10,2006,542,542,1,30.0,30.0,0.0,0.0,0.0
10,2007,907,907,1,30.0,30.0,0.0,0.0,0.0
10,2008,1273,1273,1,30.0,30.0,0.0,0.0,0.0
10,2009,1638,1638,1,30.0,30.0,0.0,0.0,0.0
10,2010,2003,2003,1,30.0,30.0,0.0,0.0,0.0
10,2011,2368,2368,1,30.0,30.0,0.0,0.0,0.0
10,2012,2734,2734,1,30.0,30.0,0.0,0.0,0.0
10,2013,3099,3099,1,30.0,30.0,0.0,0.0,0.0
10,2014,3464,3464,1,30.0,30.0,0.0,0.0,0.0
