In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster

## Load and Visualize Dataset

In [2]:
# Load the tab-separated, header-less purchase log. Column position 2
# (date_of_purchase) is parsed to datetime while reading.
df = pd.read_csv(
    'purchases.txt',
    sep='\t',
    header=None,
    names=['customer_id', 'purchase_amount', 'date_of_purchase'],
    parse_dates=[2],
)
# Calendar year of each purchase; the per-year summaries group on this column.
df = df.assign(year_of_purchase=df['date_of_purchase'].dt.year)
df.head(5)

Unnamed: 0,customer_id,purchase_amount,date_of_purchase,year_of_purchase
0,760,25.0,2009-11-06,2009
1,860,50.0,2012-09-28,2012
2,1200,100.0,2005-10-25,2005
3,1420,50.0,2009-07-09,2009
4,1940,70.0,2013-01-25,2013


### Summarize Purchase Amounts by Years

In [3]:
dfg = pd.pivot_table(
    df,
    index='year_of_purchase',
    values=['purchase_amount'],
    aggfunc={'purchase_amount' :[np.size, np.average, np.sum]})
dfg

Unnamed: 0_level_0,purchase_amount,purchase_amount,purchase_amount
Unnamed: 0_level_1,size,average,sum
year_of_purchase,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2005,1470.0,55.826122,82064.4
2006,2182.0,52.250454,114010.49
2007,4674.0,49.264014,230260.0
2008,4331.0,53.07178,229853.88
2009,5054.0,50.745253,256466.51
2010,4939.0,58.740012,290116.92
2011,4785.0,63.5193,303939.85
2012,5960.0,62.913312,374963.34
2013,5912.0,67.931357,401610.18
2014,5739.0,75.390347,432665.2


In [4]:
dfg = dfg['purchase_amount'].rename(columns={'size':'counter','average':'avg_amount','sum' : 'sum_amount'})
dfg

Unnamed: 0_level_0,counter,avg_amount,sum_amount
year_of_purchase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005,1470.0,55.826122,82064.4
2006,2182.0,52.250454,114010.49
2007,4674.0,49.264014,230260.0
2008,4331.0,53.07178,229853.88
2009,5054.0,50.745253,256466.51
2010,4939.0,58.740012,290116.92
2011,4785.0,63.5193,303939.85
2012,5960.0,62.913312,374963.34
2013,5912.0,67.931357,401610.18
2014,5739.0,75.390347,432665.2


In [5]:
dfg.plot(kind='bar',subplots=True,figsize=(8,10),title='Purchase Statistics by Year')

<IPython.core.display.Javascript object>

array([<matplotlib.axes._subplots.AxesSubplot object at 0x113fc4f60>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1152e1b00>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1154dc198>], dtype=object)

## RFM Segmentation

### Recency Frequency Money Calculations

In [6]:
df['days_since'] = (df.date_of_purchase.max() - df.date_of_purchase).dt.days + 1
df.days_since.describe()

count    51243.000000
mean      1631.939309
std       1061.076889
min          1.000000
25%        733.000000
50%       1500.000000
75%       2540.000000
max       4016.000000
Name: days_since, dtype: float64

In [7]:
# Per-customer RFM table:
#   recency   - smallest days_since (days since the most recent purchase)
#   frequency - number of purchase rows
#   amount    - mean purchase_amount
g = df.groupby(by='customer_id')
dfg = pd.DataFrame(
    {
        'recency': g.days_since.min(),
        'frequency': g.days_since.count(),
        'amount': g.purchase_amount.mean(),
    },
    columns=['recency', 'frequency', 'amount'],  # fix the column order explicitly
)
dfg.head()

Unnamed: 0_level_0,recency,frequency,amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,3829,1,30.0
80,343,7,71.428571
90,758,10,115.8
120,1401,1,20.0
130,2970,2,50.0


In [8]:
dfg.describe()

Unnamed: 0,recency,frequency,amount
count,18417.0,18417.0,18417.0
mean,1253.0379,2.782375,57.792985
std,1081.437868,2.936888,154.360109
min,1.0,1.0,5.0
25%,244.0,1.0,21.666667
50%,1070.0,2.0,30.0
75%,2130.0,3.0,50.0
max,4014.0,45.0,4500.0


In [9]:
dfg.hist('recency')

<IPython.core.display.Javascript object>

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x117938cc0>]], dtype=object)

In [10]:
dfg.hist('frequency')

<IPython.core.display.Javascript object>

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x117ad2908>]], dtype=object)

In [11]:
dfg.hist(column='amount',bins=100)


<IPython.core.display.Javascript object>

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x117b494a8>]], dtype=object)

### Standard Scaling and Dispersion Adjustments

In [12]:
# X is the RFM table with `amount` replaced by its natural logarithm;
# recency and frequency are left on their original scale.
X = dfg.assign(amount=np.log(dfg.amount))

# Standardize every column and wrap the resulting array back into a DataFrame
# that keeps the customer_id index and the column names.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
)
X_scaled.describe()

Unnamed: 0,recency,frequency,amount
count,18417.0,18417.0,18417.0
mean,-5.34185e-15,2.472511e-15,-1.983777e-15
std,1.000027,1.000027,1.000027
min,-1.157784,-0.6069088,-2.570124
25%,-0.9330775,-0.6069088,-0.6597654
50%,-0.1692588,-0.2664031,-0.2358019
75%,0.8109443,0.07410256,0.4297068
max,2.553117,14.37534,6.292104


### Reduce Dataset by Resampling

In [13]:
# Deterministic down-sampling: keep every 10th customer by index position.
# (An earlier variant of this cell drew a random tenth with sklearn's
# `resample(..., replace=False, random_state=10)` instead.)
sample = X_scaled.index[::10]
X_sample = X.loc[sample].copy()
X_sample.describe()

Unnamed: 0,recency,frequency,amount
count,1842.0,1842.0,1842.0
mean,1260.163952,2.80456,3.566935
std,1085.954407,2.943026,0.74931
min,1.0,1.0,1.609438
25%,239.0,1.0,3.075775
50%,1096.0,2.0,3.401197
75%,2123.0,3.0,3.912023
max,3908.0,28.0,7.703137


In [14]:
X_scaled_sample = X_scaled.loc[sample].copy()
X_scaled_sample.describe()

Unnamed: 0,recency,frequency,amount
count,1842.0,1842.0,1842.0
mean,0.00659,0.007554,-0.019878
std,1.004204,1.002117,0.976208
min,-1.157784,-0.606909,-2.570124
25%,-0.937701,-0.606909,-0.659765
50%,-0.145216,-0.266403,-0.235802
75%,0.804471,0.074103,0.429707
max,2.455096,8.586745,5.368807


### Hierarchical Cluster

In [15]:
# https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
# Agglomerative (hierarchical) clustering of the standardized sample using
# Ward linkage; Z is the resulting linkage matrix.
Z = linkage(X_scaled_sample,'ward')

# Plot the dendrogram. It is condensed with truncate_mode='lastp' and p=30,
# so this is not the full tree down to individual customers.
plt.figure(figsize=(12, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    truncate_mode='lastp',
    p=30,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
# Black horizontal reference line at distance 16.
# NOTE(review): presumably the visually chosen cut height — confirm it is
# consistent with the number of clusters requested from fcluster.
plt.axhline(y=16,c='k')
plt.show()

<IPython.core.display.Javascript object>

In [16]:
# Cut the Ward linkage tree into at most k flat clusters. `members` holds one
# cluster label per row of X_scaled_sample, in the same order as `sample`.
k = 9
members = fcluster(Z, k, criterion='maxclust')

# Attach the labels to the *unscaled* RFM values of the sampled customers so
# the cluster profiles are readable in original units. The explicit .copy()
# (as in the earlier sampling cells) makes sure the new column is written to
# an independent frame rather than to a selection of dfg.
X_sample = dfg.loc[sample].copy()
X_sample['cluster'] = members

# Mean recency / frequency / amount per cluster, plus the cluster size.
X_cluster = pd.pivot_table(X_sample, index=['cluster'], aggfunc=[np.mean])
X_cluster['count'] = X_sample.groupby(by='cluster').amount.size()
X_cluster[['count', 'mean']]

Unnamed: 0_level_0,count,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,amount,frequency,recency
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,212,16.263365,1.311321,2684.292453
2,309,37.784736,1.28479,2563.236246
3,130,42.015211,10.615385,193.646154
4,306,41.106405,2.398693,162.104575
5,317,22.43357,1.280757,922.829653
6,236,29.854206,4.466102,1143.694915
7,83,261.219139,6.578313,448.578313
8,49,214.861973,1.857143,2567.040816
9,200,87.429167,1.4,799.34


In [17]:
X_cluster.plot.bar(subplots=True,title='Recency Frequency Money Stats by Cluster',figsize=(10,15))

<IPython.core.display.Javascript object>

array([<matplotlib.axes._subplots.AxesSubplot object at 0x119d4efd0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x11a01a9e8>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x11a062eb8>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x11a09e6a0>], dtype=object)

## Managerial Segmentation

In [18]:
df['days_since'] = (df.date_of_purchase.max() - df.date_of_purchase).dt.days + 1
g = df.groupby(by="customer_id")
customers_2015 = pd.DataFrame({
    'recency' : g.days_since.min(), 
    'first_purchase' : g.days_since.max(),
    'frequency' : g.days_since.count(), 
    'amount' : g.purchase_amount.mean()}, 
    columns= ['recency','first_purchase','frequency','amount']
    )

In [19]:
customers_2015.describe()

Unnamed: 0,recency,first_purchase,frequency,amount
count,18417.0,18417.0,18417.0,18417.0
mean,1253.0379,1984.009882,2.782375,57.792985
std,1081.437868,1133.405441,2.936888,154.360109
min,1.0,1.0,1.0,5.0
25%,244.0,988.0,1.0,21.666667
50%,1070.0,2087.0,2.0,30.0
75%,2130.0,2992.0,3.0,50.0
max,4014.0,4016.0,45.0,4500.0


### Simple 2-segment solution based upon recency alone

In [20]:
customers_2015['segment'] = np.where(customers_2015.recency > 365*3, 'inactive', 'NA')
customers_2015.segment.value_counts()

NA          9259
inactive    9158
Name: segment, dtype: int64

In [21]:
customers_2015.groupby(by='segment').mean()

Unnamed: 0_level_0,recency,first_purchase,frequency,amount
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,338.055946,1427.983584,3.739713,67.367605
inactive,2178.110832,2546.168377,1.814479,48.112771


### Simple multi-segment segmentation

In [22]:
customers_2015['segment'] = 'NA'
customers_2015.loc[customers_2015.recency > 365*3,'segment'] = 'inactive'
customers_2015.loc[(customers_2015.recency <= 365*3) & (customers_2015.recency > 365*2),'segment'] = 'cold'
customers_2015.loc[(customers_2015.recency <= 365*2) & (customers_2015.recency > 365*1),'segment'] = 'warm'
customers_2015.loc[customers_2015.recency <= 365*1,'segment'] = 'active'
customers_2015.segment.value_counts()

inactive    9158
active      5398
warm        1958
cold        1903
Name: segment, dtype: int64

In [23]:
customers_2015.groupby(by='segment').mean()

Unnamed: 0_level_0,recency,first_purchase,frequency,amount
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
active,99.740645,1465.843461,4.560763,72.080944
cold,857.781398,1432.117183,2.303205,51.739893
inactive,2178.110832,2546.168377,1.814479,48.112771
warm,489.939734,1319.590398,2.872319,69.562155


### Actual segmentation

In [24]:
def segment(customers):
    """Label every row of ``customers`` with a managerial segment.

    The frame is modified in place (a ``segment`` column is added or
    overwritten) and is also returned, so both ``segment(frame)`` and
    ``frame = segment(frame)`` work.

    Rows are first tiered by ``recency``:

    * more than 3 years  -> 'inactive'
    * 2 to 3 years       -> 'cold'
    * 1 to 2 years       -> 'warm'
    * at most 1 year     -> 'active'

    'warm' and 'active' rows are then refined. A row whose ``first_purchase``
    lies within the tier's window (2 years for warm, 1 year for active)
    becomes 'new warm' / 'new active'; the remaining rows are split on
    ``amount`` into '<tier> low value' (< 100) and '<tier> high value'
    (>= 100).

    Parameters
    ----------
    customers : pandas.DataFrame
        Must provide ``recency``, ``first_purchase`` and ``amount`` columns.
        ``recency`` and ``first_purchase`` are compared against multiples of
        365, i.e. they are treated as day counts.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``segment`` converted by
        ``order_segments``.
    """
    year = 365               # one year, in days
    high_value_amount = 100  # amount at or above which a customer is "high value"

    # Placeholder label; any row that matches none of the rules below keeps it.
    customers['segment'] = 'NA'

    # Tier by recency.
    recency = customers.recency
    customers.loc[recency > 3 * year, 'segment'] = 'inactive'
    customers.loc[(recency <= 3 * year) & (recency > 2 * year), 'segment'] = 'cold'
    customers.loc[(recency <= 2 * year) & (recency > year), 'segment'] = 'warm'
    customers.loc[recency <= year, 'segment'] = 'active'

    # Refine warm and active into (new, low value, high value). "New" takes
    # precedence: the value split only applies to rows not flagged as new.
    for tier, new_within in (('warm', 2 * year), ('active', year)):
        in_tier = customers.segment == tier
        is_new = in_tier & (customers.first_purchase <= new_within)
        established = in_tier & ~is_new
        customers.loc[is_new, 'segment'] = 'new ' + tier
        customers.loc[established & (customers.amount < high_value_amount),
                      'segment'] = tier + ' low value'
        customers.loc[established & (customers.amount >= high_value_amount),
                      'segment'] = tier + ' high value'

    customers['segment'] = order_segments(customers.segment)
    return customers

def order_segments(s):
    """Convert a Series of segment labels to an ordered categorical.

    The categories are fixed to the eight managerial segments, ordered from
    least to most recently engaged, so sorting and group-by output follow that
    order instead of alphabetical order. Any value that is not one of the
    eight labels (for example the 'NA' placeholder) becomes NaN.

    Parameters
    ----------
    s : pandas.Series
        Segment labels.

    Returns
    -------
    pandas.Series
        Same index as ``s``, with ordered categorical dtype.
    """
    segment_order = [
        "inactive",
        "cold",
        "warm high value", "warm low value", "new warm",
        "active high value", "active low value", "new active",
    ]
    # Series.astype("category", categories=..., ordered=...) is not accepted
    # by current pandas; setting the categories through the .cat accessor
    # yields the same result and works on old and new versions alike.
    return s.astype("category").cat.set_categories(segment_order, ordered=True)

In [25]:
segment(customers_2015)
customers_2015.groupby(by='segment').segment.count()

segment
inactive             9158
cold                 1903
warm high value       119
warm low value        901
new warm              938
active high value     573
active low value     3313
new active           1512
Name: segment, dtype: int64

In [26]:
customers_2015.groupby(by='segment').mean()

Unnamed: 0_level_0,recency,first_purchase,frequency,amount
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
inactive,2178.110832,2546.168377,1.814479,48.112771
cold,857.781398,1432.117183,2.303205,51.739893
warm high value,455.12605,2015.352941,4.714286,327.407457
warm low value,474.377358,2063.63929,4.531632,38.591926
new warm,509.304904,516.622601,1.044776,66.599026
active high value,88.820244,1985.90925,5.888307,240.04574
active low value,108.361002,2003.801992,5.935406,40.724525
new active,84.990741,90.013889,1.045635,77.133847


## Retrospective Segmentation

In [27]:
years = np.sort(df.date_of_purchase.dt.year.unique())
retro_dates = pd.to_datetime(years + 1,format="%Y")
retro_dates

DatetimeIndex(['2006-01-01', '2007-01-01', '2008-01-01', '2009-01-01',
               '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01',
               '2014-01-01', '2015-01-01', '2016-01-01'],
              dtype='datetime64[ns]', freq=None)

In [28]:
def retro(sales_data,retrospective_date):
    """Return the purchases made strictly before ``retrospective_date``.

    The result is an independent copy of the matching rows with a
    ``days_since`` column holding the whole number of days between each
    purchase and ``retrospective_date``. ``sales_data`` itself is not changed.

    Parameters
    ----------
    sales_data : pandas.DataFrame
        Needs a datetime ``date_of_purchase`` column.
    retrospective_date : datetime-like
        Cut-off date; the notebook passes pandas Timestamps.

    Returns
    -------
    pandas.DataFrame
    """
    # Strict '<': a purchase made exactly on the cut-off date is excluded.
    happened_before = sales_data.date_of_purchase < retrospective_date
    snapshot = sales_data.loc[happened_before].copy()
    elapsed = retrospective_date - snapshot.date_of_purchase
    snapshot['days_since'] = elapsed.dt.days
    return snapshot

def rfm(sales_data,year=None):
    """Aggregate purchase rows into one recency/frequency/amount row per customer.

    Parameters
    ----------
    sales_data : pandas.DataFrame
        Needs ``customer_id``, ``days_since`` and ``purchase_amount`` columns.
    year : optional
        When given (anything other than ``None``), it is stored in a constant
        ``year`` column on the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_id`` with columns, in this order:
        ``recency`` (minimum ``days_since``), ``first_purchase`` (maximum
        ``days_since``), ``frequency`` (number of rows) and ``amount`` (mean
        ``purchase_amount``), plus ``year`` when requested.
    """
    by_customer = sales_data.groupby(by='customer_id')
    # Named `summary` rather than `df` so the notebook-level `df` is not shadowed.
    summary = pd.DataFrame({
        'recency' : by_customer.days_since.min(),
        'first_purchase' : by_customer.days_since.max(),
        'frequency' : by_customer.days_since.count(),
        'amount' : by_customer.purchase_amount.mean()},
        columns= ['recency','first_purchase','frequency','amount'])
    # Compare against None explicitly: a plain truthiness test would silently
    # skip a falsy but valid tag such as 0.
    if year is not None:
        summary['year'] = year
    return summary

In [29]:
retro(df,pd.to_datetime('2010')).head()

Unnamed: 0,customer_id,purchase_amount,date_of_purchase,year_of_purchase,days_since
0,760,25.0,2009-11-06,2009,56
2,1200,100.0,2005-10-25,2005,1529
3,1420,50.0,2009-07-09,2009,176
6,2620,30.0,2006-03-09,2006,1394
7,3050,50.0,2007-04-12,2007,995


In [30]:
rfm(retro(df,pd.to_datetime('2010'))).head()

Unnamed: 0_level_0,recency,first_purchase,frequency,amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,1638,1638,1,30.0
80,266,1560,3,60.0
90,72,1592,5,135.6
130,779,1519,2,50.0
160,772,1386,2,30.0


In [31]:
# Build one RFM snapshot per retrospective date and stack them. Each snapshot
# is tagged with date.year - 1, i.e. the year that ends at that cut-off date
# (the dates are all January 1st). The result is indexed by
# (customer_id, year) and sorted on that index.
customers = (
    pd.concat([rfm(retro(df, date), date.year - 1) for date in retro_dates])
    .set_index('year', append=True)
    .sort_index()
)
customers.head(22)

Unnamed: 0_level_0,Unnamed: 1_level_0,recency,first_purchase,frequency,amount
customer_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,2005,177,177,1,30.0
10,2006,542,542,1,30.0
10,2007,907,907,1,30.0
10,2008,1273,1273,1,30.0
10,2009,1638,1638,1,30.0
10,2010,2003,2003,1,30.0
10,2011,2368,2368,1,30.0
10,2012,2734,2734,1,30.0
10,2013,3099,3099,1,30.0
10,2014,3464,3464,1,30.0


In [32]:
customers = segment(customers)
customers.head(22)

Unnamed: 0_level_0,Unnamed: 1_level_0,recency,first_purchase,frequency,amount,segment
customer_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,2005,177,177,1,30.0,new active
10,2006,542,542,1,30.0,new warm
10,2007,907,907,1,30.0,cold
10,2008,1273,1273,1,30.0,inactive
10,2009,1638,1638,1,30.0,inactive
10,2010,2003,2003,1,30.0,inactive
10,2011,2368,2368,1,30.0,inactive
10,2012,2734,2734,1,30.0,inactive
10,2013,3099,3099,1,30.0,inactive
10,2014,3464,3464,1,30.0,inactive


In [33]:
customers.segment.unstack('year').head(6)

year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10,new active,new warm,cold,inactive,inactive,inactive,inactive,inactive,inactive,inactive,inactive
80,new active,new warm,active low value,warm low value,active low value,warm low value,active low value,warm low value,active low value,active low value,active low value
90,new active,active high value,active high value,active high value,active high value,active high value,active high value,active high value,active high value,warm high value,cold
120,,,,,,,,new active,new warm,cold,inactive
130,new active,new warm,active low value,warm low value,cold,inactive,inactive,inactive,inactive,inactive,inactive
160,,new active,active low value,warm low value,cold,inactive,inactive,inactive,inactive,inactive,inactive


In [34]:
customers.xs(120,level=0)

Unnamed: 0_level_0,recency,first_purchase,frequency,amount,segment
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012,306,306,1,20.0,new active
2013,671,671,1,20.0,new warm
2014,1036,1036,1,20.0,cold
2015,1401,1401,1,20.0,inactive


In [35]:
idx = pd.IndexSlice
customers.loc[idx[:,2014],:].groupby(by='segment').mean()

Unnamed: 0_level_0,recency,first_purchase,frequency,amount
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
inactive,2058.441294,2353.036874,1.730964,48.111199
cold,866.616814,1565.444032,2.254064,51.114605
warm high value,461.198198,1878.810811,4.414414,187.84911
warm low value,470.661088,1945.426778,4.361925,37.38206
new warm,497.3176,504.88,1.0576,51.367653
active high value,85.338947,1838.050526,5.696842,261.902155
active low value,98.091,1796.351378,5.633677,40.459174
new active,132.086987,142.87126,1.070981,69.725159


In [36]:
segmentation_by_year = customers.segment.unstack('year')
cust_segs_2014 = segmentation_by_year[2014]
cust_segs_2015 = segmentation_by_year[2015]
pd.crosstab(index=cust_segs_2014,columns=cust_segs_2015,margins=True)

2015,inactive,cold,warm high value,warm low value,new warm,active high value,active low value,new active,All
2014,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
inactive,7227,0,0,0,0,35,250,0,7512
cold,1931,0,0,0,0,22,200,0,2153
warm high value,0,75,0,0,0,35,1,0,111
warm low value,0,689,0,0,0,1,266,0,956
new warm,0,1139,0,0,0,15,96,0,1250
active high value,0,0,119,0,0,354,2,0,475
active low value,0,0,0,901,0,22,2088,0,3011
new active,0,0,0,0,938,89,410,0,1437
All,9158,1903,119,901,938,573,3313,0,16905


In [37]:
ct = pd.crosstab(index=cust_segs_2014,columns=cust_segs_2015,normalize='all',margins=True)
ct

2015,inactive,cold,warm high value,warm low value,new warm,active high value,active low value,new active,All
2014,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
inactive,0.427507,0.0,0.0,0.0,0.0,0.00207,0.014789,0.0,0.444366
cold,0.114227,0.0,0.0,0.0,0.0,0.001301,0.011831,0.0,0.127359
warm high value,0.0,0.004437,0.0,0.0,0.0,0.00207,5.9e-05,0.0,0.006566
warm low value,0.0,0.040757,0.0,0.0,0.0,5.9e-05,0.015735,0.0,0.056551
new warm,0.0,0.067377,0.0,0.0,0.0,0.000887,0.005679,0.0,0.073943
active high value,0.0,0.0,0.007039,0.0,0.0,0.020941,0.000118,0.0,0.028098
active low value,0.0,0.0,0.0,0.053298,0.0,0.001301,0.123514,0.0,0.178113
new active,0.0,0.0,0.0,0.0,0.055487,0.005265,0.024253,0.0,0.085004
All,0.541733,0.11257,0.007039,0.053298,0.055487,0.033895,0.195978,0.0,1.0


In [38]:
from matplotlib.ticker import FuncFormatter
# Leave room below the axes for the long, rotated segment labels.
plt.figure().subplots_adjust(bottom=.3)
# ct holds fractions in [0, 1]; show the y ticks as percentages. Formatting
# with a fixed precision avoids binary float noise in the labels (for example
# str(100*0.07) is '7.000000000000001').
formatter = FuncFormatter(lambda y, pos: '{:.1f} %'.format(100 * y))
plt.gca().yaxis.set_major_formatter(formatter)
# ct.All is the row margin, i.e. the share of customers per 2014 segment;
# [:-1] drops the grand-total 'All' row.
ct.All[:-1].plot(kind='bar',ylim=[0.0,1.0],title="Customer Segmentation 2014")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x114c699b0>

### Segments and Revenue Generation

In [39]:
# Total purchase amount per customer per calendar year, with index level names
# aligned to those of `customers` so the two can be joined on the index.
revenue = df.groupby(by=['customer_id','year_of_purchase']).purchase_amount.sum()
revenue.name = "revenue"
revenue.index = revenue.index.set_names(['customer_id','year'])

# Drop a previous 'revenue' column first so this cell can be re-run without a
# column-overlap error from join.
if 'revenue' in customers.columns:
    customers = customers.drop(['revenue'],axis=1)
customers = customers.join(revenue)

# (customer, year) pairs with no purchases have no match in `revenue`; count
# them as zero revenue. Assign the filled column back explicitly: an in-place
# fillna on `customers.revenue` operates on an intermediate object and is not
# guaranteed to update the frame.
customers['revenue'] = customers['revenue'].fillna(0)
customers.head(22)

Unnamed: 0_level_0,Unnamed: 1_level_0,recency,first_purchase,frequency,amount,segment,revenue
customer_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,2005,177,177,1,30.0,new active,30.0
10,2006,542,542,1,30.0,new warm,0.0
10,2007,907,907,1,30.0,cold,0.0
10,2008,1273,1273,1,30.0,inactive,0.0
10,2009,1638,1638,1,30.0,inactive,0.0
10,2010,2003,2003,1,30.0,inactive,0.0
10,2011,2368,2368,1,30.0,inactive,0.0
10,2012,2734,2734,1,30.0,inactive,0.0
10,2013,3099,3099,1,30.0,inactive,0.0
10,2014,3464,3464,1,30.0,inactive,0.0


#### Revenue in 2015 by Segment

In [40]:
customers_2015 = customers.loc[idx[:,2015],:]
customers_2015.groupby(by=['segment']).revenue.mean()

segment
inactive               0.000000
cold                   0.000000
warm high value        0.000000
warm low value         0.000000
new warm               0.000000
active high value    323.568935
active low value      52.306043
new active            79.166144
Name: revenue, dtype: float64

#### Projected Average Revenue By Segment

In [41]:
# For every (customer, year) row, look up the same customer's revenue in the
# following row (shift(-1) within each customer_id group). A customer's last
# row has no successor and gets NaN.
next_years_revenue = customers['revenue'].groupby(level='customer_id').shift(-1)
next_years_revenue.name = 'next_years_revenue'

# errors='ignore' makes the drop a no-op on the first run and removes the old
# column on a re-run, so the join never sees an overlapping column name.
customers = (
    customers
    .drop(['next_years_revenue'], axis=1, errors='ignore')
    .join(next_years_revenue)
)

In [42]:
customers_2014 = customers.loc[idx[:,2014],:]
expected_revenue = customers_2014.groupby(by='segment').next_years_revenue.mean()
expected_revenue

segment
inactive               2.949466
cold                   6.108221
warm high value      114.459459
warm low value        13.494770
new warm               5.064000
active high value    254.077895
active low value      41.896556
new active            31.046625
Name: next_years_revenue, dtype: float64

In [43]:
expected_revenue.sort_values(ascending=False)

segment
active high value    254.077895
warm high value      114.459459
active low value      41.896556
new active            31.046625
warm low value        13.494770
cold                   6.108221
new warm               5.064000
inactive               2.949466
Name: next_years_revenue, dtype: float64

In [44]:
plt.figure().subplots_adjust(bottom=.4)
formatter = FuncFormatter(lambda y, pos: '$ ' + str(y))
plt.gca().yaxis.set_major_formatter(formatter)
ax = expected_revenue.sort_values(ascending=False).plot.bar(title="Projected Average Revenue By Customer Segment")
ax.set_ylabel("projected average revenue")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x11a833198>

# Targeting and Scoring Models