In [6]:
import boto3
import pandas as pd

# Low-level S3 client (not a bucket handle); used to fetch the input CSV.
client = boto3.client(service_name='s3')

In [11]:
# Fetch the transactions CSV object from S3 and parse its streamed body.
obj = client.get_object(
    Bucket='info7374s3alycefinalproject',
    Key='FinalAlyce_facts_ML.csv',
)
df = pd.read_csv(obj['Body'])

In [12]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,order_id,client_id,service_id,gift_id,gift_cost,total_gifts,total_amount,fee,variable_cost,total_revenue,date
0,11110,419,1,16,1290,13,16764,3353,4326,20117,6/10/2019
1,11111,503,3,21,236,50,11783,2357,5976,14140,6/16/2019
2,11112,395,2,23,212,71,15050,3010,4836,18060,9/9/2018
3,11113,292,1,22,445,38,16919,3384,6144,20303,6/15/2019
4,11114,408,1,14,1248,11,13727,2745,5632,16472,2/22/2018


In [13]:
import datetime as dt
# Parse the raw `date` strings, then keep only the calendar-date part
# (via the pandas `.dt` accessor) in a new `Date` column.
order_timestamps = pd.to_datetime(df['date'])
df['Date'] = order_timestamps.dt.date

In [17]:
# Keep only rows that have a client id and a positive gift count.
has_client = df['client_id'].notna()
has_gifts = df['total_gifts'] > 0
df = df[has_client & has_gifts]

In [18]:
# Reduce the frame to client id, purchase date and sales amount, where
# `Sales` is a copy of `total_revenue`.
# `assign` returns a new frame instead of writing a column into `df`, which
# at this point may be a boolean-filtered slice of the loaded frame; writing
# into such a slice can raise pandas' SettingWithCopyWarning.
cols_of_interest = ['client_id', 'Date', 'Sales']
df = df.assign(Sales=df['total_revenue'])[cols_of_interest]

In [19]:
# Show a preview of the reduced frame followed by the number of distinct clients.
distinct_clients = df['client_id'].nunique()
print(df.head(), distinct_clients, sep='\n')

   client_id        Date  Sales
0        419  2019-06-10  20117
1        503  2019-06-16  14140
2        395  2018-09-09  18060
3        292  2019-06-15  20303
4        408  2018-02-22  16472
421


In [26]:
from lifetimes.plotting import *
from lifetimes.utils import *

# Collapse the transaction log into one summary row per client, with
# `Sales` as the monetary value and a fixed end of the observation period.
data = summary_data_from_transaction_data(
    df,
    customer_id_col='client_id',
    datetime_col='Date',
    monetary_value_col='Sales',
    observation_period_end='2019-12-1',
)
data.head()

Unnamed: 0_level_0,frequency,recency,T,monetary_value
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
111,9.0,644.0,682.0,17225.0
112,5.0,606.0,619.0,16560.8
113,9.0,615.0,615.0,17261.666667
114,6.0,615.0,678.0,19652.833333
115,7.0,664.0,693.0,17169.428571


In [27]:
# Summary statistics of `frequency`, then the fraction of clients whose
# frequency is exactly zero.
purchase_counts = data['frequency']
print(purchase_counts.describe())
print(int((purchase_counts == 0).sum()) / float(len(data)))

count    421.000000
mean       8.047506
std        3.087243
min        1.000000
25%        6.000000
50%        8.000000
75%       10.000000
max       19.000000
Name: frequency, dtype: float64
0.0


In [28]:
from lifetimes import BetaGeoFitter

# Fit lifetimes' BetaGeoFitter on the per-client summary columns; the API
# mirrors scikit-learn / lifelines (construct, then `.fit`).
bgf = BetaGeoFitter(penalizer_coef=0.1)
bgf.fit(
    frequency=data['frequency'],
    recency=data['recency'],
    T=data['T'],
)
print(bgf)

<lifetimes.BetaGeoFitter: fitted with 421 subjects, a: 0.00, alpha: 108.42, b: 0.00, r: 1.40>


In [29]:
# Expected number of purchases per client up to time `t`, stored on the
# summary frame; display the five clients with the largest predictions.
t = 1
data['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t,
    frequency=data['frequency'],
    recency=data['recency'],
    T=data['T'],
)
data.sort_values(by='predicted_purchases').tail(5)

Unnamed: 0_level_0,frequency,recency,T,monetary_value,predicted_purchases
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
131,15.0,429.0,552.0,17106.266667,0.024838
476,14.0,508.0,510.0,18400.0,0.024908
359,17.0,583.0,626.0,18154.588235,0.025059
289,19.0,617.0,684.0,18309.631579,0.025748
519,17.0,539.0,553.0,19076.294118,0.027824


In [30]:
from lifetimes.utils import calibration_and_holdout_data

# Split the transaction log into a calibration window (ending 2019-06-01)
# and a holdout window (ending 2019-12-01), summarised per client.
summary_cal_holdout = calibration_and_holdout_data(
    df,
    customer_id_col='client_id',
    datetime_col='Date',
    calibration_period_end='2019-06-01',
    observation_period_end='2019-12-01',
)
print(summary_cal_holdout.head())

           frequency_cal  recency_cal  T_cal  frequency_holdout  \
client_id                                                         
111                  3.0        424.0  499.0                6.0   
112                  4.0        274.0  436.0                1.0   
113                  3.0        421.0  432.0                6.0   
114                  4.0        433.0  495.0                2.0   
115                  4.0        393.0  510.0                3.0   

           duration_holdout  
client_id                    
111                     183  
112                     183  
113                     183  
114                     183  
115                     183  


In [31]:
t = 10  # predict purchases in 10 periods
individual = data.loc[359]
# `bgf.predict` is an alias of `bgf.conditional_expected_number_of_purchases_up_to_time`.
bgf.predict(
    t,
    frequency=individual['frequency'],
    recency=individual['recency'],
    T=individual['T'],
)

0.25058707054398105

In [32]:
# Show the summary row for client 359.
data.loc[359]

frequency                 17.000000
recency                  583.000000
T                        626.000000
monetary_value         18154.588235
predicted_purchases        0.025059
Name: 359, dtype: float64

In [33]:
# Keep only the summary rows whose `frequency` is above zero.
is_returning = data['frequency'].gt(0)
returning_customers_summary = data.loc[is_returning]

print(returning_customers_summary.head(), len(returning_customers_summary), sep='\n')

           frequency  recency      T  monetary_value  predicted_purchases
client_id                                                                
111              9.0    644.0  682.0    17225.000000             0.013162
112              5.0    606.0  619.0    16560.800000             0.008803
113              9.0    615.0  615.0    17261.666667             0.014381
114              6.0    615.0  678.0    19652.833333             0.009414
115              7.0    664.0  693.0    17169.428571             0.010486
421


In [34]:
# Correlation between monetary value and frequency for the selected clients.
returning_customers_summary[['monetary_value', 'frequency']].corr()

Unnamed: 0,monetary_value,frequency
monetary_value,1.0,0.015748
frequency,0.015748,1.0


In [36]:
from lifetimes import GammaGammaFitter

# Fit lifetimes' GammaGammaFitter on frequency and monetary value of the
# returning-customer subset.
ggf = GammaGammaFitter(penalizer_coef=0.1)
ggf.fit(
    frequency=returning_customers_summary['frequency'],
    monetary_value=returning_customers_summary['monetary_value'],
)
print(ggf)

<lifetimes.GammaGammaFitter: fitted with 421 subjects, p: 0.74, q: 0.10, v: 0.70>


In [37]:
# Per-client conditional expected average profit; print the first ten.
expected_avg_profit = ggf.conditional_expected_average_profit(
    data['frequency'],
    data['monetary_value'],
)
print(expected_avg_profit.head(10))

client_id
111    19937.302320
112    21931.180870
113    19979.742442
114    24691.389229
115    20809.168714
116    24139.800824
117    20312.151182
118    25952.990910
119    20694.376656
120    21052.045365
dtype: float64


In [38]:
# Re-fit the purchase model on the summary columns, then combine it with the
# fitted monetary model to get a per-client lifetime value.
bgf.fit(
    frequency=data['frequency'],
    recency=data['recency'],
    T=data['T'],
)

clv = ggf.customer_lifetime_value(
    bgf,  # the model to use to predict the number of future transactions
    data['frequency'],
    data['recency'],
    data['T'],
    data['monetary_value'],
    time=12,  # months
    discount_rate=0.01,  # monthly discount rate ~ 12.7% annually
)
print(clv.head(10))

client_id
111     88605.398785
112     65187.841891
113     97017.778035
114     78487.460084
115     73676.153403
116     70302.988616
117    119074.589903
118     67798.237655
119     73729.721964
120    120844.571995
Name: clv, dtype: float64


In [55]:
# Per-client conditional expected average profit, converted from a Series
# to a one-column DataFrame; the cell displays the resulting type.
expected_profit_series = ggf.conditional_expected_average_profit(
    data['frequency'],
    data['monetary_value'],
)
da = expected_profit_series.to_frame()

type(da)

pandas.core.frame.DataFrame

In [None]:
# NOTE(review): removed a never-executed call to `s3.meta.client.upload_file`
# that uploaded '/tmp/hello.txt' to bucket 'mybucket'. No `s3` name is defined
# in the visible notebook, so the call raised NameError on Restart & Run All,
# and its arguments are placeholder values unrelated to this analysis.

In [56]:
# Preview the expected-average-profit frame.
da.head()

Unnamed: 0_level_0,0
client_id,Unnamed: 1_level_1
111,19937.30232
112,21931.18087
113,19979.742442
114,24691.389229
115,20809.168714


In [57]:
from io import StringIO # python3; python2: BytesIO 
import boto3

In [58]:

# Serialise the frame as CSV text into an in-memory buffer.
csv_buffer = StringIO()
da.to_csv(path_or_buf=csv_buffer)

In [60]:

# Write the buffered CSV text to S3 as `ltv.csv`; the cell displays the
# response returned by `put`.
s3_resource = boto3.resource('s3')
ltv_object = s3_resource.Object('info7374s3alycefinalproject', 'ltv.csv')
ltv_object.put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'B9DD013CB6DA3458',
  'HostId': '+mEk/pl5/+WMC0W3Qeo9Yc/9qSgYCt91iu3ZoRfVgDBeRS2XyCYwvpb8MlkET2RRFFgrdBIR1WQ=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '+mEk/pl5/+WMC0W3Qeo9Yc/9qSgYCt91iu3ZoRfVgDBeRS2XyCYwvpb8MlkET2RRFFgrdBIR1WQ=',
   'x-amz-request-id': 'B9DD013CB6DA3458',
   'date': 'Fri, 16 Aug 2019 00:35:03 GMT',
   'etag': '"414ef062aa6e9e01e1eb7443bdf2288f"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"414ef062aa6e9e01e1eb7443bdf2288f"'}