In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('customer_purchases.csv')
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


#### Calculate the Customer Life Time Value (CLTV) Using 2 Different methods

1. RFM Method
2. Predictive Modelling


In [3]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

now = df['InvoiceDate'].max()
rfm_df = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (now - x.max()).days,  
    'InvoiceNo': 'count',  
    'TotalPrice': 'sum'  
})

rfm_df.rename(columns={
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'TotalPrice': 'Monetary'
}, inplace=True)

rfm_df['CLTV'] = (rfm_df['Monetary'] / rfm_df['Frequency']) * rfm_df['Recency']

rfm_df.sort_values('CLTV', ascending=False, inplace=True)

rfm_df

Unnamed: 0_level_0,Recency,Frequency,Monetary,CLTV
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13135.0,196,1,3096.00,606816.00
16754.0,371,2,2002.40,371445.20
15749.0,234,15,21535.90,335960.04
13270.0,365,1,590.00,215350.00
17846.0,84,1,2033.10,170780.40
...,...,...,...,...
13154.0,143,1,-611.86,-87495.98
14213.0,371,5,-1192.20,-88461.24
15369.0,143,1,-1592.49,-227726.07
12503.0,337,1,-1126.00,-379462.00


In [4]:
X = rfm_df[['Recency', 'Frequency', 'Monetary']]
y = rfm_df['CLTV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
}


In [5]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred) 
    print(f"{model_name} MSE: {mse}, R-squared: {r2}")

Linear Regression MSE: 553909593.2365263, R-squared: 0.008482673171418487
Random Forest MSE: 265975620.01047832, R-squared: 0.5238944423882679


In [6]:
rf=RandomForestRegressor()

In [7]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred) 
print(f" MSE: {mse}, R-squared: {r2}")

 MSE: 205141326.62601426, R-squared: 0.6327899312777567


In [8]:
def predict():
    dummy_df = pd.DataFrame(columns=['Recency', 'Frequency', 'Monetary'])
    Recency=eval(input('please enter recency : '))
    Frequency=eval(input('please enter Frequency : '))
    Monetary=eval(input('please enter Monetary : '))
    newob=[[Recency,Frequency,Monetary]]
    dummy_df = dummy_df.append(pd.DataFrame(newob, columns=['Recency', 'Frequency', 'Monetary']), ignore_index=True)
    new_pred=rf.predict(dummy_df)[0]
    return new_pred

In [9]:
predict()

please enter recency : 926
please enter Frequency : 2
please enter Monetary : 606816.00


334073.1818749996