In [8]:
!pip install lifetimes
import pandas as pd
import numpy as np
from lifetimes import BetaGeoFitter, GammaGammaFitter
from sklearn.preprocessing import LabelEncoder
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx

# Load the data
df = pd.read_excel("Online Retail.xlsx")

# ------------------ 1. Data Preprocessing ------------------
# Remove missing values
df.dropna(subset=['CustomerID'], inplace=True)

# Remove cancelled orders
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

# Convert types
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['CustomerID'] = df['CustomerID'].astype(int)

# Calculate total price
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# ------------------ 2. Aggregate per customer ------------------
from lifetimes.utils import summary_data_from_transaction_data

summary = summary_data_from_transaction_data(df, 'CustomerID', 'InvoiceDate',
                                             monetary_value_col='TotalPrice', observation_period_end=None)

# Filter out customers with negative or 0 monetary value
summary = summary[summary['monetary_value'] > 0]

# ------------------ 3. Fit BG/NBD model ------------------
bgf = BetaGeoFitter(penalizer_coef=0.01)
bgf.fit(summary['frequency'], summary['recency'], summary['T'])

# Predict expected purchases in next 3 months (90 days)
summary['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(90,
                                summary['frequency'], summary['recency'], summary['T'])

# ------------------ 4. Fit Gamma-Gamma Model ------------------
ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(summary['frequency'], summary['monetary_value'])

# Predict CLV
summary['predicted_clv'] = ggf.customer_lifetime_value(
    bgf, summary['frequency'], summary['recency'], summary['T'],
    summary['monetary_value'], time=3, discount_rate=0.01)

# ------------------ 5. Merge with Demographics ------------------
# Average country or encode if needed
customer_info = df.groupby('CustomerID').agg({
    'Country': 'first'
}).reset_index()

# Label encode country
le = LabelEncoder()
customer_info['Country_encoded'] = le.fit_transform(customer_info['Country'])

# Merge
final_df = pd.merge(summary.reset_index(), customer_info, on='CustomerID')

# ------------------ 6. Optional: Predict CLV with Regression ------------------
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = final_df[['frequency', 'recency', 'T', 'monetary_value', 'Country_encoded']]
y = final_df['predicted_clv']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
print("R² on Test Data:", reg.score(X_test, y_test))

# ------------------ 7. Display Top Customers ------------------
top_customers = final_df[['CustomerID', 'predicted_clv']].sort_values(by='predicted_clv', ascending=False)
print(top_customers.head(10))

--2025-07-21 13:16:22--  https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘Online Retail.xlsx.1’

Online Retail.xlsx.     [    <=>             ]  22.62M  30.1MB/s    in 0.8s    

2025-07-21 13:16:23 (30.1 MB/s) - ‘Online Retail.xlsx.1’ saved [23715344]

R² on Test Data: 0.42383332710411603
      CustomerID  predicted_clv
1938       16446  151600.600013
1111       14646   57894.322528
2706       18102   48326.682010
2413       17450   39696.352160
834        14096   33424.172684
1243       14911   27661.620714
34         12415   27239.889080
864        14156   23338.986562
2441       17511   18120.798990
1745       16029   15342.399734
