# Project Title - Customer Lifetime Value Prediction

In [60]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [61]:
# Open the datafile
data=pd.read_csv("D:\\Data_Science\\2nd_sem\\Internsip\\Dataset\\customer_segmentation.csv",encoding='latin1')
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [62]:
data.shape

(541909, 8)

In [63]:
data['TotalSpent'] = data['Quantity'] * data['UnitPrice']  # Total purchase per invoice
data['AvgOrderValue'] = data.groupby('CustomerID')['TotalSpent'].transform('mean')  # Average order value per customer
data['NumTransactions'] = data.groupby('CustomerID')['InvoiceNo'].transform('count')  # Number of transactions per customer

In [64]:
data['CLV'] = data.groupby('CustomerID')['TotalSpent'].transform('sum') # CLV calculation

# Separate features and target variable

In [65]:
X = data[['TotalSpent', 'AvgOrderValue', 'NumTransactions']]  # Features
y = data['CLV']  # Target variable

In [66]:
X

Unnamed: 0,TotalSpent,AvgOrderValue,NumTransactions
0,15.30,16.950737,312.0
1,20.34,16.950737,312.0
2,22.00,16.950737,312.0
3,20.34,16.950737,312.0
4,20.34,16.950737,312.0
...,...,...,...
541904,10.20,16.592500,52.0
541905,12.60,16.592500,52.0
541906,16.60,16.592500,52.0
541907,16.60,16.592500,52.0


In [67]:
y

0         5288.63
1         5288.63
2         5288.63
3         5288.63
4         5288.63
           ...   
541904     862.81
541905     862.81
541906     862.81
541907     862.81
541908     862.81
Name: CLV, Length: 541909, dtype: float64

# Train-Test Split

In [68]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Replace NA values

In [69]:
from sklearn.impute import SimpleImputer

# Create an imputer to replace NaNs with the mean value
imputer = SimpleImputer(strategy='mean')

# Impute missing values in X_train and X_test
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


In [70]:
imputer = SimpleImputer(strategy='median')
y_train_imputed = imputer.fit_transform(y_train.values.reshape(-1, 1))
y_test_imputed = imputer.fit_transform(y_test.values.reshape(-1, 1))

# Choose and train a regression model

In [71]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()  
# Use the imputed data for training your model
model.fit(X_train_imputed, y_train_imputed)

# Predicticting the test result

In [72]:
predicted_clv = model.predict(X_test_imputed)
predicted_clv

array([[ 3057.70793619],
       [ 5714.7925988 ],
       [ 2157.76990788],
       ...,
       [ 5957.66407043],
       [ 8964.30756876],
       [-2175.25184325]])

# Evaluate the model

In [73]:
from sklearn.metrics import mean_squared_error, r2_score
# Evaluating the model
mse = mean_squared_error(y_test_imputed, predicted_clv)
r2 = r2_score(y_test_imputed, predicted_clv)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 359984599.96259636
R^2 Score: 0.4852042136815665
