<a href="https://colab.research.google.com/github/SarangWanodei20/learn_python/blob/master/Practice_Final_Exam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Customer Lifetime Value Prediction Model

 Data Preprocessing

In [1]:
import pandas as pd

# Read the transaction log from disk. The CSV is expected to provide the
# columns 'customer_id', 'order_date' and 'order_amount'.
transactions_loaded = pd.read_csv('transactions.csv')

# Parse the order timestamps so that date arithmetic works downstream.
transactions_loaded['order_date'] = pd.to_datetime(transactions_loaded['order_date'])

# Basic cleaning: discard every row that has a missing value in any column.
transactions = transactions_loaded.dropna()


FileNotFoundError: [Errno 2] No such file or directory: 'transactions.csv'

Feature Engineering

In [None]:
# Recency is measured against the most recent order date found in the data.
reference_date = transactions['order_date'].max()

# One row per customer, built with named aggregations:
#   recency   - whole days between the customer's last order and reference_date
#   frequency - number of orders placed by the customer
#   aov       - mean order amount (average order value)
orders_by_customer = transactions.groupby('customer_id')
features = orders_by_customer.agg(
    recency=('order_date', lambda dates: (reference_date - dates.max()).days),
    frequency=('order_date', 'count'),
    aov=('order_amount', 'mean'),
).reset_index()

# Proxy target: no observed lifetime value is available here, so it is
# approximated as order count times average order value.
features['ltv'] = features['frequency'] * features['aov']


Model Training

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# NOTE(review): 'ltv' is built in the feature-engineering cell as
# frequency * aov, and both factors are used as inputs below, so the target is
# an exact function of the features. The metrics printed here therefore show
# how closely the model reproduces that product, not how well it forecasts
# future customer value - confirm whether a forward-looking target is intended.
feature_columns = ['recency', 'frequency', 'aov']
X = features[feature_columns]
y = features['ltv']

# Hold out 20% of the customers for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted regressor with the library's default hyper-parameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Score the held-out customers and report both error metrics.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")


Visualization & Segmentation


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of actual vs. predicted LTV for the held-out test customers.
# The explicit Axes handle keeps the labels attached to this figure even if
# another figure is opened in the same cell.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
ax.set_xlabel("Actual LTV")
ax.set_ylabel("Predicted LTV")
ax.set_title("LTV Prediction Accuracy")

# Customer segmentation: score every customer, then split them into four
# equally sized groups ordered from lowest to highest predicted value.
features['predicted_ltv'] = model.predict(X)

# Bin on ranks instead of the raw predictions. pd.qcut raises
# "Bin edges must be unique" when tied predictions make two quantile edges
# coincide; rank(method='first') gives every customer a distinct rank (ties
# are broken by row order), so the quartile edges stay unique. When all
# predictions are distinct the resulting segments match binning on raw values.
segment_labels = ['Low', 'Medium', 'High', 'Very High']
prediction_ranks = features['predicted_ltv'].rank(method='first')
features['segment'] = pd.qcut(
    prediction_ranks,
    q=len(segment_labels),
    labels=segment_labels,
)


Deliverables

Notebook: Include data loading, preprocessing, model training, evaluation, and visualizations

Trained model (pickle or joblib file)

In [None]:
import joblib
# Persist the fitted regressor to disk so it can be reloaded without retraining.
joblib.dump(value=model, filename='ltv_model.pkl')


Final CSV with Predictions

In [None]:
# Write the per-customer feature table (with whatever columns earlier cells
# added to it) to CSV, leaving out the positional row index.
features.to_csv(path_or_buf='ltv_predictions.csv', index=False)