In [None]:
# Create synthetic incentive data.
# NOTE: the labels are drawn uniformly at random and do not depend on the
# feature values (only on the number of rows in X_train), so the fitted model
# and its feature importances demonstrate the pipeline rather than a real signal.
incentive_types = [
    'discount_10', 'discount_20', 'free_shipping',
    'bundle_deal', 'loyalty_points', 'flash_sale',
    'gift_card'
]
y_incentive = np.random.choice(incentive_types, size=len(X_train))

# Encode incentive types as integer class ids for XGBoost
label_encoder = LabelEncoder()
y_incentive_encoded = label_encoder.fit_transform(y_incentive)

# Train XGBoost model (multi-class: one class per incentive type)
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
xgb_model.fit(X_train, y_incentive_encoded)

# Save model and encoder; the encoder is needed to map predicted class ids
# back to incentive names
with open('../models/incentive_recommender.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

with open('../models/incentive_label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Analyze feature importance (most important feature first)
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

fig = px.bar(importance_df,
             x='importance',
             y='feature',
             title='Feature Importance for Incentive Recommendation',
             orientation='h')
# Plotly lays out horizontal-bar categories from bottom to top, so plotting the
# descending frame as-is would put the most important feature at the bottom.
# Ordering the axis by bar length keeps the top-ranked feature as the top bar.
fig.update_yaxes(categoryorder='total ascending')
fig.show()

## 3. Incentive Recommendation Model

In [None]:
# Determine optimal number of clusters: fit KMeans for k = 1..10 and record
# the inertia of each fit for the elbow plot
inertias = []
K = range(1, 11)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
fig = px.line(x=list(K), y=inertias,
              title='Elbow Method for Optimal k',
              labels={'x': 'k', 'y': 'Inertia'})
fig.show()

# Train KMeans model with optimal k=5
kmeans_model = KMeans(n_clusters=5, random_state=42)
kmeans_model.fit(X_train)

# Save model
with open('../models/kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans_model, f)

# Generate and save persona profiles
cluster_centers = pd.DataFrame(
    kmeans_model.cluster_centers_,
    columns=X_train.columns
)

# Count the members of every cluster in one vectorized pass instead of
# iterating the label array with the Python builtin sum() once per cluster.
# minlength guarantees an entry for each cluster id.
cluster_sizes = np.bincount(
    kmeans_model.labels_,
    minlength=kmeans_model.n_clusters
)

# One profile per cluster id: its centroid (feature-space coordinates in
# X_train column order), its number of training rows, and a placeholder name.
persona_profiles = {
    i: {
        'center': center,
        # plain int so the pickled profile does not carry NumPy scalars
        'size': int(cluster_sizes[i]),
        'description': f"Persona {i+1}"
    }
    for i, center in enumerate(cluster_centers.values)
}

with open('../models/persona_profiles.pkl', 'wb') as f:
    pickle.dump(persona_profiles, f)

## 2. Customer Segmentation Model

In [None]:
# Read the four preprocessed splits (features and targets, train and test)
X_train, X_test, y_train, y_test = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_test.csv'),
    pd.read_csv('../data/processed/y_train.csv'),
    pd.read_csv('../data/processed/y_test.csv'),
)

# Build the CatBoost classifier and fit it on the training split.
# fit() returns the fitted estimator, so construction and training are chained.
catboost_model = CatBoostClassifier(
    depth=6,
    learning_rate=0.1,
    iterations=1000,
    verbose=100,
).fit(X_train, y_train)

# Predict on the held-out split and print the per-class metrics
y_pred = catboost_model.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Compute SHAP values for the test rows and draw the summary plot
explainer = shap.TreeExplainer(catboost_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

# Persist the fitted classifier
with open('../models/purchase_predictor_catboost.pkl', 'wb') as model_file:
    pickle.dump(catboost_model, model_file)

## 1. Purchase Prediction Model

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import pickle
import shap
import plotly.express as px
import plotly.graph_objects as go

# Seed NumPy's global random number generator so that np.random.* calls made
# after this point produce the same draws on every fresh run
np.random.seed(42)

# 🤖 Model Development - Online Shopper Intention

This notebook focuses on developing three machine learning models:
1. Purchase Prediction (CatBoost Classifier)
2. Customer Segmentation (KMeans Clustering)
3. Incentive Recommendation (XGBoost Multi-class)