In [102]:
# Customer Purchase Prediction - AI/ML Engineer Assignment
# Predicting whether a customer will make a purchase in the next 30 days

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, roc_curve
import shap

# Set random seed
np.random.seed(42)

print("="*60)
print("CUSTOMER PURCHASE PREDICTION ANALYSIS")
print("="*60)

CUSTOMER PURCHASE PREDICTION ANALYSIS


In [103]:
# 1. Data Loading and Exploration

print("\n1. LOADING AND EXPLORING DATA")
print("-" * 40)

# Load data
df = pd.read_csv('customer_data.csv')
print(f"Dataset shape: {df.shape}\n")

# Preview
print("First 5 rows:")
print(df.head(), "\n")

# Info
print("Dataset Info:")
print(df.info(), "\n")

# Missing values
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing Percentage': missing_percent
}).sort_values('Missing Count', ascending=False)
print("Missing Values:")
print(missing_df[missing_df['Missing Count'] > 0], "\n")

# Target distribution
target_dist = df['is_purchased_next_30'].value_counts()
target_percent = df['is_purchased_next_30'].value_counts(normalize=True) * 100
print("Target Variable Distribution:")
print(f"Class 0 (No Purchase): {target_dist[0]} ({target_percent[0]:.1f}%)")
print(f"Class 1 (Purchase): {target_dist[1]} ({target_percent[1]:.1f}%)\n")

# Numerical summary
num_cols = df.select_dtypes(include=[np.number]).columns
print("Numerical Features Summary:")
print(df[num_cols].describe())



1. LOADING AND EXPLORING DATA
----------------------------------------
Dataset shape: (5000, 13)

First 5 rows:
  customer_id signup_date last_activity_date last_purchase_date  total_spent  \
0     C100000  2023-08-02         2024-10-12         2023-10-27       397.81   
1     C100001  2020-08-16         2020-11-13                NaN         0.00   
2     C100002  2020-02-21         2020-12-18         2020-08-28        33.25   
3     C100003  2024-02-27         2026-06-19         2025-09-15        77.56   
4     C100004  2021-07-17         2023-05-25                NaN         0.00   

   avg_cart_value  num_visits_30d  num_purchases_90d  \
0           22.76               0                  0   
1            0.00               0                  0   
2           51.38               1                  1   
3           32.41               1                  2   
4            0.00               2                  0   

   days_since_last_purchase preferred_device     city    country  \
0

In [104]:
# 2. Data Preprocessing and Feature Engineering


def preprocess_data(df):
    print("\n2. DATA PREPROCESSING AND FEATURE ENGINEERING")
    print("-" * 50)

    df_processed = df.copy()

    # Parse date columns
    for col in ['signup_date', 'last_activity_date', 'last_purchase_date']:
        df_processed[col] = pd.to_datetime(df_processed[col], errors='coerce')

    # Customers who purchased before
    df_processed['has_purchased_before'] = df_processed['last_purchase_date'].notna().astype(int)

    # Reference date for time-based features
    reference_date = pd.concat([df_processed[c] for c in ['signup_date', 'last_activity_date', 'last_purchase_date']]).max() + pd.Timedelta(days=1)
    df_processed['days_since_signup'] = (reference_date - df_processed['signup_date']).dt.days
    df_processed['days_since_last_activity'] = (reference_date - df_processed['last_activity_date']).dt.days

    # Handle last purchase for customers who never purchased
    df_processed['days_since_last_purchase_filled'] = df_processed['days_since_last_purchase'].fillna(df_processed['days_since_signup'])

    # RFM features
    df_processed['recency'] = df_processed['days_since_last_purchase_filled']
    df_processed['frequency'] = df_processed['num_purchases_90d']
    df_processed['monetary'] = df_processed['total_spent']

    df_processed['recency_score'] = pd.qcut(df_processed['recency'].rank(method='first'), 5, labels=[5,4,3,2,1]).astype(int)
    df_processed['frequency_score'] = pd.qcut(df_processed['frequency'].rank(method='first'), 5, labels=[1,2,3,4,5]).astype(int)
    df_processed['monetary_score'] = pd.qcut(df_processed['monetary'].rank(method='first'), 5, labels=[1,2,3,4,5]).astype(int)
    df_processed['rfm_score'] = (df_processed['recency_score'].astype(str) +
                                 df_processed['frequency_score'].astype(str) +
                                 df_processed['monetary_score'].astype(str)).astype(int)

    # Customer engagement features
    df_processed['customer_age_days'] = df_processed['days_since_signup']
    df_processed['activity_recency'] = df_processed['days_since_last_activity']
    df_processed['visits_per_day'] = df_processed['num_visits_30d'] / 30
    df_processed['purchase_frequency'] = df_processed['num_purchases_90d'] / 90
    df_processed['avg_days_between_purchases'] = np.where(
        df_processed['num_purchases_90d'] > 0,
        90 / df_processed['num_purchases_90d'],
        df_processed['days_since_signup']
    )

    # Customer segments
    df_processed['is_high_value'] = (df_processed['total_spent'] > df_processed['total_spent'].quantile(0.8)).astype(int)
    df_processed['is_frequent_visitor'] = (df_processed['num_visits_30d'] > df_processed['num_visits_30d'].quantile(0.7)).astype(int)
    df_processed['is_recent_purchaser'] = (df_processed['days_since_last_purchase_filled'] < 30).astype(int)

    # Encode categorical variables
    for col in ['preferred_device', 'city', 'country']:
        df_processed[f'{col}_encoded'] = LabelEncoder().fit_transform(df_processed[col])

    # Device dummies
    df_processed = pd.concat([df_processed, pd.get_dummies(df_processed['preferred_device'], prefix='device')], axis=1)

    print(f"Processed dataset shape: {df_processed.shape}")
    print("New features created successfully!")

    return df_processed



In [105]:
# Execute Preprocessing
df_processed = preprocess_data(df)


2. DATA PREPROCESSING AND FEATURE ENGINEERING
--------------------------------------------------
Processed dataset shape: (5000, 38)
New features created successfully!


In [106]:
# 3. Feature Selection and Preparation

def prepare_features(df_processed):
    """Select and prepare features for modeling"""
    print("\n3. FEATURE SELECTION AND PREPARATION")
    print("-" * 40)

    # feature columns
    feature_columns = [

        'total_spent', 'avg_cart_value', 'num_visits_30d', 'num_purchases_90d',
        'days_since_last_purchase_filled', 'days_since_signup', 'days_since_last_activity',
        'recency_score', 'frequency_score', 'monetary_score',
        'customer_age_days', 'activity_recency', 'visits_per_day', 'purchase_frequency', 'avg_days_between_purchases',
        'has_purchased_before', 'is_high_value', 'is_frequent_visitor', 'is_recent_purchaser',
        'preferred_device_encoded', 'city_encoded',
        'device_desktop', 'device_mobile', 'device_tablet'
    ]


    X = df_processed[feature_columns].copy()
    y = df_processed['is_purchased_next_30'].copy()

    # Handle infinite values
    X = X.replace([np.inf, -np.inf], np.nan)

    # Fill remaining missing values
    X = X.fillna(X.median())

    print(f"Feature matrix shape: {X.shape}")
    print(f"Target vector shape: {y.shape}")
    print(f"Features selected: {len(feature_columns)}")

    return X, y, feature_columns

# Prepare features
X, y, feature_names = prepare_features(df_processed)


3. FEATURE SELECTION AND PREPARATION
----------------------------------------
Feature matrix shape: (5000, 24)
Target vector shape: (5000,)
Features selected: 24


In [107]:
# 4. Model Training and Evaluation

def train_models_cv(X, y, feature_names):

    print("\nMODEL TRAINING AND EVALUATION (WITH CV & HYPERPARAMETER TUNING)")
    print("-" * 60)

    # Split data (for final test evaluation)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

    # Scale features for Logistic Regression
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define models and hyperparameter grids
    models = {
        'Logistic Regression': {
            'model': LogisticRegression(random_state=42, max_iter=1000),
            'params': {'C': [0.01, 0.1, 1, 10]}
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}
        }
    }

    results = {}
    trained_models = {}

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for name, m in models.items():
        print(f"\nTuning and training {name}...")
        if name == 'Logistic Regression':
            grid = GridSearchCV(m['model'], m['params'], cv=cv, scoring='roc_auc')
            grid.fit(X_train_scaled, y_train)
            best_model = grid.best_estimator_
            y_pred = best_model.predict(X_test_scaled)
            y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
            trained_models[name] = {'model': best_model, 'scaler': scaler}
        else:
            grid = GridSearchCV(m['model'], m['params'], cv=cv, scoring='roc_auc')
            grid.fit(X_train, y_train)
            best_model = grid.best_estimator_
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:, 1]
            trained_models[name] = {'model': best_model, 'scaler': None}

        # Calculate metrics
        results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'roc_auc': roc_auc_score(y_test, y_pred_proba)
        }

        print(f"Best hyperparameters: {grid.best_params_}")

    # Comparison
    results_df = pd.DataFrame(results).T
    print("\nMODEL COMPARISON:")
    print(results_df.round(4))

    # --- Save model comparison results ---
    results_df.to_csv("model_comparison.csv")
    print("Model comparison saved to 'model_comparison.csv'")

    best_model_name = results_df['roc_auc'].idxmax()
    print(f"\nBest model based on ROC-AUC: {best_model_name}")

    return trained_models, results_df, best_model_name, X_test, y_test, scaler


In [108]:
# Train and evaluate models
trained_models, results_df, best_model_name, X_test, y_test, scaler = train_models_cv(X, y, feature_names)


MODEL TRAINING AND EVALUATION (WITH CV & HYPERPARAMETER TUNING)
------------------------------------------------------------
Training set size: 4000, Test set size: 1000

Tuning and training Logistic Regression...
Best hyperparameters: {'C': 0.01}

Tuning and training Random Forest...
Best hyperparameters: {'max_depth': 5, 'n_estimators': 200}

MODEL COMPARISON:
                     accuracy  precision  recall      f1  roc_auc
Logistic Regression     0.817     0.6667  0.0532  0.0985   0.7021
Random Forest           0.812     0.5000  0.0053  0.0105   0.6921
Model comparison saved to 'model_comparison.csv'

Best model based on ROC-AUC: Logistic Regression


In [109]:
# 5. Model Interpretability
def analyze_feature_importance(trained_models, best_model_name, X_test, feature_names):

    print("\n5. MODEL INTERPRETABILITY")
    print("-" * 30)

    best_model = trained_models[best_model_name]['model']

    # Get feature importance
    if best_model_name == 'Logistic Regression':
        importance = np.abs(best_model.coef_[0])
    else:
        importance = best_model.feature_importances_

    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 Most Important Features ({best_model_name}):")
    print(feature_importance.head(10))

    # SHAP analysis (sampled)
    shap_feature_importance = None
    try:
        X_sample = X_test.sample(min(100, len(X_test)), random_state=42)
        explainer = shap.LinearExplainer(best_model, X_sample) if best_model_name == 'Logistic Regression' else shap.TreeExplainer(best_model)
        shap_values = explainer.shap_values(X_sample)
        shap_importance = np.abs(shap_values).mean(0)
        shap_feature_importance = pd.DataFrame({
            'feature': feature_names,
            'shap_importance': shap_importance
        }).sort_values('shap_importance', ascending=False)

        print("\nTop 10 Features by SHAP Importance:")
        print(shap_feature_importance.head(10))

    except Exception as e:
        print(f"SHAP analysis failed: {e}")

    # Key insights with short explanations for top 3 features
    top_features = feature_importance.head(3)['feature'].tolist()
    print("\nKEY INSIGHTS:")
    for i, feature in enumerate(top_features, 1):
        fname = feature.lower()
        if 'recency' in fname or 'days_since' in fname:
            explanation = "Recent activity or purchase is predictive of buying again."
        elif 'frequency' in fname or 'purchases' in fname:
            explanation = "Frequent purchasers are more likely to buy again."
        elif 'monetary' in fname or 'spent' in fname or 'total' in fname:
            explanation = "Higher spending indicates a valuable customer."
        elif 'visit' in fname:
            explanation = "More visits correlate with higher purchase probability."
        else:
            explanation = "This feature influences purchase behavior."
        print(f"{i}. Feature '{feature}': {explanation}")

    return feature_importance, shap_feature_importance

# Call the function
feature_importance, shap_importance = analyze_feature_importance(
    trained_models, best_model_name, X_test, feature_names
)

# Save feature importance to CSV
feature_importance.to_csv("feature_importance.csv", index=False)
print("Feature importance saved to 'feature_importance.csv'")




5. MODEL INTERPRETABILITY
------------------------------

Top 10 Most Important Features (Logistic Regression):
                            feature  importance
14       avg_days_between_purchases    0.249272
12                   visits_per_day    0.176494
2                    num_visits_30d    0.176494
8                   frequency_score    0.141496
18              is_recent_purchaser    0.110562
1                    avg_cart_value    0.090042
4   days_since_last_purchase_filled    0.068729
3                 num_purchases_90d    0.065333
13               purchase_frequency    0.065333
15             has_purchased_before    0.054067

Top 10 Features by SHAP Importance:
                            feature shap_importance
14       avg_days_between_purchases      209.316401
4   days_since_last_purchase_filled       31.141613
6          days_since_last_activity       10.832151
11                 activity_recency       10.832151
5                 days_since_signup        4.872259
10        