# Predicting Bank Campaign Success

**Objective:** Build a binary classification model to predict whether a customer will subscribe to a term deposit (`y`) based on demographic, financial, and campaign-related features.

**Dataset:** Bank Marketing Campaign (semicolon-separated CSV)

**Target Variable:** `y` (yes/no) — Term deposit subscription

## 1. Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix,
    classification_report, ConfusionMatrixDisplay
)

# Settings
import warnings

# Silence only deprecation-style categories instead of every warning.
# A blanket 'ignore' filter would also hide warnings that signal real problems
# (for example scikit-learn's ConvergenceWarning or pandas' chained-assignment
# warnings), so those are left visible on purpose.
for _noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# Plot defaults shared by every figure in the notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ All libraries imported successfully!")

## 2. Data Loading & Cleaning

In [None]:
# Read the campaign export; fields in this file are delimited by ';' rather than ','
df = pd.read_csv('bank-additional-full.csv', sep=';')

# Quick orientation: dimensions, column names, then the first few records
dataset_shape = df.shape
column_names = list(df.columns)
print(f"Dataset Shape: {dataset_shape}")
print(f"\nColumns: {column_names}")
df.head()

In [None]:
# Three integrity checks on the raw frame: schema, nulls, and repeated rows
section_break = "\n" + "="*60

# Schema overview as reported by DataFrame.info()
print("Data Info:")
df.info()

# Null entries per column
print(section_break)
print("Missing Values:")
null_counts = df.isna().sum()
print(null_counts)

# Rows that exactly repeat an earlier row
print(section_break)
print("Duplicate Rows:")
duplicate_total = df.duplicated().sum()
print(f"Number of duplicates: {duplicate_total}")

In [None]:
# Drop exact duplicate rows. The explicit .copy() makes df_clean an independent
# frame, so later column assignments on it are unambiguous and do not trigger
# pandas' SettingWithCopyWarning.
df_clean = df.drop_duplicates().copy()
print(f"Shape after removing duplicates: {df_clean.shape}")

# Count the literal 'unknown' placeholder in every text column in one
# vectorised comparison, then report only the columns that contain it.
categorical_cols = df_clean.select_dtypes(include='object').columns
unknown_counts = (df_clean[categorical_cols] == 'unknown').sum()
print("\n'Unknown' values count per column:")
for col, unknown_count in unknown_counts[unknown_counts > 0].items():
    print(f"{col}: {unknown_count}")

In [None]:
# Summary statistics for df_clean via DataFrame.describe() with default arguments.
# Being the last expression of the cell, the resulting table is what the cell displays.
df_clean.describe()

## 3. Exploratory Data Analysis (EDA)

### 3.1 Target Variable Distribution

In [None]:
# Class balance of the target, shown two ways: proportions (pie) and raw counts (bars)
target_counts = df_clean['y'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, count_ax = axes

# Left panel: share of each class
pie_ax.pie(
    target_counts,
    labels=target_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff'],
)
pie_ax.set_title('Target Variable Distribution (y)', fontsize=14, fontweight='bold')

# Right panel: absolute number of rows per class
sns.countplot(data=df_clean, x='y', palette='Set2', ax=count_ax)
count_ax.set_title('Target Variable Count', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Subscribed to Term Deposit')
count_ax.set_ylabel('Count')

# Annotate every bar with its height
for bar_group in count_ax.containers:
    count_ax.bar_label(bar_group)

plt.tight_layout()
plt.show()

# Same information as text: counts first, then relative frequencies
class_shares = df_clean['y'].value_counts(normalize=True)
print("\nClass Distribution:")
print(target_counts)
print(f"\nClass Balance Ratio: {class_shares}")

### 3.2 Categorical Features Distribution

In [None]:
# Horizontal count plots for the main categorical columns, one panel each,
# with the levels ordered from most to least frequent
cat_features = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week']

fig, grid = plt.subplots(3, 2, figsize=(16, 14))
axes = grid.ravel()

for panel, col in zip(axes, cat_features):
    levels_by_frequency = df_clean[col].value_counts().index
    sns.countplot(data=df_clean, y=col, order=levels_by_frequency, palette='viridis', ax=panel)
    panel.set_title(f'Distribution of {col.upper()}', fontsize=12, fontweight='bold')
    panel.set_xlabel('Count')
    panel.set_ylabel(col.capitalize())

plt.tight_layout()
plt.show()

### 3.3 Numerical Features Distribution

In [None]:
# Histogram (left) and kernel density estimate (right) for age and call duration.
# Each row of the 2x2 grid is driven by one spec tuple:
# (column, title stem, x-axis label, histogram bins, histogram colour, KDE colour)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

panel_specs = [
    ('age', 'Age Distribution', 'Age', 30, 'skyblue', 'blue'),
    ('duration', 'Call Duration Distribution', 'Duration (seconds)', 50, 'coral', 'red'),
]

for row, (column, title_stem, x_label, n_bins, hist_color, kde_color) in enumerate(panel_specs):
    hist_ax, kde_ax = axes[row]

    # Binned frequencies
    hist_ax.hist(df_clean[column], bins=n_bins, color=hist_color, edgecolor='black', alpha=0.7)
    hist_ax.set_title(f'{title_stem} (Histogram)', fontweight='bold')
    hist_ax.set_xlabel(x_label)
    hist_ax.set_ylabel('Frequency')

    # Smoothed density of the same column
    df_clean[column].plot(kind='kde', ax=kde_ax, color=kde_color, linewidth=2)
    kde_ax.set_title(f'{title_stem} (KDE)', fontweight='bold')
    kde_ax.set_xlabel(x_label)

plt.tight_layout()
plt.show()

### 3.4 Outlier Detection with Box Plots

In [None]:
# One box plot per numeric column, laid out on a 5x2 grid, to eyeball outliers
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous',
                    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

fig, grid = plt.subplots(5, 2, figsize=(14, 18))
axes = grid.ravel()

for panel, col in zip(axes, numeric_features):
    sns.boxplot(data=df_clean, y=col, color='lightgreen', ax=panel)
    panel.set_title(f'Box Plot: {col.upper()}', fontweight='bold')
    panel.set_ylabel(col)

plt.tight_layout()
plt.show()

### 3.5 Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns and a 0/1 version of the target.
# assign() builds a separate frame, so df_clean itself keeps its yes/no labels here.
df_corr = df_clean.assign(y_encoded=df_clean['y'].map({'yes': 1, 'no': 0}))

# Restrict to numeric dtypes (this now includes y_encoded) before correlating
numeric_cols = df_corr.select_dtypes(include=[np.number]).columns
corr_matrix = df_corr.loc[:, numeric_cols].corr()

# Annotated heatmap of the full matrix
plt.figure(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    square=True,
    cbar_kws={'label': 'Correlation Coefficient'},
)
plt.title('Correlation Heatmap of Numerical Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Column of the matrix belonging to the target, strongest positive correlation first
target_corr = corr_matrix['y_encoded'].sort_values(ascending=False)
print("\nTop correlations with target variable (y):")
print(target_corr)

### 3.6 Pair Plot (Key Features)

In [None]:
# Pairwise scatter/KDE grid for a handful of numeric columns, coloured by the target.
# At most 1000 rows are drawn (fixed seed) to keep the plot responsive.
key_features = ['age', 'duration', 'campaign', 'euribor3m', 'y']
sample_size = min(1000, len(df_clean))
sample_df = df_clean.loc[:, key_features].sample(n=sample_size, random_state=42)

sns.pairplot(
    sample_df,
    hue='y',
    palette='Set1',
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
)
plt.suptitle('Pair Plot: Key Numerical Features', y=1.02, fontsize=16, fontweight='bold')
plt.show()

### 3.7 Subscription Rate by Categorical Features

In [None]:
# For each categorical column, plot the percentage of 'no' vs 'yes' within every level
categorical_features = ['job', 'marital', 'education', 'contact']

fig, grid = plt.subplots(2, 2, figsize=(16, 12))
axes = grid.ravel()

for panel, col in zip(axes, categorical_features):
    # Row-normalised cross-tab: each level's outcomes sum to 100%
    ct = pd.crosstab(df_clean[col], df_clean['y'], normalize='index') * 100
    ct.plot(kind='bar', ax=panel, color=['#ff9999', '#66b3ff'], width=0.8)

    panel.set_title(f'Subscription Rate by {col.upper()}', fontsize=12, fontweight='bold')
    panel.set_xlabel(col.capitalize())
    panel.set_ylabel('Percentage (%)')
    panel.legend(title='Subscribed', labels=['No', 'Yes'])
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Feature Engineering

### 4.1 Encode Target Variable

In [None]:
# Encode target variable: yes=1, no=0
# The dtype guard makes this cell safe to re-run: pushing an already-numeric
# column through the yes/no mapping would turn every value into NaN.
target_mapping = {'yes': 1, 'no': 0}
if not pd.api.types.is_numeric_dtype(df_clean['y']):
    encoded_target = df_clean['y'].map(target_mapping)

    # Series.map yields NaN for labels missing from the mapping; fail loudly
    # instead of carrying silent NaNs into the train/test split.
    unmapped = encoded_target.isna()
    if unmapped.any():
        unexpected_labels = sorted(df_clean.loc[unmapped, 'y'].astype(str).unique())
        raise ValueError(f"Unexpected target labels in 'y': {unexpected_labels}")

    # assign() returns a new frame, so no write goes through a filtered slice
    df_clean = df_clean.assign(y=encoded_target.astype('int64'))

print("Target variable encoded:")
print(df_clean['y'].value_counts())

### 4.2 Separate Features and Target

In [None]:
# Split the cleaned frame into the label vector (y) and the predictor matrix (X)
# NOTE(review): 'duration' stays in X. The dataset's published description
# reportedly warns that call duration is only known after the call and should be
# excluded for a realistic predictive model - confirm before relying on these scores.
y = df_clean['y']
X = df_clean.drop(columns='y')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

### 4.3 Identify Categorical and Numerical Columns

In [None]:
# Partition the predictor names by dtype: object columns vs numeric columns
object_frame = X.select_dtypes(include='object')
numeric_frame = X.select_dtypes(include=[np.number])

categorical_cols = list(object_frame.columns)
numerical_cols = list(numeric_frame.columns)

print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")

### 4.4 Create Preprocessing Pipeline

In [None]:
# Column-wise preprocessing:
#   'num' -> standardise the numeric columns
#   'cat' -> one-hot encode the categorical columns (first level dropped,
#            dense output, categories unseen at fit time are ignored)
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
    categorical_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

for summary_line in (
    "✅ Preprocessing pipeline created!",
    "   - Numerical features: StandardScaler",
    "   - Categorical features: OneHotEncoder (drop_first=True)",
):
    print(summary_line)

### 4.5 Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# Relative class frequencies in each partition
train_balance = y_train.value_counts(normalize=True)
test_balance = y_test.value_counts(normalize=True)
print(f"\nTarget distribution in training set:\n{train_balance}")
print(f"\nTarget distribution in testing set:\n{test_balance}")

## 5. Model Building

### 5.1 Define Models

In [None]:
# Define models to train, keyed by the display name used in reports and plots.
# All estimators share random_state=42 for repeatable results.
# XGBClassifier no longer receives use_label_encoder: that argument is deprecated
# and current XGBoost releases only warn that it is unused. The target is already
# encoded as 0/1, so no label encoding is needed.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss', n_jobs=-1),
}

print("Models to train:")
for model_name in models:
    print(f"  - {model_name}")

### 5.2 Train Models and Store Results

In [None]:
# Fit every candidate inside a preprocessing pipeline, score it on the held-out
# set, and keep both the metrics (results) and the fitted artefacts (trained_models)
results = []
trained_models = {}
banner = '=' * 60

for model_name, estimator in models.items():
    print(f"\n{banner}")
    print(f"Training {model_name}...")
    print(banner)

    # The same preprocessor object is plugged into every pipeline and is
    # therefore re-fitted on X_train for each model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator),
    ])
    pipeline.fit(X_train, y_train)

    # Hard labels for threshold metrics, positive-class probabilities for ROC-AUC
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # One row of the comparison table
    results.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
    })

    # Keep the fitted pipeline and its test-set outputs for the later plots
    trained_models[model_name] = dict(
        pipeline=pipeline,
        y_pred=y_pred,
        y_pred_proba=y_pred_proba,
    )

    # Per-class precision/recall/F1 breakdown
    print(f"\n{model_name} - Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

print("\n" + banner)
print("✅ All models trained successfully!")
print(banner)

### 5.3 Compare Model Performance

In [None]:
# Tabulate the collected metrics, best ROC-AUC first, with a fresh 0..n-1 index
results_df = pd.DataFrame(results).sort_values(
    by='ROC-AUC', ascending=False, ignore_index=True
)

table_rule = "="*80
print("\n📊 Model Performance Comparison:")
print(table_rule)
print(results_df.to_string(index=False))
print(table_rule)

In [None]:
# One horizontal bar chart per metric, models ordered so the best sits on top
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']

fig, grid = plt.subplots(2, 3, figsize=(18, 10))
axes = grid.ravel()

for ax, metric in zip(axes, metrics):
    results_df_sorted = results_df.sort_values(metric, ascending=True)
    bars = ax.barh(results_df_sorted['Model'], results_df_sorted[metric], color='steelblue')
    ax.set_xlabel(metric, fontweight='bold')
    ax.set_title(f'{metric} Comparison', fontweight='bold', fontsize=12)
    ax.set_xlim([0, 1])

    # Write each score at the tip of its bar, vertically centred
    for bar in bars:
        score = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height()/2
        ax.text(score, bar_middle, f'{score:.3f}', ha='left', va='center', fontsize=9)

# The 2x3 grid has six slots but there are only five metrics; drop the spare one
fig.delaxes(axes[5])

plt.tight_layout()
plt.show()

## 6. Model Evaluation Visualizations

### 6.1 Confusion Matrices

In [None]:
# Test-set confusion matrix for each trained model, one panel per model on a 2x2 grid
fig, grid = plt.subplots(2, 2, figsize=(14, 12))
axes = grid.ravel()
class_labels = ['No', 'Yes']

for idx, (model_name, model_data) in enumerate(trained_models.items()):
    cm = confusion_matrix(y_test, model_data['y_pred'])
    panel = axes[idx]

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot(ax=panel, cmap='Blues', values_format='d')

    panel.set_title(f'{model_name}\nConfusion Matrix', fontweight='bold', fontsize=12)
    # Turn the grid lines off for this panel
    panel.grid(False)

plt.tight_layout()
plt.show()

### 6.2 ROC Curves

In [None]:
# Overlay the test-set ROC curve of every trained model on a single figure
plt.figure(figsize=(10, 8))
roc_ax = plt.gca()

colors = ['blue', 'green', 'red', 'purple']

for idx, (model_name, model_data) in enumerate(trained_models.items()):
    positive_scores = model_data['y_pred_proba']
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = roc_auc_score(y_test, positive_scores)

    curve_label = f'{model_name} (AUC = {roc_auc:.3f})'
    roc_ax.plot(fpr, tpr, color=colors[idx], lw=2, label=curve_label)

# Chance-level reference: the diagonal a random classifier would follow
roc_ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Classifier')

# Axes cosmetics
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
roc_ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
roc_ax.set_title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

### 6.3 Feature Importance (Random Forest & XGBoost)

In [None]:
# Get feature names after preprocessing
def get_feature_names(preprocessor, X):
    """Return the output column names of a two-step column transformer.

    The first entry of ``preprocessor.transformers_`` is taken to be the
    numeric step, whose column names are passed through unchanged. The second
    entry is taken to be the categorical step, whose transformer is asked for
    its expanded names via ``get_feature_names_out``.

    Args:
        preprocessor: Object exposing ``transformers_`` as a sequence of
            ``(name, transformer, columns)`` entries (presumably a fitted
            ColumnTransformer - ``transformers_`` must already exist).
        X: Not used by the function; kept in the signature for callers.

    Returns:
        list: Numeric column names followed by the encoded categorical names.
    """
    numeric_entry = preprocessor.transformers_[0]
    categorical_entry = preprocessor.transformers_[1]

    # Numeric names pass straight through
    numeric_names = numeric_entry[2]

    # The categorical transformer expands each input column into its encoded names
    encoder = categorical_entry[1]
    encoded_names = encoder.get_feature_names_out(categorical_entry[2])

    return [*numeric_names, *encoded_names]

# Get feature names from the preprocessor that lives inside a trained pipeline.
# Reading the fitted step through named_steps does not depend on the
# module-level `preprocessor` object having been fitted in place, which stops
# being true if a pipeline is ever given a cache or a cloned transformer.
# Every pipeline was fitted on the same X_train, so any of them yields the same names.
fitted_preprocessor = trained_models['Random Forest']['pipeline'].named_steps['preprocessor']
feature_names = get_feature_names(fitted_preprocessor, X_train)
print(f"Total features after encoding: {len(feature_names)}")

In [None]:
# Rank the encoded features by the Random Forest's importances and keep the top 15
rf_pipeline = trained_models['Random Forest']['pipeline']
rf_model = rf_pipeline.named_steps['classifier']

rf_scores = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_model.feature_importances_,
})
rf_importance = rf_scores.sort_values('Importance', ascending=False).head(15)

# Horizontal bars, most important feature at the top
plt.figure(figsize=(10, 6))
sns.barplot(data=rf_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Top 15 Feature Importance - Random Forest', fontsize=14, fontweight='bold')
plt.xlabel('Importance Score', fontweight='bold')
plt.ylabel('Feature', fontweight='bold')
plt.tight_layout()
plt.show()

# Same ranking as a plain-text table
print("\n📊 Top 15 Important Features (Random Forest):")
print(rf_importance.to_string(index=False))

In [None]:
# Rank the encoded features by the XGBoost model's importances and keep the top 15
xgb_pipeline = trained_models['XGBoost']['pipeline']
xgb_model = xgb_pipeline.named_steps['classifier']

xgb_scores = pd.DataFrame({
    'Feature': feature_names,
    'Importance': xgb_model.feature_importances_,
})
xgb_importance = xgb_scores.sort_values('Importance', ascending=False).head(15)

# Horizontal bars, most important feature at the top
plt.figure(figsize=(10, 6))
sns.barplot(data=xgb_importance, x='Importance', y='Feature', palette='plasma')
plt.title('Top 15 Feature Importance - XGBoost', fontsize=14, fontweight='bold')
plt.xlabel('Importance Score', fontweight='bold')
plt.ylabel('Feature', fontweight='bold')
plt.tight_layout()
plt.show()

# Same ranking as a plain-text table
print("\n📊 Top 15 Important Features (XGBoost):")
print(xgb_importance.to_string(index=False))

## 7. Insights & Conclusions

### 7.1 Key Insights from Analysis

In [None]:
# Text report: dataset facts, the top-ranked model, leading features, and
# the (hard-coded) business and deployment recommendations
report_rule = "="*80
print(report_rule)
print("🔍 KEY INSIGHTS FROM BANK MARKETING CAMPAIGN ANALYSIS")
print(report_rule)

# Class totals are computed once and reused for the imbalance and rate lines
no_total = (y==0).sum()
yes_total = (y==1).sum()
print("\n1️⃣ DATA OVERVIEW:")
print(f"   - Total Records: {len(df_clean):,}")
print(f"   - Features: {X.shape[1]}")
print(f"   - Class Imbalance: {no_total:,} No vs {yes_total:,} Yes")
print(f"   - Subscription Rate: {yes_total/len(y)*100:.2f}%")

# results_df is sorted by ROC-AUC descending, so its first row is the winner
print("\n2️⃣ BEST PERFORMING MODEL:")
best_model = results_df.iloc[0]
print(f"   - Model: {best_model['Model']}")
for metric_name in ('ROC-AUC', 'Accuracy', 'F1-Score'):
    print(f"   - {metric_name}: {best_model[metric_name]:.4f}")

print("\n3️⃣ TOP PREDICTIVE FEATURES (based on Random Forest):")
top_features = rf_importance['Feature'].head(5).tolist()
for i, feat in enumerate(top_features, 1):
    print(f"   {i}. {feat}")

print("\n4️⃣ BUSINESS RECOMMENDATIONS:")
for recommendation in (
    "   ✅ Focus on call duration - longer conversations increase subscription likelihood",
    "   ✅ Target customers based on economic indicators (euribor3m, emp.var.rate)",
    "   ✅ Prioritize cellular contact method over telephone",
    "   ✅ Optimize campaign timing - certain months show higher success rates",
    "   ✅ Consider customer demographics (age, job, education) for targeted campaigns",
):
    print(recommendation)

print("\n5️⃣ MODEL DEPLOYMENT STRATEGY:")
print(f"   - Deploy {best_model['Model']} for production predictions")
for strategy_line in (
    "   - Use probability scores to prioritize high-likelihood customers",
    "   - Implement A/B testing to validate model performance in real campaigns",
    "   - Regular model retraining with new campaign data",
):
    print(strategy_line)

print("\n" + report_rule)

### 7.2 Marketing Strategy Recommendations

In [None]:
# Analyze high-likelihood customer segments
print("📈 CUSTOMER SEGMENTATION FOR TARGETED MARKETING")
print("="*80)

# results_df is sorted by ROC-AUC descending, so row 0 names the best model
best_model_name = results_df.iloc[0]['Model']
best_pipeline = trained_models[best_model_name]['pipeline']

# Score every row of the dataset with the best pipeline.
# NOTE(review): X contains the rows the pipeline was trained on, so these
# probabilities are optimistic for that portion of the data.
all_probas = best_pipeline.predict_proba(X)[:, 1]
df_with_scores = df_clean.copy()
df_with_scores['subscription_probability'] = all_probas

# Segment customers by probability.
# include_lowest=True closes the first interval on the left; without it a
# probability of exactly 0.0 (which tree-based models can output) falls outside
# every bin and the customer silently gets a NaN segment.
df_with_scores['segment'] = pd.cut(
    df_with_scores['subscription_probability'],
    bins=[0, 0.3, 0.6, 1.0],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True,
)

print("\nCustomer Segments by Subscription Probability:")
print(df_with_scores['segment'].value_counts().sort_index())

print("\n💡 RECOMMENDED ACTIONS BY SEGMENT:")
print("\n🔴 HIGH PROBABILITY (>60%):")
high_prob = df_with_scores[df_with_scores['segment'] == 'High']
print(f"   - Count: {len(high_prob):,} customers")
print("   - Action: Priority outreach with personalized offers")
print("   - Expected conversion rate: High (>60%)")

print("\n🟡 MEDIUM PROBABILITY (30-60%):")
med_prob = df_with_scores[df_with_scores['segment'] == 'Medium']
print(f"   - Count: {len(med_prob):,} customers")
print("   - Action: Targeted campaigns with incentives")
print("   - Expected conversion rate: Moderate (30-60%)")

print("\n🟢 LOW PROBABILITY (<30%):")
low_prob = df_with_scores[df_with_scores['segment'] == 'Low']
print(f"   - Count: {len(low_prob):,} customers")
print("   - Action: Minimal contact, focus on brand awareness")
print("   - Expected conversion rate: Low (<30%)")

print("\n" + "="*80)

### 7.3 Final Summary

In [None]:
# Closing banner: fixed lists of deliverables and suggested follow-up work
closing_rule = "="*80

print("\n" + closing_rule)
print("✅ PROJECT COMPLETED SUCCESSFULLY!")
print(closing_rule)

print("\n📋 DELIVERABLES:")
for deliverable in (
    "   ✓ Comprehensive EDA with 10+ visualizations",
    "   ✓ Feature engineering and preprocessing pipeline",
    "   ✓ 4 trained classification models",
    "   ✓ Model evaluation with confusion matrices and ROC curves",
    "   ✓ Feature importance analysis",
    "   ✓ Business insights and marketing recommendations",
    "   ✓ Customer segmentation strategy",
):
    print(deliverable)

print("\n🎯 NEXT STEPS:")
for next_step in (
    "   1. Deploy best model to production environment",
    "   2. Implement real-time scoring API",
    "   3. Set up monitoring and model drift detection",
    "   4. A/B test model predictions vs. current strategy",
    "   5. Collect feedback and retrain model quarterly",
):
    print(next_step)

print("\n" + closing_rule)
print("Thank you for using this ML pipeline! 🚀")
print(closing_rule)