# Logistic Regression - Bank Marketing Campaign Prediction

**Objective:** Build a Logistic Regression model that predicts whether a bank client will subscribe to a term deposit (the binary `y` column: `yes` / `no`).

**Algorithm:** Logistic Regression - A linear model for binary classification that estimates probabilities using the logistic function.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix,
    classification_report, ConfusionMatrixDisplay
)

import warnings
# Notebook-wide display configuration.
# Every warning category is silenced so library notices do not clutter cell output.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures below: grid theme and default canvas size.
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_style(style='whitegrid')

print("✅ Libraries imported successfully!")

## 2. Load and Explore Data

In [None]:
# Read the raw campaign data; the file is semicolon-delimited rather than comma-delimited.
DATA_FILE = '../bank-additional-full.csv'
df = pd.read_csv(DATA_FILE, sep=';')

# Quick orientation: dimensions, column names, then the first rows as the cell output.
column_names = df.columns.tolist()
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {column_names}")
df.head(5)

In [None]:
# Data info, missing values and duplicate count for the raw frame
print("Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

section_break = "\n" + "=" * 60

print(section_break)
print("Missing Values:")
# Per-column count of NaN cells
print(df.isnull().sum())

print(section_break)
# Number of rows that are exact repeats of an earlier row
print("Duplicates:", df.duplicated().sum())

In [None]:
# De-duplicate: keep the first occurrence of every fully identical row
df_clean = df.drop_duplicates(keep='first')
print(f"Shape after removing duplicates: {df_clean.shape}")

# Descriptive statistics of the de-duplicated frame (rendered as the cell output)
df_clean.describe()

## 3. Target Variable Analysis

In [None]:
# Target distribution: class shares (pie) next to absolute class counts (bars)
target_counts = df_clean['y'].value_counts()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
pie_ax, bar_ax = axes

# Left panel: percentage of each class
pie_ax.pie(
    target_counts,
    labels=target_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff'],
)
pie_ax.set_title('Target Distribution', fontsize=14, fontweight='bold')

# Right panel: row count of each class, with the count written on top of every bar
sns.countplot(data=df_clean, x='y', palette='Set2', ax=bar_ax)
bar_ax.set_title('Target Count', fontsize=14, fontweight='bold')
for bar_group in bar_ax.containers:
    bar_ax.bar_label(bar_group)

plt.tight_layout()
plt.show()

# Same information in text form: absolute counts, then relative frequencies
print(f"Class Distribution:\n{target_counts}")
print(f"\nClass Ratio:\n{df_clean['y'].value_counts(normalize=True)}")

## 4. Key Feature Visualizations

In [None]:
# Histograms of the two numeric features inspected here: age and call duration.
# Each spec is (column, bin count, fill colour, panel title, x-axis label).
histogram_specs = [
    ('age', 30, 'skyblue', 'Age Distribution', 'Age'),
    ('duration', 50, 'coral', 'Call Duration Distribution', 'Duration (seconds)'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for panel, (column, n_bins, fill, title, x_label) in zip(axes, histogram_specs):
    panel.hist(df_clean[column], bins=n_bins, color=fill, edgecolor='black', alpha=0.7)
    panel.set_title(title, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Subscription rate by key features: one panel per categorical column, showing
# the row-normalised share (in %) of each target class within every category level.
categorical_features = ['job', 'marital', 'education', 'contact']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()  # flatten the 2x2 grid so panels pair up with features

for panel, col in zip(axes, categorical_features):
    # normalize='index' makes each row sum to 1; scale to percentages
    share_by_level = pd.crosstab(df_clean[col], df_clean['y'], normalize='index').mul(100)
    share_by_level.plot(kind='bar', ax=panel, color=['#ff9999', '#66b3ff'])

    panel.set_title(f'Subscription Rate by {col.upper()}', fontsize=11, fontweight='bold')
    panel.set_xlabel(col.capitalize())
    panel.set_ylabel('Percentage (%)')
    panel.legend(title='Subscribed', labels=['No', 'Yes'])
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the numeric columns plus a 0/1 encoding of the target.
# Work on a copy so the helper column never leaks into df_clean.
df_corr = df_clean.copy()
df_corr['y_encoded'] = df_corr['y'].map({'yes': 1, 'no': 0})

numeric_cols = df_corr.select_dtypes(include=[np.number]).columns
corr_matrix = df_corr[numeric_cols].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            cbar_kws={'label': 'Correlation'}, linewidths=0.5)
plt.title('Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nTop correlations with target:")
# Drop the target's own entry: its self-correlation is trivially 1.0 and would
# otherwise always head the list without carrying any information.
target_corr = corr_matrix['y_encoded'].drop('y_encoded')
print(target_corr.sort_values(ascending=False))

## 5. Data Preprocessing

In [None]:
# Encode the target as 0/1 WITHOUT modifying df_clean.
# Overwriting df_clean['y'] in place would (a) make this cell non-re-runnable,
# because mapping the already-encoded 0/1 values a second time yields NaN, and
# (b) break the earlier EDA cells, which map/plot the original 'yes'/'no' labels.
target_mapping = {'yes': 1, 'no': 0}
y = df_clean['y'].map(target_mapping)

# Fail loudly on labels outside the mapping instead of handing NaN to the model
unmapped = y.isna()
if unmapped.any():
    unexpected = sorted(df_clean.loc[unmapped, 'y'].astype(str).unique())
    raise ValueError(f"Unexpected target labels in 'y': {unexpected}")

# Separate features and target
# NOTE(review): the dataset's documentation reportedly advises excluding
# `duration` from realistic predictive models, since call length is only known
# after the call has happened — confirm, and consider dropping it here.
X = df_clean.drop(columns='y')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Split the feature names by dtype: numeric columns get scaled later,
# object-typed columns get one-hot encoded.
numeric_frame = X.select_dtypes(include=[np.number])
object_frame = X.select_dtypes(include='object')

numerical_cols = numeric_frame.columns.tolist()
categorical_cols = object_frame.columns.tolist()

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")

In [None]:
# Preprocessing: each (name, transformer, columns) triple routes one group of
# columns through its own transformer inside a single ColumnTransformer.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
    categorical_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

print("✅ Preprocessing pipeline created")
print("   - Numerical: StandardScaler")
print("   - Categorical: OneHotEncoder")

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both partitions; the fixed seed makes the split reproducible.
partitions = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = partitions

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

# Relative class frequencies per partition, to confirm the stratification
print(f"\nTrain target distribution:\n{y_train.value_counts(normalize=True)}")
print(f"\nTest target distribution:\n{y_test.value_counts(normalize=True)}")

## 6. Model Training - Logistic Regression

In [None]:
# Assemble preprocessing + classifier into one pipeline and fit it on the
# training partition, so the preprocessing is fitted on training rows only.
print("🤖 Training Logistic Regression...")

classifier = LogisticRegression(max_iter=1000, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

model.fit(X_train, y_train)

print("✅ Model trained successfully!")

## 7. Model Evaluation

In [None]:
# Score the held-out test set: class-1 probabilities and hard class labels
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Label-based metrics use the hard predictions ...
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ... while ROC-AUC is computed from the predicted probabilities
roc_auc = roc_auc_score(y_test, y_pred_proba)

banner = "=" * 60
print(banner)
print("📊 LOGISTIC REGRESSION PERFORMANCE")
print(banner)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")
print(banner)

In [None]:
# Per-class precision / recall / F1 text report; classes 0 and 1 are shown as No / Yes
report_text = classification_report(y_test, y_pred, target_names=['No', 'Yes'])
print("\n📋 Classification Report:")
print(report_text)

In [None]:
# Metrics Visualization: one bar per test-set metric, annotated with its value
metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
metrics_values = [accuracy, precision, recall, f1, roc_auc]

plt.figure(figsize=(10, 6))
bars = plt.bar(metrics_names, metrics_values, color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6'], alpha=0.8)
plt.ylim(0, 1.0)
plt.ylabel('Score', fontsize=12, fontweight='bold')
plt.title('Logistic Regression - Performance Metrics', fontsize=14, fontweight='bold')
# NOTE(review): 0.5 is the chance level for ROC-AUC; for the other metrics on an
# imbalanced target the meaningful baseline is likely different — confirm intent.
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Baseline (0.5)')

# Add value labels on bars. Only the bar and its value are needed, so the
# previously unused enumerate() index has been dropped.
for bar, value in zip(bars, metrics_values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
             f'{value:.4f}', ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Metrics summary table: one (metric, score, plain-language meaning) record per row
metric_records = [
    ('Accuracy', accuracy, 'Overall correctness of predictions'),
    ('Precision', precision, 'Accuracy of positive predictions'),
    ('Recall', recall, 'Coverage of actual positive cases'),
    ('F1-Score', f1, 'Harmonic mean of Precision & Recall'),
    ('ROC-AUC', roc_auc, 'Model discrimination ability'),
]
metrics_df = pd.DataFrame(metric_records, columns=['Metric', 'Score', 'Interpretation'])

wide_rule = "=" * 80
print("\n📊 DETAILED METRICS SUMMARY")
print(wide_rule)
print(metrics_df.to_string(index=False))
print(wide_rule)

In [None]:
# 5-fold cross-validated ROC-AUC on the training partition only; the whole
# pipeline is passed in, so preprocessing is re-fitted inside every fold.
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='roc_auc', cv=5)

cv_mean = cv_scores.mean()
cv_spread = cv_scores.std()
print(f"\n🔄 Cross-Validation ROC-AUC Scores: {cv_scores}")
print(f"Mean CV ROC-AUC: {cv_mean:.4f} (+/- {cv_spread:.4f})")

## 8. Visualizations

In [None]:
# Confusion Matrix on the held-out test set (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, y_pred)

# ConfusionMatrixDisplay.plot() creates its own figure when no Axes is passed,
# so a preceding plt.figure(...) would be left behind as an empty figure.
# Create the Axes explicitly and draw onto it instead.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No', 'Yes'])
disp.plot(cmap='Blues', values_format='d', ax=ax)
# Address the matrix Axes directly rather than relying on the "current axes"
ax.set_title('Logistic Regression - Confusion Matrix', fontsize=14, fontweight='bold')
ax.grid(False)
plt.tight_layout()
plt.show()

# With labels ordered [0, 1] the 2x2 matrix flattens to TN, FP, FN, TP
tn, fp, fn, tp = cm.ravel()
print(f"\nConfusion Matrix:")
print(f"True Negatives:  {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives:  {tp}")

In [None]:
# ROC curve of the test-set probabilities, with the diagonal of a random
# classifier drawn for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'Logistic Regression (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Classifier')

# Small headroom above 1.0 so the curve's top edge is not drawn on the frame
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax.set_title('ROC Curve - Logistic Regression', fontsize=14, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature Coefficients of the fitted logistic regression
fitted_preprocessor = model.named_steps['preprocessor']
# Look the encoder up by its step name instead of by position, so the lookup
# keeps working if transformers are added or reordered.
cat_transformer = fitted_preprocessor.named_transformers_['cat']
cat_feature_names = cat_transformer.get_feature_names_out(categorical_cols)
# Same order as the ColumnTransformer output: numeric block first, then one-hot
all_features = numerical_cols + list(cat_feature_names)

coefficients = model.named_steps['classifier'].coef_[0]

# Guard against silently mislabelled coefficients
if len(all_features) != len(coefficients):
    raise ValueError(
        f"Feature name count ({len(all_features)}) does not match "
        f"coefficient count ({len(coefficients)})"
    )

# Every coefficient, ranked by absolute magnitude (largest first)
ranked_coef_df = pd.DataFrame({
    'Feature': all_features,
    'Coefficient': coefficients
}).sort_values('Coefficient', key=abs, ascending=False)

# The 20 largest-magnitude coefficients; used for the plot and by later cells
coef_df = ranked_coef_df.head(20)

plt.figure(figsize=(10, 8))
colors = ['red' if x < 0 else 'green' for x in coef_df['Coefficient']]
plt.barh(range(len(coef_df)), coef_df['Coefficient'], color=colors, alpha=0.7)
plt.yticks(range(len(coef_df)), coef_df['Feature'])
plt.xlabel('Coefficient Value', fontsize=12, fontweight='bold')
plt.title('Top 20 Feature Coefficients', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Take the per-sign top 10 from the FULL ranking: filtering the 20-row table
# instead would return fewer than 10 rows whenever one sign dominates it.
print("\nTop 10 Positive Coefficients (Increase subscription probability):")
print(ranked_coef_df[ranked_coef_df['Coefficient'] > 0].head(10))
print("\nTop 10 Negative Coefficients (Decrease subscription probability):")
print(ranked_coef_df[ranked_coef_df['Coefficient'] < 0].head(10))

## 9. Prediction Example

In [None]:
# One hypothetical customer, described as a single record whose keys match the
# feature columns used in this notebook.
customer_profile = {
    'age': 35,
    'job': 'admin.',
    'marital': 'married',
    'education': 'university.degree',
    'default': 'no',
    'housing': 'yes',
    'loan': 'no',
    'contact': 'cellular',
    'month': 'may',
    'day_of_week': 'mon',
    'duration': 300,
    'campaign': 2,
    'pdays': 999,
    'previous': 0,
    'poutcome': 'nonexistent',
    'emp.var.rate': 1.1,
    'cons.price.idx': 93.994,
    'cons.conf.idx': -36.4,
    'euribor3m': 4.857,
    'nr.employed': 5191.0,
}
# A one-element list of records yields a single-row DataFrame
sample_customer = pd.DataFrame([customer_profile])

profile_rule = "=" * 60
print("📋 Sample Customer Profile:")
print(profile_rule)
# Print the single row as "column : value" lines, column names padded to 20 chars
for col, val in sample_customer.iloc[0].items():
    print(f"{col:20s}: {val}")
print(profile_rule)

In [None]:
# Score the sample customer: hard class label plus the [P(no), P(yes)] pair
prediction = model.predict(sample_customer)[0]
probability = model.predict_proba(sample_customer)[0]
prob_no, prob_yes = probability

verdict = '✅ YES - Will Subscribe' if prediction == 1 else '❌ NO - Will Not Subscribe'
result_rule = "=" * 60

print("\n🔮 PREDICTION RESULTS")
print(result_rule)
print(f"Prediction: {verdict}")
print(f"\nProbabilities:")
print(f"  No (0):  {prob_no:.4f} ({prob_no*100:.2f}%)")
print(f"  Yes (1): {prob_yes:.4f} ({prob_yes*100:.2f}%)")
print(result_rule)

# Bar chart of the two class probabilities with a reference line at 0.5
colors = ['#ff9999', '#66b3ff']
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(['No', 'Yes'], probability, color=colors, alpha=0.7, edgecolor='black')
ax.set_title('Subscription Probability for Sample Customer', fontweight='bold', fontsize=14)
ax.set_ylabel('Probability', fontweight='bold', fontsize=12)
ax.set_ylim([0, 1])
ax.axhline(y=0.5, color='red', linestyle='--', linewidth=1, label='Decision Threshold')

# Write each probability (raw and as a percentage) just above its bar
for rect in bars:
    top = rect.get_height()
    centre_x = rect.get_x() + rect.get_width()/2.
    ax.text(centre_x, top,
            f'{top:.4f}\n({top*100:.2f}%)',
            ha='center', va='bottom', fontweight='bold')

ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Batch scoring demo: three hypothetical customers, one value per customer in
# each column list.
# NOTE(review): the second customer pairs pdays=999 with previous=1 and
# poutcome='success'; if 999 is a "never contacted" sentinel these values look
# mutually inconsistent — confirm against the dataset description.
batch_customers = pd.DataFrame({
    'age': [25, 45, 60],
    'job': ['student', 'admin.', 'retired'],
    'marital': ['single', 'married', 'married'],
    'education': ['university.degree', 'high.school', 'basic.4y'],
    'default': ['no', 'no', 'no'],
    'housing': ['no', 'yes', 'yes'],
    'loan': ['no', 'no', 'no'],
    'contact': ['cellular', 'cellular', 'telephone'],
    'month': ['may', 'jul', 'aug'],
    'day_of_week': ['thu', 'tue', 'mon'],
    'duration': [180, 350, 120],
    'campaign': [1, 2, 3],
    'pdays': [999, 999, 999],
    'previous': [0, 1, 0],
    'poutcome': ['nonexistent', 'success', 'nonexistent'],
    'emp.var.rate': [1.1, 1.4, -0.1],
    'cons.price.idx': [93.994, 94.465, 93.200],
    'cons.conf.idx': [-36.4, -41.8, -42.0],
    'euribor3m': [4.857, 4.959, 1.313],
    'nr.employed': [5191.0, 5228.1, 5099.1]
})

# Hard labels and class-1 probabilities for all rows at once
batch_predictions = model.predict(batch_customers)
batch_probabilities = model.predict_proba(batch_customers)[:, 1]

# Compact report: a few identifying columns plus the model output
summary_columns = ['age', 'job', 'education', 'duration']
results_df = batch_customers[summary_columns].assign(
    Prediction=['Yes' if label == 1 else 'No' for label in batch_predictions],
    Probability_Yes=batch_probabilities,
)

report_rule = "=" * 80
print("\n📊 BATCH PREDICTION RESULTS (3 Customers)")
print(report_rule)
print(results_df.to_string(index=False))
print(report_rule)

## 10. Key Insights & Recommendations

In [None]:
# Text summary of the results computed in the earlier cells
# (reads roc_auc, accuracy, f1 and coef_df).
print("="*80)
print("🔍 LOGISTIC REGRESSION - KEY INSIGHTS")
print("="*80)

# Qualitative reading of ROC-AUC. ROC-AUC measures how well the model ranks
# positives above negatives (discrimination), not probability calibration, and
# higher is always better - so there must be no upper bound that downgrades a
# strong score to "needs tuning". The bands below are rule-of-thumb cut-offs.
if roc_auc >= 0.9:
    auc_verdict = 'excellent'
elif roc_auc >= 0.8:
    auc_verdict = 'good'
elif roc_auc >= 0.7:
    auc_verdict = 'acceptable'
else:
    auc_verdict = 'weak - needs tuning'

print("\n1️⃣ MODEL PERFORMANCE:")
print(f"   - ROC-AUC Score: {roc_auc:.4f}")
print(f"   - Accuracy: {accuracy:.4f}")
print(f"   - F1-Score: {f1:.4f}")
print(f"   - Discrimination (ROC-AUC) is {auc_verdict}")

# coef_df is already sorted by absolute magnitude, so head(5) after filtering
# by sign gives the strongest features of that sign among its rows.
print("\n2️⃣ TOP POSITIVE FEATURES (Increase subscription):")
top_positive = coef_df[coef_df['Coefficient'] > 0].head(5)
for row in top_positive.itertuples(index=False):
    print(f"   ✅ {row.Feature}: {row.Coefficient:.4f}")

print("\n3️⃣ TOP NEGATIVE FEATURES (Decrease subscription):")
top_negative = coef_df[coef_df['Coefficient'] < 0].head(5)
for row in top_negative.itertuples(index=False):
    print(f"   ❌ {row.Feature}: {row.Coefficient:.4f}")

# NOTE(review): the recommendations below are fixed text, not derived from the
# fitted coefficients - re-check them against the lists printed above.
print("\n4️⃣ BUSINESS RECOMMENDATIONS:")
print("   📞 Focus on longer call durations - strong positive predictor")
print("   📱 Prioritize cellular contact over telephone")
print("   📊 Target customers based on economic indicators")
print("   🎯 Use probability scores to prioritize high-likelihood prospects")

print("\n5️⃣ MODEL STRENGTHS:")
print("   ✅ Interpretable - coefficients show feature impact")
print("   ✅ Fast training and prediction")
print("   ✅ Provides probability estimates")
print("   ✅ Good baseline for comparison with complex models")

print("\n" + "="*80)