In [3]:
# Import necessary libraries
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
# Load preprocessed data saved from the first notebook.
# The file location can be overridden with the BANK_MARKETING_DATA environment
# variable so the notebook runs on machines other than the original author's;
# when the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "BANK_MARKETING_DATA",
    r"C:\Users\nimak\Documents\Projects\bank-marketing-classification\data\bank_marketing_processed.csv",
)
TARGET_COLUMN = 'y'   # name of the label column in the processed CSV
TEST_SIZE = 0.2       # fraction of rows held out for the test set
RANDOM_STATE = 42     # fixed seed so the split is reproducible

df = pd.read_csv(DATA_PATH)

# Separate features and target
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# Split the data into train and test sets; stratify=y keeps the class
# proportions of the target the same in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (32950, 63), Test shape: (8238, 63)


In [5]:
# Recursive feature elimination driven by a default LDA estimator,
# keeping the 10 highest-ranked features
lda = LinearDiscriminantAnalysis()
rfe = RFE(estimator=lda, n_features_to_select=10)

# Fit the selector on the training split, then project the training data
# onto the retained columns
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)

# Map the boolean support mask back to column names and report them
selected_features = X_train.columns[rfe.get_support()]
print(f"Selected features after RFE: {selected_features}")

Selected features after RFE: Index(['duration', 'emp.var.rate', 'cons.price.idx', 'euribor3m', 'month_apr',
       'month_aug', 'month_jun', 'month_mar', 'month_may', 'poutcome_success'],
      dtype='object')


In [None]:
# Cell 4: Drop highly correlated features and run GridSearchCV on LDA
# (GridSearchCV and LinearDiscriminantAnalysis come from the import cell at the
# top of the notebook, so they are not re-imported here.)
#
# NOTE: this cell reassigns X_train / X_test. Re-running it without first
# re-running the train/test split cell recomputes the correlations on the
# already-reduced frame, so `to_drop` will come back empty on the second run.

# Absolute pairwise correlation above which the later column of a pair is dropped
CORRELATION_THRESHOLD = 0.95

# Step 1: Drop highly correlated features (correlation > CORRELATION_THRESHOLD)
corr_matrix = X_train.corr().abs()  # Get absolute correlations
# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation of 1.0) is ignored; masked cells become NaN
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Identify columns to drop: any column whose correlation with an earlier
# column exceeds the threshold (NaN comparisons evaluate to False)
to_drop = upper.columns[(upper > CORRELATION_THRESHOLD).any(axis=0)].tolist()
print(f"Highly correlated features to drop: {to_drop}")

# Drop from train and test sets (the decision is made on the training data only)
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

# Step 2: Define a safe LDA hyperparameter grid
# Note: shrinkage is supported by the 'lsqr' and 'eigen' solvers but not by the
# default 'svd' solver. n_components may be at most n_classes - 1 (1 for a
# binary target) and only affects transform(), not the predictions.
# The grid holds a single candidate, so GridSearchCV here acts as a 5-fold
# cross-validated fit rather than a real search.
param_grid = {
    'solver': ['lsqr'],             # least-squares solver; compatible with shrinkage
    'shrinkage': ['auto'],          # Enables regularization (automatic Ledoit-Wolf shrinkage)
    'n_components': [1]             # Valid for binary classification (n_classes - 1)
}

# Step 3: Grid search with 5-fold CV
grid_search = GridSearchCV(
    estimator=LinearDiscriminantAnalysis(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    error_score='raise'             # surface fitting errors instead of scoring them as NaN
)

# Step 4: Fit model
grid_search.fit(X_train, y_train)

# Step 5: Output the best parameters
print("Best parameters found by GridSearchCV:")
print(grid_search.best_params_)

In [None]:
# Use the best model from GridSearchCV
best_lda = grid_search.best_estimator_

# Predict on the test set
y_pred = best_lda.predict(X_test)

# Print classification report
print("📊 Classification Report for Best LDA Model:")
print(classification_report(y_test, y_pred))

# Generate and display confusion matrix.
# The label order is pinned to the model's classes_ so the matrix rows/columns
# are guaranteed to line up with the tick labels shown on the plot, instead of
# relying on the order confusion_matrix infers from y_test / y_pred.
class_labels = best_lda.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix: Best LDA Model")
plt.grid(False)  # grid lines would cut through the matrix cells
plt.show()


In [None]:
# Summarise the columns remaining in X_train and the ones removed as highly correlated
features = X_train.columns.tolist()
n_remaining = X_train.shape[1]
n_dropped = len(to_drop)

print(f"Number of features after dropping: {n_remaining}")
print(f"Total features: {len(features)}")
print("Feature names:", features)
print(f"Features dropped due to high correlation: {n_dropped}")
print("Dropped feature names:", to_drop)

In [None]:
# Persist the fitted LDA estimator to disk with joblib
# (relative path: the file is written to the current working directory)
model_filename = "best_lda_model.pkl"
joblib.dump(best_lda, model_filename)

print("✅ Model saved as best_lda_model.pkl")