In [None]:
# Import necessary libraries
import pickle as pkl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Model imports
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
# from lightgbm import LGBMClassifier

# Pipeline and preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Evaluation and cross-validation
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix, make_scorer

print("All libraries imported successfully!")

All libraries imported successfully!


## Load Preprocessed Data
Load the cleaned and preprocessed data from the main notebook.

In [None]:
# Load the preprocessed data that was saved from project.ipynb
# Note: You'll need to save X_train_pca, X_test_pca, y_train, y_test from project.ipynb first

# For now, we'll load from the original dataset and reprocess
import kagglehub

dataset_path = kagglehub.dataset_download("lakshmi25npathi/santander-customer-transaction-prediction-dataset")
df = pd.read_csv(os.path.join(dataset_path, "train.csv"))

print(f"Dataset shape: {df.shape}")
print(f"Target distribution:\n{df['target'].value_counts()}")

  from .autonotebook import tqdm as notebook_tqdm


Dataset shape: (200000, 202)
Target distribution:
target
0    179902
1     20098
Name: count, dtype: int64


In [None]:
# Data preprocessing (same as in project.ipynb)
from sklearn.model_selection import train_test_split

# Remove ID_code
df = df.drop(columns=['ID_code'])

# Remove outliers using IQR method
outlier_indices = set()

for col in df.columns.drop('target'):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
    outlier_indices.update(df[outlier_mask].index.tolist())

df_cleaned = df.drop(index=outlier_indices)
print(f"After removing outliers: {df_cleaned.shape}")

# Split into X and y
X = df_cleaned.drop(columns=['target'])
y = df_cleaned['target']

# Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print("Data preprocessing complete!")

After removing outliers: (175104, 201)
Training set shape: (131328, 200)
Test set shape: (43776, 200)
Data preprocessing complete!
Training set shape: (131328, 200)
Test set shape: (43776, 200)
Data preprocessing complete!


## Define Models
Create a dictionary of models with their hyperparameters for cross-validation.

## Create Pipelines
Create pipelines with StandardScaler, PCA, and different classifiers to prevent data leakage and ensure proper preprocessing.

### Precompute vs Pipeline — which to use?

- **When using cross-validation or GridSearchCV:** Always keep `StandardScaler` and `PCA` inside a `Pipeline`. This ensures the scaler and PCA are fitted only on each training fold and prevents data leakage from validation folds.

- **When you have chosen a final model and want speed for repeated training/evaluation:** You may *fit the scaler and PCA once on the training set* and reuse the transformed arrays for training and testing. This is safe only if the scaler/PCA were fit strictly on the training set (not on the whole dataset) and you are no longer doing CV that would require re-fitting preprocessing inside folds.

Below is example code demonstrating both patterns and showing how to safely precompute transforms for final training.

In [None]:
# Example: Precompute scaler + PCA once on training set (for final training/evaluation)
# WARNING: Do NOT use these precomputed arrays for cross-validation, because that would leak
# information from validation folds if PCA was fit on the whole training set.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit scaler and PCA on training data only
single_scaler = StandardScaler()
single_pca = PCA(n_components=0.95)

X_train_scaled_once = single_scaler.fit_transform(X_train)
X_train_pca_once = single_pca.fit_transform(X_train_scaled_once)

X_test_scaled_once = single_scaler.transform(X_test)
X_test_pca_once = single_pca.transform(X_test_scaled_once)

print(f"Precomputed PCA shape: {X_train_pca_once.shape}")

# Train a model once using precomputed arrays (fast)
LG = LogisticRegression(random_state=42, n_jobs=-1)
LG.fit(X_train_pca_once, y_train)

y_pred = LG.predict(X_test_pca_once)
print("\nClassification report (using precomputed scaler+PCA):")
print(classification_report(y_test, y_pred))

# If you will run CV/hyperparameter tuning: use pipelines (prevents leakage)
print('\nReminder: For cross-validation/GridSearch use the pipelines defined earlier so scaler and PCA are fitted inside each fold.')

Precomputed PCA shape: (131328, 190)

Classification report (using precomputed scaler+PCA):
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     39500
           1       0.66      0.25      0.36      4276

    accuracy                           0.91     43776
   macro avg       0.79      0.62      0.66     43776
weighted avg       0.90      0.91      0.90     43776


Reminder: For cross-validation/GridSearch use the pipelines defined earlier so scaler and PCA are fitted inside each fold.

Classification report (using precomputed scaler+PCA):
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     39500
           1       0.66      0.25      0.36      4276

    accuracy                           0.91     43776
   macro avg       0.79      0.62      0.66     43776
weighted avg       0.90      0.91      0.90     43776


Reminder: For cross-validation/GridSearch use the pipelines defined earlie

### Use copies + hold-out test set (safe workflow)

- Make deep copies of your dataframes/arrays before doing experimental transforms so the original data remains untouched.
- Use a single held-out test set and only evaluate on it once at the very end. Do not use the test set for hyperparameter tuning or model selection.
- Split the training set into **train** and **validation** for hyperparameter tuning (or run inner CV on the train split). After selecting hyperparameters on the validation set, retrain the final model on train+validation, then evaluate once on the held-out test set.

Below is example code that:
1. Creates safe copies, 2. Splits training into train/validation, 3. Runs GridSearch on the train split only, 4. Evaluates on validation, 5. Retrains on train+val and evaluates once on the test holdout.

In [None]:
# Safe workflow example: copies, train/validation/test split, tuning on train only, final evaluation on test
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score


In [None]:

# 1) Work on copies to avoid changing originals
X_train_full = X_train.copy()
y_train_full = y_train.copy()
X_test_holdout = X_test.copy()
y_test_holdout = y_test.copy()

In [None]:

# 2) Create train / validation split from the training set (keep test aside)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

print(f"Shapes -> train: {X_tr.shape}, val: {X_val.shape}, test_holdout: {X_test_holdout.shape}")

Shapes -> train: (105062, 200), val: (26266, 200), test_holdout: (43776, 200)


In [None]:
# 3) Example: hyperparameter search on the train split only (with internal CV)
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95)),
    ("clf", LogisticRegression(max_iter=1000, random_state=42))
])

param_grid = {
    'clf__C': [0.01, 0.1, 1, 5, 10],
    'clf__class_weight': [None, 'balanced']
}

grid = GridSearchCV(pipe, param_grid, cv=3, scoring=make_scorer(f1_score), n_jobs=-1)

grid.fit(X_tr, y_tr)
print('\nGridSearch best params:', grid.best_params_)
y_val_pred = grid.predict(X_val)
print("\nValidation classification report:")
print(classification_report(y_val, y_val_pred))


GridSearch best params: {'clf__C': 1, 'clf__class_weight': 'balanced'}

Validation classification report:
              precision    recall  f1-score   support

           0       0.97      0.78      0.87     23700
           1       0.28      0.76      0.41      2566

    accuracy                           0.78     26266
   macro avg       0.62      0.77      0.64     26266
weighted avg       0.90      0.78      0.82     26266



In [None]:
# 4) Evaluate chosen hyperparameters on validation set (this is the model-selection step)
val_pred = grid.best_estimator_.predict(X_val)
val_f1 = f1_score(y_val, val_pred)
print(f"Validation F1: {val_f1:.4f}")

Validation F1: 0.4060


In [None]:
# 5) Retrain final model on train + validation (combine splits) using chosen hyperparameters
X_combined = pd.concat([X_tr, X_val])
y_combined = pd.concat([y_tr, y_val])

final_model = grid.best_estimator_
final_model.fit(X_combined, y_combined)

0,1,2
,steps,"[('scaler', ...), ('pca', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [None]:
# SMOTE pipeline: oversample inside the pipeline (safe for CV) and end-to-end evaluation
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Build pipeline that includes SMOTE (sampling happens only on training folds)
pipe_smote = ImbPipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

param_grid_smote = {
    'clf__C': [0.01, 0.1, 1],
    'clf__class_weight': [None, 'balanced']
}

# Grid search on the training split (X_tr, y_tr) produced earlier
grid_smote = GridSearchCV(pipe_smote, param_grid_smote, cv=3, scoring=make_scorer(f1_score), n_jobs=-1)
print('Running GridSearch with SMOTE on train split...')
grid_smote.fit(X_tr, y_tr)
print('\nBest CV F1 (SMOTE pipeline):', grid_smote.best_score_)
print('Best params:', grid_smote.best_params_)

# Evaluate best estimator on validation set
y_val_pred_smote = grid_smote.predict(X_val)
val_f1_smote = f1_score(y_val, y_val_pred_smote)
print('\nValidation classification report (SMOTE pipeline):')
print(classification_report(y_val, y_val_pred_smote))
print(f'Validation F1 (SMOTE): {val_f1_smote:.4f}')

# Retrain on train + validation and evaluate once on the held-out test set
X_comb = pd.concat([X_tr, X_val])
Y_comb = pd.concat([y_tr, y_val])
final_smote = grid_smote.best_estimator_
final_smote.fit(X_comb, Y_comb)

# # # Final evaluation on test holdout
# # y_test_pred_smote = final_smote.predict(X_test_holdout)
# # print('\nTest set classification report (final SMOTE model):')
# # print(classification_report(y_test_holdout, y_test_pred_smote))
# # print(f'Final Test F1 (SMOTE): {f1_score(y_test_holdout, y_test_pred_smote):.4f}')

# # Save the final model
# pkl.dump(final_smote, open('final_smote_model.pkl', 'wb'))
# print('\nSaved final model as final_smote_model.pkl')

In [None]:


# Save final model
# pkl.dump(final_model, open('final_model_trainval_test_holdout.pkl', 'wb'))
# print('\nSaved final model to final_model_trainval_test_holdout.pkl')

In [None]:
# Define pipelines with StandardScaler and PCA for each model
pipelines = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('classifier', LogisticRegression(C=1, class_weight='balanced', max_iter=1000, random_state=42))
    ]),
    
    'KNN': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('classifier', KNeighborsClassifier(n_neighbors=5, weights='uniform'))
    ]),
    
    'Naive Bayes': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('classifier', GaussianNB())
    ]),
    
    'LinearSVC': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('classifier', LinearSVC(C=1, class_weight='balanced', random_state=42, max_iter=2000))
    ]),
    
    'SGD Classifier': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('classifier', SGDClassifier(alpha=0.001, class_weight='balanced', loss='log_loss', random_state=42, max_iter=1000))
    ]),
    
    'Random Forest': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1))
    ])
}

print("Pipelines created:")
for name in pipelines.keys():
    print(f"  - {name}")

Pipelines created:
  - Logistic Regression
  - KNN
  - Naive Bayes
  - LinearSVC
  - SGD Classifier


## Cross-Validation with Multiple Metrics
Run cross-validation with F1, Precision, Recall, and Accuracy scores.

In [None]:
# Setup cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics
scoring = {
    'f1': make_scorer(f1_score),
    'precision': 'precision',
    'recall': 'recall',
    'accuracy': 'accuracy'
}

# Store results
cv_results = {}

print("Running cross-validation with pipelines...\n")
for model_name, pipeline in pipelines.items():
    print(f"Testing {model_name}...", end=' ')
    scores = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring, n_jobs=-1)
    cv_results[model_name] = scores
    print("Done!")

print("\nCross-validation complete!")

## Results Summary
Display comprehensive cross-validation results.

In [None]:
# Create summary dataframe
summary_data = []

for model_name, scores in cv_results.items():
    summary_data.append({
        'Model': model_name,
        'F1 (mean)': scores['test_f1'].mean(),
        'F1 (std)': scores['test_f1'].std(),
        'Precision (mean)': scores['test_precision'].mean(),
        'Recall (mean)': scores['test_recall'].mean(),
        'Accuracy (mean)': scores['test_accuracy'].mean()
    })

summary_df = pd.DataFrame(summary_data).sort_values('F1 (mean)', ascending=False)

print("\n" + "="*100)
print("CROSS-VALIDATION RESULTS (5-Fold Stratified)")
print("="*100)
print(summary_df.to_string(index=False))
print("="*100)

In [None]:
# Visualize F1 scores
plt.figure(figsize=(12, 6))

# Bar plot of F1 scores
f1_means = summary_df.set_index('Model')['F1 (mean)'].sort_values(ascending=True)
f1_stds = summary_df.set_index('Model').loc[f1_means.index, 'F1 (std)']

plt.barh(range(len(f1_means)), f1_means.values, xerr=f1_stds.values, capsize=5, alpha=0.7, color='steelblue')
plt.yticks(range(len(f1_means)), f1_means.index)
plt.xlabel('F1 Score')
plt.title('Model Performance - F1 Score with Standard Deviation (5-Fold CV)')
plt.xlim([0, 1])
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Box plot of F1 scores across folds
f1_data = [cv_results[model]['test_f1'] for model in summary_df['Model']]

plt.figure(figsize=(12, 6))
plt.boxplot(f1_data, labels=summary_df['Model'], vert=False)
plt.xlabel('F1 Score')
plt.title('F1 Score Distribution Across 5 Folds')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## Test Set Evaluation
Evaluate the best model(s) on the held-out test set.

In [None]:
# Get the best model
best_model_name = summary_df.iloc[0]['Model']
best_f1_cv = summary_df.iloc[0]['F1 (mean)']

print(f"\nBest Model (by CV F1): {best_model_name}")
print(f"CV F1 Score: {best_f1_cv:.4f}")

# Train the best pipeline on full training data
best_pipeline = pipelines[best_model_name]
best_pipeline.fit(X_train, y_train)

# Predict on test set
y_pred_best = best_pipeline.predict(X_test)

# Calculate test metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

test_f1 = f1_score(y_test, y_pred_best)
test_precision = precision_score(y_test, y_pred_best)
test_recall = recall_score(y_test, y_pred_best)
test_accuracy = accuracy_score(y_test, y_pred_best)

print(f"\n" + "="*50)
print(f"Test Set Performance - {best_model_name}")
print("="*50)
print(f"F1 Score:     {test_f1:.4f}")
print(f"Precision:    {test_precision:.4f}")
print(f"Recall:       {test_recall:.4f}")
print(f"Accuracy:     {test_accuracy:.4f}")
print("="*50)

In [None]:
# Detailed classification report
print(f"\nDetailed Classification Report - {best_model_name}:")
print(classification_report(y_test, y_pred_best))

In [None]:
# Confusion matrix
cm = confusion_matrix(y_test, y_pred_best)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

print(f"\nConfusion Matrix:")
print(f"True Negatives:  {cm[0, 0]}")
print(f"False Positives: {cm[0, 1]}")
print(f"False Negatives: {cm[1, 0]}")
print(f"True Positives:  {cm[1, 1]}")

## Save Best Model
Save the best model for future use.

In [None]:
# Save the best pipeline
model_filename = f'{best_model_name.replace(" ", "_")}_pipeline_cv_best_f1_{test_f1:.4f}.pkl'
pkl.dump(best_pipeline, open(model_filename, 'wb'))

print(f"\nBest pipeline saved as: {model_filename}")
print(f"\nFinal Summary:")
print(f"  Model: {best_model_name}")
print(f"  CV F1 Score: {best_f1_cv:.4f}")
print(f"  Test F1 Score: {test_f1:.4f}")
print(f"\nPipeline Steps:")
for step_name, step in best_pipeline.steps:
    print(f"  - {step_name}: {step.__class__.__name__}")

In [26]:
# --- Add SVM and LightGBM (imbalance-aware) and update cv_results ---
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

# compute scale_pos_weight for LightGBM using training set
pos_count = int(y_train.sum())
neg_count = len(y_train) - pos_count
scale_pos_weight = neg_count / max(pos_count, 1)
print(f"scale_pos_weight for LightGBM: {scale_pos_weight:.2f}")

imbalance_pipelines = {
    'LinearSVC (class_weight)': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('clf', LinearSVC(class_weight='balanced', max_iter=5000, random_state=42))
    ]),

    'LinearSVC (SMOTE)': ImbPipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('smote', SMOTE(random_state=42)),
        ('clf', LinearSVC(max_iter=5000, random_state=42))
    ]),

    'LightGBM (scale_pos_weight)': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('clf', LGBMClassifier(n_estimators=200, random_state=42, scale_pos_weight=scale_pos_weight, n_jobs=-1))
    ]),

    'LightGBM (SMOTE)': ImbPipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('smote', SMOTE(random_state=42)),
        ('clf', LGBMClassifier(n_estimators=200, random_state=42, n_jobs=-1))
    ])
}

print('Running CV for imbalance-aware pipelines...')
for name, pipe in imbalance_pipelines.items():
    print(f"Testing {name}...", end=' ')
    scores = cross_validate(pipe, X_train, y_train, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
    # cross_validate returns dict with key 'test_score' when a single scorer is used
    cv_results[name] = scores
    # determine mean f1 from whichever key is present
    if 'test_score' in scores:
        mean_f1 = np.mean(scores['test_score'])
    elif 'test_f1' in scores:
        mean_f1 = np.mean(scores['test_f1'])
    else:
        mean_f1 = np.nan
    print(f"Done — mean F1: {mean_f1:.4f}")

# Recompute and print a summary table of all models (updated)
summary_data = []
for model_name, scores in cv_results.items():
    # handle both possible key names without using boolean ops on arrays
    f1_vals = scores.get('test_score') if scores.get('test_score') is not None else scores.get('test_f1')
    precision_vals = scores.get('test_precision') if scores.get('test_precision') is not None else None
    recall_vals = scores.get('test_recall') if scores.get('test_recall') is not None else None
    acc_vals = scores.get('test_accuracy') if scores.get('test_accuracy') is not None else None

    summary_data.append({
        'Model': model_name,
        'F1 (mean)': float(np.mean(f1_vals)) if f1_vals is not None else np.nan,
        'F1 (std)': float(np.std(f1_vals)) if f1_vals is not None else np.nan,
        'Precision (mean)': float(np.mean(precision_vals)) if precision_vals is not None else np.nan,
        'Recall (mean)': float(np.mean(recall_vals)) if recall_vals is not None else np.nan,
        'Accuracy (mean)': float(np.mean(acc_vals)) if acc_vals is not None else np.nan
    })

updated_summary_df = pd.DataFrame(summary_data).sort_values('F1 (mean)', ascending=False)
print('\n' + '='*80)
print('UPDATED CROSS-VALIDATION RESULTS (including imbalance-aware models)')
print('='*80)
print(updated_summary_df.to_string(index=False))
print('='*80)

# Note: the main Results Summary cell later in the notebook can be re-run to keep notebook state consistent.

scale_pos_weight for LightGBM: 9.24
Running CV for imbalance-aware pipelines...
Testing LinearSVC (class_weight)... Done — mean F1: 0.4090
Testing LinearSVC (SMOTE)... Done — mean F1: 0.4090
Testing LinearSVC (SMOTE)... Done — mean F1: 0.4105
Testing LightGBM (scale_pos_weight)... Done — mean F1: 0.4105
Testing LightGBM (scale_pos_weight)... Done — mean F1: 0.4541
Testing LightGBM (SMOTE)... Done — mean F1: 0.4541
Testing LightGBM (SMOTE)... Done — mean F1: 0.4081

UPDATED CROSS-VALIDATION RESULTS (including imbalance-aware models)
                      Model  F1 (mean)  F1 (std)  Precision (mean)  Recall (mean)  Accuracy (mean)
LightGBM (scale_pos_weight)   0.454142  0.005148               NaN            NaN              NaN
                Naive Bayes   0.436095  0.010706          0.725785       0.311795         0.921258
          LinearSVC (SMOTE)   0.410452  0.002942               NaN            NaN              NaN
                  LinearSVC   0.408996  0.002568          0.277989