<a href="https://colab.research.google.com/github/Venkatpotla33/Machine-Learning-Lab/blob/main/ML_Lab_06_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem
from google.colab import drive

# Interactive step: the user is asked to authorize access to their Drive
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [None]:
# Paths, Imports & Dataset

import os, joblib, numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import loguniform, randint

# Folder on the mounted Drive that receives the serialized best models
BASE_DIR = '/content/drive/MyDrive/ensemble_tuning'
os.makedirs(BASE_DIR, exist_ok=True)  # no error if the folder already exists
print('Saving artifacts to:', BASE_DIR)

# Synthetic, imbalanced 3-class dataset (class weights 0.55 / 0.30 / 0.15)
X, y = make_classification(
    n_samples=4000,
    n_features=40,
    n_informative=15,
    n_redundant=10,
    n_classes=3,
    weights=[0.55, 0.30, 0.15],
    class_sep=1.2,
    flip_y=0.02,  # fraction of samples whose label is assigned at random
    random_state=42,
)

# Stratified hold-out split: 25% of the samples are kept for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=7,
)

# Shared 5-fold stratified splitter passed as `cv` to every search below
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

def report_best(name, search, X_eval=None, y_eval=None):
    """Print a tuning summary for a fitted search and return its best model.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    search : fitted search object
        Must expose ``best_params_``, ``best_score_`` and ``best_estimator_``
        (e.g. a fitted ``GridSearchCV`` or ``RandomizedSearchCV``). The CV
        score is labelled "macro-F1", so the search is expected to have been
        run with ``scoring='f1_macro'``.
    X_eval, y_eval : array-like, optional
        Hold-out features and labels used to score the best estimator. When
        both are omitted, the notebook-level ``X_test`` / ``y_test`` are used,
        which is the original behaviour.

    Returns
    -------
    estimator
        ``search.best_estimator_``.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if X_eval is None:
        # Backward-compatible default: the hold-out split defined in this notebook.
        X_eval, y_eval = X_test, y_test

    print(f"\n{name} — Best Params:\n", search.best_params_)
    print(f"CV Best Score (macro-F1): {search.best_score_:.4f}")
    best = search.best_estimator_
    y_pred = best.predict(X_eval)
    print(f"Test Accuracy: {accuracy_score(y_eval, y_pred):.4f}")
    print(f"Test Macro-F1: {f1_score(y_eval, y_pred, average='macro'):.4f}")
    return best


Saving artifacts to: /content/drive/MyDrive/ensemble_tuning


In [None]:
# AdaBoost + DecisionTree (GridSearchCV)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib, os

# AdaBoost over decision trees; the tree depth is tuned via the `estimator__` prefix
base_dt = DecisionTreeClassifier(random_state=0)
ada_dt = AdaBoostClassifier(estimator=base_dt, random_state=0)

# 4 depths x 4 ensemble sizes x 5 learning rates = 80 candidates
param_grid = dict(
    estimator__max_depth=[1, 2, 3, 4],
    n_estimators=[50, 100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.5, 1.0],
)

# Exhaustive search scored by macro-F1 on the shared stratified folds
gs_ada_dt = GridSearchCV(
    ada_dt,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs_ada_dt.fit(X_train, y_train)
best_ada_dt = report_best('AdaBoost + DecisionTree (Grid)', gs_ada_dt)

# Persist the best estimator to Drive
joblib.dump(best_ada_dt, os.path.join(BASE_DIR, 'ada_dt_best_grid.joblib'))
print('Saved:', os.path.join(BASE_DIR, 'ada_dt_best_grid.joblib'))


Fitting 5 folds for each of 80 candidates, totalling 400 fits

AdaBoost + DecisionTree (Grid) — Best Params:
 {'estimator__max_depth': 4, 'learning_rate': 0.5, 'n_estimators': 300}
CV Best Score (macro-F1): 0.8529
Test Accuracy: 0.8910
Test Macro-F1: 0.8563
Saved: /content/drive/MyDrive/ensemble_tuning/ada_dt_best_grid.joblib


In [None]:
# Bagging + DecisionTree (GridSearchCV)
from sklearn.ensemble import BaggingClassifier

# Bagging over decision trees; the tree depth is tuned via the `estimator__` prefix
base_dt2 = DecisionTreeClassifier(random_state=0)
bag_dt = BaggingClassifier(estimator=base_dt2, random_state=0, n_jobs=-1)

# 4 depths x 4 ensemble sizes x 3 sample fractions x 3 feature fractions = 144 candidates
param_grid = dict(
    estimator__max_depth=[None, 5, 10, 15],
    n_estimators=[50, 100, 200, 300],
    max_samples=[0.5, 0.7, 1.0],
    max_features=[0.5, 0.7, 1.0],
)

# Exhaustive search scored by macro-F1 on the shared stratified folds
gs_bag_dt = GridSearchCV(
    bag_dt,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs_bag_dt.fit(X_train, y_train)
best_bag_dt = report_best('Bagging + DecisionTree (Grid)', gs_bag_dt)

# Persist the best estimator to Drive
joblib.dump(best_bag_dt, os.path.join(BASE_DIR, 'bag_dt_best_grid.joblib'))
print('Saved:', os.path.join(BASE_DIR, 'bag_dt_best_grid.joblib'))


Fitting 5 folds for each of 144 candidates, totalling 720 fits

Bagging + DecisionTree (Grid) — Best Params:
 {'estimator__max_depth': None, 'max_features': 0.7, 'max_samples': 1.0, 'n_estimators': 300}
CV Best Score (macro-F1): 0.8088
Test Accuracy: 0.8670
Test Macro-F1: 0.8170
Saved: /content/drive/MyDrive/ensemble_tuning/bag_dt_best_grid.joblib


In [None]:
# Bagging + SVM (RandomizedSearchCV)
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from scipy.stats import loguniform, randint
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold


# Each bagged member is a scaler + SVC pipeline
svm_base2 = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(random_state=0)),
])
bag_svm = BaggingClassifier(estimator=svm_base2, random_state=0, n_jobs=-1)

# Search space: lists are sampled uniformly, scipy distributions are drawn from
param_distributions = dict(
    n_estimators=randint(25, 150),            # integers in [25, 150)
    max_samples=[0.5, 0.7, 1.0],
    max_features=[0.5, 0.7, 1.0],
    estimator__svc__C=loguniform(1e-2, 1e2),  # log-uniform over 0.01 .. 100
    estimator__svc__kernel=['rbf', 'linear'],
    estimator__svc__gamma=['scale', 'auto'],  # has no effect when the kernel is 'linear'
)

# 40 sampled candidates scored by macro-F1 on the shared stratified folds
rs_bag_svm = RandomizedSearchCV(
    bag_svm,
    param_distributions=param_distributions,
    n_iter=40,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=0,
)
rs_bag_svm.fit(X_train, y_train)
best_bag_svm = report_best('Bagging + SVM (Randomized)', rs_bag_svm)

# Persist the best estimator to Drive
joblib.dump(best_bag_svm, os.path.join(BASE_DIR, 'bag_svm_best_rand.joblib'))
print('Saved:', os.path.join(BASE_DIR, 'bag_svm_best_rand.joblib'))

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [None]:
# Load & Compare Saved Models
import joblib, os
from sklearn.metrics import accuracy_score, f1_score

# File names of the serialized models expected under BASE_DIR.
# NOTE: no cell shown in this notebook writes 'ada_svm_best_rand.joblib';
# it is kept in the list so it is picked up if it was produced elsewhere,
# and skipped (instead of crashing the loop) when it is absent.
paths = [
    'ada_dt_best_grid.joblib',
    'ada_svm_best_rand.joblib',
    'bag_dt_best_grid.joblib',
    'bag_svm_best_rand.joblib'
]

for fname in paths:
    fpath = os.path.join(BASE_DIR, fname)
    print(f"\n{fname}")
    if not os.path.isfile(fpath):
        # A missing artifact must not abort the comparison of the others.
        print("Skipped: no saved model at", fpath)
        continue
    # joblib.load is pickle-based and can execute arbitrary code:
    # only load artifacts that this notebook (or a trusted source) produced.
    clf = joblib.load(fpath)
    y_pred = clf.predict(X_test)
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print("Test Macro-F1:", f1_score(y_test, y_pred, average='macro'))
