# 🔧 Reusable Hyperparameter‑Tuning Workbook
Generated 2025‑04‑19 20:44

Fill in the **CONFIG** cell, then run the cells from top to bottom. The workbook works with any tabular dataset stored as a CSV file; if no path is given, it generates a synthetic demo dataset instead.

In [None]:
# ===================== CONFIG =====================
# Edit the values below; every later cell reads them.

# CSV file to load, e.g. 'data/credit.csv'. Leave as None to generate a
# synthetic demo dataset instead.
DATA_PATH = None

# Name of the label column.
TARGET_COL = 'target'

# Either 'classification' or 'regression'.
PROBLEM_TYPE = 'classification'

# 'RandomForest' or 'GradientBoosting' (both problem types),
# 'Logistic' (classification only) or 'Ridge' (regression only).
BASE_MODEL = 'RandomForest'

# Scoring name handed to the cross-validated search; it has to suit
# PROBLEM_TYPE (for example 'r2' rather than 'f1' when doing regression).
METRIC = 'f1'

# Number of cross-validation folds.
N_SPLITS = 5

# 'grid' for exhaustive grid search, 'random' for randomized search.
SEARCH_STRATEGY = 'random'

# Number of sampled candidates; only used by the randomized search.
N_ITER = 50
# ==================================================


In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, warnings, time, json
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (classification_report, roc_auc_score, f1_score, accuracy_score, r2_score, mean_squared_error)
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              GradientBoostingClassifier, GradientBoostingRegressor)
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.stats import randint, uniform, loguniform
# Suppress every warning for the rest of the session to keep cell output
# short. NOTE(review): this also hides convergence and scoring warnings
# raised during the search -- comment it out when debugging.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator so code that draws from it is
# repeatable between runs.
np.random.seed(42)


In [None]:
# Load the dataset named in CONFIG, or build a synthetic demo set when no
# path is configured. The result is always a DataFrame `df` that contains
# the TARGET_COL column.
if DATA_PATH:
    data_file = Path(DATA_PATH)
    if not data_file.exists():
        # A configured-but-missing path is most likely a typo. Falling back
        # to synthetic data here would tune a model on the wrong dataset
        # without the user noticing, so fail loudly instead.
        raise FileNotFoundError(f'DATA_PATH {DATA_PATH!r} does not exist')
    df = pd.read_csv(data_file)
    if TARGET_COL not in df.columns:
        # Fail here with a clear message rather than in a later cell.
        raise KeyError(
            f'TARGET_COL {TARGET_COL!r} not found in {DATA_PATH}; '
            f'available columns: {list(df.columns)}')
    print(f'Loaded {df.shape} from {DATA_PATH}')
else:
    print('No DATA_PATH provided; generating synthetic dataset')
    if PROBLEM_TYPE == 'classification':
        from sklearn.datasets import make_classification
        X_syn, y_syn = make_classification(
            n_samples=1000, n_features=20, n_informative=10, random_state=42)
    else:
        from sklearn.datasets import make_regression
        X_syn, y_syn = make_regression(
            n_samples=1000, n_features=20, noise=0.4, random_state=42)
    # Name the synthetic features feat_0 .. feat_19 and append the label.
    df = pd.DataFrame(X_syn, columns=[f'feat_{i}' for i in range(X_syn.shape[1])])
    df[TARGET_COL] = y_syn
# Last expression of the cell: the notebook renders the first rows.
df.head()


In [None]:
# Quick sanity check of the data: overall shape, then class proportions
# (classification) or summary statistics (regression) of the target.
print('Shape:', df.shape)
print('Target distribution / stats:')
target_series = df[TARGET_COL]
if PROBLEM_TYPE == 'classification':
    target_summary = target_series.value_counts(normalize=True)
else:
    target_summary = target_series.describe()
print(target_summary)


In [None]:
# Separate features from the label, then hold out 20% of the rows as a
# test set. Classification splits are stratified on the label so both
# parts keep the same class proportions.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]
stratify_on = y if PROBLEM_TYPE == 'classification' else None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=stratify_on,
    random_state=42,
)
print('Train:', X_train.shape, 'Test:', X_test.shape)


In [None]:
def get_model_and_space(model_name, problem_type=None):
    """Return an unfitted estimator together with its two search spaces.

    Args:
        model_name: 'RandomForest' or 'GradientBoosting' (either problem
            type), 'Logistic' (classification only) or 'Ridge'
            (regression only).
        problem_type: 'classification' or 'regression'. When omitted, the
            notebook-level PROBLEM_TYPE setting is used.

    Returns:
        A ``(model, grid, dist)`` tuple: the estimator, a dict of value
        lists for grid search, and a dict of distributions (or value
        lists) for randomized search.

    Raises:
        ValueError: if ``problem_type`` is not one of the two supported
            values, or ``model_name`` is not available for it.
    """
    if problem_type is None:
        problem_type = PROBLEM_TYPE

    supported = {
        'classification': ('RandomForest', 'GradientBoosting', 'Logistic'),
        'regression': ('RandomForest', 'GradientBoosting', 'Ridge'),
    }
    # Validate before building anything: a misspelled problem type used to
    # fall through to the regression branch without any notice.
    if problem_type not in supported:
        raise ValueError(
            f'Unsupported problem type {problem_type!r}; '
            f'expected one of {sorted(supported)}')
    if model_name not in supported[problem_type]:
        raise ValueError(
            f'Unsupported model {model_name!r} for {problem_type}; '
            f'expected one of {list(supported[problem_type])}')

    is_classification = problem_type == 'classification'

    # scipy parameterisation reminder: randint(low, high) excludes `high`;
    # uniform(loc, scale) covers [loc, loc + scale].
    if model_name == 'GradientBoosting':
        # Classifier and regressor share the same search spaces.
        if is_classification:
            model = GradientBoostingClassifier(random_state=42)
        else:
            model = GradientBoostingRegressor(random_state=42)
        grid = {'n_estimators': [100, 300, 500],
                'learning_rate': [0.01, 0.05, 0.1],
                'max_depth': [2, 3, 4],
                'subsample': [0.6, 0.8, 1.0]}
        dist = {'n_estimators': randint(100, 600),
                'learning_rate': loguniform(1e-3, 0.2),
                'max_depth': randint(2, 6),
                'subsample': uniform(0.5, 0.5)}
    elif model_name == 'RandomForest':
        if is_classification:
            model = RandomForestClassifier(random_state=42)
            grid = {'n_estimators': [100, 300, 600],
                    'max_depth': [None, 5, 10, 20],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['sqrt', 'log2', 0.5]}
            dist = {'n_estimators': randint(200, 1000),
                    'max_depth': randint(3, 25),
                    'min_samples_split': randint(2, 15),
                    'min_samples_leaf': randint(1, 10),
                    'max_features': uniform(0.2, 0.8)}
        else:
            model = RandomForestRegressor(random_state=42)
            grid = {'n_estimators': [200, 400, 800],
                    'max_depth': [None, 5, 10, 20],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['sqrt', 'log2', 0.6]}
            dist = {'n_estimators': randint(200, 1200),
                    'max_depth': randint(3, 30),
                    'min_samples_split': randint(2, 15),
                    'min_samples_leaf': randint(1, 10),
                    'max_features': uniform(0.3, 0.7)}
    elif model_name == 'Logistic':
        # liblinear is used because the search toggles between the l1 and
        # l2 penalties.
        model = LogisticRegression(max_iter=1000, solver='liblinear')
        grid = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']}
        dist = {'C': loguniform(1e-3, 10), 'penalty': ['l1', 'l2']}
    else:
        # Only 'Ridge' can reach this branch after the validation above.
        model = Ridge()
        grid = {'alpha': [0.1, 1, 10, 50]}
        dist = {'alpha': loguniform(1e-3, 100)}
    return model, grid, dist

# Build the estimator selected by BASE_MODEL along with its grid-search and
# random-search spaces, then print the estimator for reference.
model, grid_params, dist_params = get_model_and_space(BASE_MODEL)
print(model)


In [None]:
# Fit the untuned model once and score it on the held-out test set, to
# give a reference point for the tuned model.
from sklearn.metrics import get_scorer

print('\n🔹 Baseline training...')
model.fit(X_train, y_train)
pred = model.predict(X_test)
if PROBLEM_TYPE == 'classification':
    # Score with the scorer named by METRIC, i.e. the same one the search
    # uses. The previous version reported accuracy under the METRIC label
    # whenever METRIC was anything other than 'f1'.
    baseline = get_scorer(METRIC)(model, X_test, y_test)
    print(f'Baseline {METRIC}:', baseline)
else:
    baseline = r2_score(y_test, pred)
    print('Baseline R2:', baseline)


In [None]:
# Cross-validation splitter: shuffled folds with a fixed seed, stratified
# on the label for classification and plain K-fold for regression.
if PROBLEM_TYPE == 'classification':
    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
else:
    cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)


In [None]:
# Run the cross-validated hyperparameter search on the training split.
# Grid search walks every combination in grid_params; any other strategy
# value falls back to randomized search, drawing N_ITER candidates from
# dist_params.
shared_search_args = dict(scoring=METRIC, cv=cv, n_jobs=-1, verbose=1)
if SEARCH_STRATEGY == 'grid':
    searcher = GridSearchCV(model, grid_params, **shared_search_args)
else:
    searcher = RandomizedSearchCV(
        model,
        dist_params,
        n_iter=N_ITER,
        random_state=42,
        **shared_search_args,
    )
searcher.fit(X_train, y_train)

print('Best params:', searcher.best_params_)
print('Best CV score:', searcher.best_score_)
# Keep the best estimator found by the search for the following cells.
best_model = searcher.best_estimator_


In [None]:
# Evaluate the tuned model on the held-out test set.
y_pred = best_model.predict(X_test)
if PROBLEM_TYPE == 'classification':
    if hasattr(best_model, 'predict_proba'):
        class_proba = best_model.predict_proba(X_test)
        if class_proba.shape[1] == 2:
            # Binary target: score the probability of the second class.
            y_prob = class_proba[:, 1]
            test_auc = roc_auc_score(y_test, y_prob)
        else:
            # Multiclass target: roc_auc_score needs the full probability
            # matrix and an explicit multi_class strategy. Slicing column 1
            # (as before) raises for more than two classes.
            y_prob = class_proba
            test_auc = roc_auc_score(
                y_test, y_prob, multi_class='ovr', labels=best_model.classes_)
        print('Test ROC‑AUC:', test_auc)
    print(classification_report(y_test, y_pred))
else:
    print('Test R2:', r2_score(y_test, y_pred))
    print('Test RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))


In [None]:
# Persist the tuned model under models/ with a timestamped file name.
import datetime
import json
import os

import joblib

os.makedirs('models', exist_ok=True)
# The timestamp has minute resolution, e.g. 20250419_2044.
saved_at = datetime.datetime.now().strftime('%Y%m%d_%H%M')
fname = f"models/{BASE_MODEL.lower()}_{saved_at}.pkl"
joblib.dump(best_model, fname)
print('Saved best model ➜', fname)
