In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

# Question 3: Hyperparameter Tuning & Model Optimization
## Scenario
-After deploying your classification model, stakeholders ask you to improve performance without changing the dataset.

-You decide to optimize the model using systematic hyperparameter search techniques.

# Dataset
-Reuse the dataset from Question 2.

# Tasks
- 1 Explain the difference between parameters and hyperparameters.
- 2 Train a baseline Logistic Regression model.
- 3 Apply GridSearchCV to tune hyperparameters (C, penalty, solver).
- 4 Apply RandomizedSearchCV and compare results.
- 5 Compare performance before and after tuning.
- 6 Discuss trade-offs between computational cost and model performance.

## 1 Explain the difference between parameters and hyperparameters 
### A Parameters
 - Learned from data during training.
 - Model “internal weights”.
 - Example in Logistic Regression: coefficients (β) and intercept (β0).
### B Hyperparameters
 - Set before training (you choose them).
 - Control how the model learns / regularizes.
 - Examples in Logistic Regression:
   - C (regularization strength)
   -  penalty (l1, l2, elasticnet, none)
   - solver (liblinear, lbfgs, saga, etc.)
   - max_iter


## Encoding and Feature Scaling 

In [19]:
data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [20]:
X = data.drop('Attrition',axis=1)
y = data['Attrition']

In [21]:
y = y.map({'Yes': 1, 'No': 0})


In [22]:
cat_cols = X.select_dtypes(include='object').columns
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)


In [23]:
num_cols = X_encoded.select_dtypes(include=['int64', 'float64']).columns
print(num_cols)

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)


In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

## 2 Train a baseline Logistic Regression model.

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Baseline model
baseline = LogisticRegression(max_iter=500, random_state=42)

baseline.fit(X_train, y_train)

y_pred_base = baseline.predict(X_test)
y_proba_base = baseline.predict_proba(X_test)[:, 1]

print("BASELINE Confusion Matrix:\n", confusion_matrix(y_test, y_pred_base))
print("\nBASELINE Classification Report:\n", classification_report(y_test, y_pred_base))
print("BASELINE ROC-AUC:", roc_auc_score(y_test, y_proba_base))


BASELINE Confusion Matrix:
 [[238   9]
 [ 31  16]]

BASELINE Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92       247
           1       0.64      0.34      0.44        47

    accuracy                           0.86       294
   macro avg       0.76      0.65      0.68       294
weighted avg       0.85      0.86      0.85       294

BASELINE ROC-AUC: 0.8094581789990525


## 3 Apply GridSearchCV to tune hyperparameters (C, penalty, solver).

In [27]:
from sklearn.model_selection import GridSearchCV

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = [
    {"solver": ["liblinear"], "penalty": ["l1", "l2"], "C": [0.01, 0.1, 1, 10, 100]},
    {"solver": ["lbfgs"], "penalty": ["l2"], "C": [0.01, 0.1, 1, 10, 100]},
    {"solver": ["saga"], "penalty": ["l1", "l2"], "C": [0.01, 0.1, 1, 10, 100]},
]

grid = GridSearchCV(
    LogisticRegression(max_iter=2000, random_state=42),
    param_grid=param_grid,
    scoring="f1",        
    cv=cv,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best GridSearch Params:", grid.best_params_)
print("Best GridSearch CV Score (F1):", grid.best_score_)

best_grid = grid.best_estimator_
y_pred_grid = best_grid.predict(X_test)
y_proba_grid = best_grid.predict_proba(X_test)[:, 1]

print("\nGRID Confusion Matrix:\n", confusion_matrix(y_test, y_pred_grid))
print("\nGRID Classification Report:\n", classification_report(y_test, y_pred_grid))
print("GRID ROC-AUC:", roc_auc_score(y_test, y_proba_grid))


Best GridSearch Params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best GridSearch CV Score (F1): 0.5729401154401155

GRID Confusion Matrix:
 [[239   8]
 [ 31  16]]

GRID Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.92       247
           1       0.67      0.34      0.45        47

    accuracy                           0.87       294
   macro avg       0.78      0.65      0.69       294
weighted avg       0.85      0.87      0.85       294

GRID ROC-AUC: 0.801016452752175


## 4 Apply RandomizedSearchCV and compare results 

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

param_dist = {
    "solver": ["liblinear", "lbfgs", "saga"],
    "penalty": ["l1", "l2"],  # randomized will try invalid combos sometimes; we'll handle by error_score
    "C": loguniform(1e-3, 1e3),
}

rand = RandomizedSearchCV(
    LogisticRegression(max_iter=2000, random_state=42),
    param_distributions=param_dist,
    n_iter=30,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    error_score="raise"  # if this raises due to invalid combo, set to np.nan
)

# If you get errors due to invalid (solver, penalty) combos:
# change error_score=np.nan and rerun
try:
    rand.fit(X_train, y_train)
except Exception as e:
    print("RandomizedSearch error (likely invalid combo). Fixing by using error_score=np.nan...")
    import numpy as np
    rand = RandomizedSearchCV(
        LogisticRegression(max_iter=2000, random_state=42),
        param_distributions=param_dist,
        n_iter=30,
        scoring="f1",
        cv=cv,
        n_jobs=-1,
        random_state=42,
        error_score=np.nan
    )
    rand.fit(X_train, y_train)

print("Best Randomized Params:", rand.best_params_)
print("Best Randomized CV Score (F1):", rand.best_score_)

best_rand = rand.best_estimator_
y_pred_rand = best_rand.predict(X_test)
y_proba_rand = best_rand.predict_proba(X_test)[:, 1]

print("\nRANDOM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rand))
print("\nRANDOM Classification Report:\n", classification_report(y_test, y_pred_rand))
print("RANDOM ROC-AUC:", roc_auc_score(y_test, y_proba_rand))


RandomizedSearch error (likely invalid combo). Fixing by using error_score=np.nan...


40 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nepal\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nepal\anaconda3\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\nepal\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1218, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\nepal\anaconda3\Lib\site-packages\skl

Best Randomized Params: {'C': np.float64(4.0428727350273315), 'penalty': 'l2', 'solver': 'saga'}
Best Randomized CV Score (F1): 0.5612055723290428

RANDOM Confusion Matrix:
 [[237  10]
 [ 31  16]]

RANDOM Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92       247
           1       0.62      0.34      0.44        47

    accuracy                           0.86       294
   macro avg       0.75      0.65      0.68       294
weighted avg       0.84      0.86      0.84       294

RANDOM ROC-AUC: 0.8091997588078215


## 5 Compare performance before and after tuning

In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def metrics(y_true, y_pred, y_proba):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0),
        "ROC-AUC": roc_auc_score(y_true, y_proba),
    }

base_m = metrics(y_test, y_pred_base, y_proba_base)
grid_m = metrics(y_test, y_pred_grid, y_proba_grid)
rand_m = metrics(y_test, y_pred_rand, y_proba_rand)

print("Baseline:", base_m)
print("GridSearch:", grid_m)
print("RandomSearch:", rand_m)


Baseline: {'Accuracy': 0.8639455782312925, 'Precision': 0.64, 'Recall': 0.3404255319148936, 'F1': 0.4444444444444444, 'ROC-AUC': 0.8094581789990525}
GridSearch: {'Accuracy': 0.8673469387755102, 'Precision': 0.6666666666666666, 'Recall': 0.3404255319148936, 'F1': 0.4507042253521127, 'ROC-AUC': 0.801016452752175}
RandomSearch: {'Accuracy': 0.8605442176870748, 'Precision': 0.6153846153846154, 'Recall': 0.3404255319148936, 'F1': 0.4383561643835616, 'ROC-AUC': 0.8091997588078215}


## 6 Discuss trade-offs between computational cost and model performance.

In [30]:
# There is a trade-off between computational cost and model performance because more exhaustive hyperparameter searches require significantly more computation while providing only marginal performance improvements. GridSearchCV explores all parameter combinations and may achieve slightly better performance but at high computational cost. RandomizedSearchCV evaluates fewer combinations, reducing cost while still obtaining near-optimal performance. Therefore, in practice, RandomizedSearchCV is often preferred to balance efficiency and performance.