In [26]:
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from scipy.stats import randint
import time

In [27]:
data = load_breast_cancer()
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [28]:
df = pd.DataFrame(data.data, columns = data.feature_names)

In [29]:
df["Target"] = data.target

In [30]:
X = df.drop("Target", axis=1)
y = df["Target"]

# train_test_split Parameters:
**arrays**
    Purpose: Data to be split (e.g., features X and target y).

**test_size**
    Purpose: Proportion or absolute number of data to use for the test set (e.g., 0.2 for 20%).
    Usage: Can be float (proportion) or int (absolute number).

**train_size**
    Purpose: Proportion or number of data for the training set.
    Usage: If None, automatically set as 1 - test_size.

**random_state**
    Purpose: Controls the randomness of the split to ensure reproducibility.
    Usage: Set a fixed number (e.g., 42) for consistent splits.

**shuffle**
    Purpose: Whether to shuffle the data before splitting.
    Usage: Set to False for time-series or ordered data.

**stratify**
    Purpose: Ensures class distribution is the same in both train and test sets.
    Usage: Use for classification tasks to preserve label proportions (stratify=y).

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [32]:
rf = RandomForestClassifier(random_state=42)

In [37]:
params_grid = {
    'n_estimators' : [50,100,200],
    'max_depth' : [5,10,15],
    'min_samples_split' : [2,5,10]
}

In [62]:
start_grid = time.time()
grid_search = GridSearchCV(rf, params_grid, cv = 5, n_jobs = -1, verbose=1)
grid_search.fit(X_train, y_train)
end_grid = time.time()

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [63]:
print("\n🧪 GridSearchCV Results:")
print("Best Params:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, grid_search.predict(X_test)))
print(f"Time taken: {end_grid - start_grid:.2f} seconds")


🧪 GridSearchCV Results:
Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy: 0.9649122807017544
Time taken: 2.55 seconds


In [64]:
param_dist = {
    'n_estimators': randint(0,200),
    'max_depth': (0,15),
    'min_samples_split': (0,10)
}

In [65]:
start_random = time.time()
# random_search = RandomizedSearchCV(rf, param_dist)
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=10, cv=5, n_jobs=-1, random_state=42, verbose=1)
random_search.fit(X_train, y_train)
end_random = time.time()

Fitting 5 folds for each of 10 candidates, totalling 50 fits


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/home/shuaib/anaconda3/envs/myenv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/shuaib/anaconda3/envs/myenv/lib/python3.10/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/home/shuaib/anaconda3/envs/myenv/lib/python3.10/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/home/shuaib/anaconda3/envs/myenv/lib/python3.10/site-packages/sklearn/utils/_param_valid

In [66]:
print("\n🎲 RandomizedSearchCV Results:")
print("Best Params:", random_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, random_search.predict(X_test)))
print(f"Time taken: {end_random - start_random:.2f} seconds")


🎲 RandomizedSearchCV Results:
Best Params: {'max_depth': 15, 'min_samples_split': 10, 'n_estimators': 191}
Test Accuracy: 0.9649122807017544
Time taken: 0.70 seconds


## Conclusion (based on the code):
**Use GridSearchCV when:**

You want high accuracy and reliable tuning.
Your hyperparameter space is not too large (like in our code).
You want a deterministic and reproducible search.
You have enough time/resources for an exhaustive search.

✅ In our example, GridSearchCV gave us more control, ensured no combination was missed, and performed just as good or better than randomized search — at the cost of more computation.

Let me know if you want a summary table or visualization of the parameter vs. accuracy results from this run!