In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

In [2]:
# Fetch the breast cancer dataset that ships with scikit-learn
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [3]:
# Base estimator; the fixed seed makes every forest fit reproducible
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 3 * 2 * 4 * 2 = 48 candidate configurations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],  # 'sqrt' is used in place of the former 'auto' setting
    'max_depth': [4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
}

# Exhaustive 5-fold cross-validated search scored by accuracy.
# 48 candidates x 5 folds = 240 independent fits, so n_jobs=-1 spreads them
# over all available cores; the seeded estimator keeps the outcome unchanged.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Run the search on the training split only, so the test set stays unseen
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best parameters found: ", grid_search.best_params_)

# best_estimator_ is available because GridSearchCV refits the best candidate
# on the whole training split by default (refit=True); use it to predict the test set
y_pred = grid_search.best_estimator_.predict(X_test)

Best parameters found:  {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}


In [4]:
# Score the held-out predictions. recall_score / precision_score use label 1 as
# the positive class unless pos_label says otherwise.
# NOTE(review): presumably label 1 is 'benign' in this dataset, which would make
# `recall` the benign-class recall — confirm against cancer.target_names.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
# Specificity is the recall of the negative class, i.e. recall with label 0 as positive
specificity = recall_score(y_test, y_pred, pos_label=0)

# Report every metric under its label, one per line
for label, value in (
    ("Accuracy: ", accuracy),
    ("Recall: ", recall),
    ("Precision: ", precision),
    ("Specificity: ", specificity),
):
    print(label, value)

Accuracy:  0.9707602339181286
Recall:  0.9907407407407407
Precision:  0.963963963963964
Specificity:  0.9365079365079365
