In [29]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [1]:
# scikit-learn's bundled datasets module provides the loader used below
from sklearn import datasets

# Load the breast cancer dataset; the returned object exposes the
# .data, .target, .feature_names and .target_names attributes used later on.
cancer = datasets.load_breast_cancer()

In [12]:
# Summarise the loaded dataset: feature labels, class labels, and the size of
# the feature matrix.
# NOTE: ndarray.size is the total number of elements (rows * columns), not the
# number of features; cancer.data.shape would give the two dimensions.
dataset_summary = (
    ("Feature Names:", cancer.feature_names),
    ("Target Names:", cancer.target_names),
    ("Size of features:", cancer.data.size),
)
for label, value in dataset_summary:
    print(label, value)

Feature Names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Target Names: ['malignant' 'benign']
Size of features: 17070


In [6]:
# train_test_split performs the hold-out partitioning below
from sklearn.model_selection import train_test_split

# Feature matrix and label vector taken from the loaded dataset
X, y = cancer.data, cancer.target

# Hold out 20% of the rows for testing (80% for training). Stratifying on y
# keeps the class proportions the same in both partitions; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to the test split, so no
# information from the test rows is used when fitting the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

# Base gradient-boosting estimator with library defaults; only the seed is
# pinned so that repeated runs build the same model.
gb = GradientBoostingClassifier(
    random_state=42,
)

In [22]:
# Hyperparameter tuning: exhaustive grid search over the candidate values
# below, scored by accuracy with 5-fold cross-validation. n_jobs=-1 lets the
# search use all available processors.
parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
gridSearchCv = GridSearchCV(
    estimator=gb,
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
gridSearchCv.fit(X_train, y_train)

# Keep the estimator built with the winning parameter combination
best_gbm = gridSearchCv.best_estimator_

print("Best Hyperparameters:", gridSearchCv.best_params_)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [24]:
# Stratified, shuffled 5-fold cross-validation of the tuned model on the
# training split, scored by accuracy.
# NOTE(review): the hyperparameters were selected on this same training split,
# so this estimate is likely somewhat optimistic.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    best_gbm,
    X_train,
    y=y_train,
    cv=kfold,
    scoring='accuracy',
)

print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.4f}")

Cross-Validation Accuracy: 0.9692


In [25]:
# Fit the selected model on the full training split, then predict labels for
# the held-out test rows (fit() returns the estimator, so the calls chain).
y_pred = best_gbm.fit(X_train, y_train).predict(X_test)

In [32]:
# Model evaluation on the held-out test split.
#
# The labels are 0/1 and scikit-learn's binary metrics use pos_label=1 by
# default. This dataset's target_names are ['malignant' 'benign'], so the
# "positive" class below is *benign*:
#   - Precision / Recall / F1 / Sensitivity describe the benign class.
#   - Specificity is the share of malignant cases correctly identified.
acc = accuracy_score(y_test, y_pred)

# Pin the label order so the matrix is always 2x2, laid out as
# [[TN, FP], [FN, TP]], even if one class is missing from y_test / y_pred.
# Without `labels`, a single-class result collapses to 1x1 and the 4-way
# unpacking further down raises ValueError.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix:\n", conf_matrix)

# Precision
prec = precision_score(y_test, y_pred)

# Recall
recall = recall_score(y_test, y_pred)

# F1-score
f1 = f1_score(y_test, y_pred)

# Confusion matrix values (TN, FP, FN, TP), taken from the matrix computed
# above instead of recomputing it.
TN, FP, FN, TP = conf_matrix.ravel()

# Sensitivity (true-positive rate; same quantity as `recall` above)
sensi = TP / (TP + FN)

# Specificity (true-negative rate)
speci = TN / (TN + FP)

# Print all metrics
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Sensitivity: {sensi:.4f}")
print(f"Specificity : {speci:.4f}")

Confusion Matrix:
 [[38  4]
 [ 1 71]]
Accuracy: 0.9561
Precision: 0.9467
Recall (Sensitivity): 0.9861
F1-score: 0.9660
Sensitivity: 0.9861
Specificity : 0.9048
