In [106]:
cancer = load_breast_cancer()
X, y, labels, features = cancer.data, cancer.target, cancer.target_names, cancer.feature_names
print('labels:', labels)
print('features:', features)

labels: ['malignant' 'benign']
features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


CV scores, Mean CV,  Std CV accuracy, Classification Report, Train accuracy, Test accuracy

## DecisionTree

In [143]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Split into training and test samples
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Define hyperparameters for GridSearchCV
param_grid = {
    'max_depth': np.arange(1, 11),
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=20),
    param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)

grid_search.fit(X_train, y_train)

# The best parameters
best_params = grid_search.best_params_
print("\nBest parameters found:", best_params)

# Best model
clf = DecisionTreeClassifier(
    criterion=best_params['criterion'],
    random_state=20,
    max_depth=int(best_params['max_depth'])
).fit(X_train, y_train)

# Train / Test accuracy
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f"\nTrain accuracy = {train_accuracy:.3%}")
print(f"Test accuracy  = {test_accuracy:.3%}")

# Cross-validation score
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {np.mean(cv_scores):.3%}")
print(f"Std CV accuracy: {np.std(cv_scores):.3%}")

# Classification report
y_test_pred = clf.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", cm)

# Visualisation of decision tree
graph_viz = tree.export_graphviz(clf, out_file=None,
                                 feature_names=features,
                                 class_names=labels,
                                 filled=True,
                                 rounded=False,
                                 special_characters=True)

graph = graphviz.Source(graph_viz)
graph.view(cleanup=True)




Best parameters found: {'criterion': 'entropy', 'max_depth': np.int64(3)}

Train accuracy = 97.653%
Test accuracy  = 95.105%

Cross-validation scores: [0.95348837 0.90588235 0.92941176 0.91764706 0.90588235]
Mean CV accuracy: 92.246%
Std CV accuracy: 1.780%

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94        53
           1       0.99      0.93      0.96        90

    accuracy                           0.95       143
   macro avg       0.94      0.96      0.95       143
weighted avg       0.95      0.95      0.95       143

Confusion Matrix:
 [[52  1]
 [ 6 84]]


'Source.gv.pdf'

## RandomForest

In [108]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Split into training and test samples
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Model parameters
clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=3,
    min_samples_split=5,
    min_samples_leaf=3,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
    criterion='entropy'
)

# Model training
clf.fit(X_train, y_train)

# Model evaluation
train_acc = accuracy_score(y_train, clf.predict(X_train))
test_acc = accuracy_score(y_test, clf.predict(X_test))
print(f"Train accuracy = {train_acc:.3%}")
print(f"Test accuracy = {test_acc:.3%}")

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("CV scores:", cv_scores)
print(f"Mean CV accuracy: {cv_scores.mean():.3%}")
print(f"Std CV accuracy: {cv_scores.std():.3%}")

# Classification report
y_test_pred = clf.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", cm)


Train accuracy = 98.122%
Test accuracy = 97.203%
CV scores: [0.94186047 0.95294118 0.92941176 0.95294118 0.96470588]
Mean CV accuracy: 94.837%
Std CV accuracy: 1.192%

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        53
           1       0.98      0.98      0.98        90

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143

Confusion Matrix:
 [[51  2]
 [ 2 88]]


## Gradient Boosting Decision Trees (GBDT)

In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Split into training and test samples
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create and train a gradient boosting model with the specified parameters
clf = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=75,
    max_depth=2,
    subsample=0.8,
    random_state=20
)
clf.fit(X_train, y_train)

# Model evaluation
print("train accuracy= {:.3%}".format(clf.score(X_train, y_train)))
print("test accuracy= {:.3%}".format(clf.score(X_test, y_test)))

cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("CV scores:", cv_scores)
print(f"Mean CV accuracy: {cv_scores.mean():.3%}")
print(f"Std CV accuracy: {cv_scores.std():.3%}")

# Classification report
y_test_pred = clf.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", cm)

train accuracy= 99.296%
test accuracy= 97.203%
CV scores: [0.93023256 0.95294118 0.94117647 0.94117647 0.95294118]
Mean CV accuracy: 94.369%
Std CV accuracy: 0.854%

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96        53
           1       0.99      0.97      0.98        90

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143

Confusion Matrix:
 [[52  1]
 [ 3 87]]


## XGBoost

In [119]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

# Load the data
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20, stratify=y
)

# Define hyperparameters for RandomizedSearchCV
param_distributions = {
    'learning_rate': [0.005, 0.01, 0.02],
    'n_estimators': [500, 600, 700],
    'max_depth': [2, 3],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'gamma': [0.1, 0.3],
    'reg_alpha': [0.5, 0.7, 1],
    'reg_lambda': [2, 3, 4],
    'min_child_weight': [3, 5],
    'max_delta_step': [1]  # Stabilization for class imbalance
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=20,
        verbosity=0
    ),
    param_distributions=param_distributions,
    n_iter=75,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=0
)

random_search.fit(X_train, y_train)

# Best parameters
best_params = random_search.best_params_
print("Best parameters found:", best_params)

# Best model
best_model = random_search.best_estimator_

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {np.mean(cv_scores):.3%}")
print(f"Std CV accuracy: {np.std(cv_scores):.3%}")

# Train/Test accuracy
train_accuracy = best_model.score(X_train, y_train)
test_accuracy = best_model.score(X_test, y_test)
print(f"\nTrain accuracy = {train_accuracy:.3%}")
print(f"Test accuracy  = {test_accuracy:.3%}")

# Classification report
y_test_pred = best_model.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", cm)

Best parameters found: {'subsample': 0.7, 'reg_lambda': 3, 'reg_alpha': 1, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 3, 'max_delta_step': 1, 'learning_rate': 0.02, 'gamma': 0.1, 'colsample_bytree': 0.6}
Cross-validation scores: [0.95604396 0.97802198 0.96703297 0.95604396 0.97802198]
Mean CV accuracy: 96.703%
Std CV accuracy: 0.983%

Train accuracy = 99.341%
Test accuracy  = 96.491%

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95        42
           1       0.97      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114

Confusion Matrix:
 [[40  2]
 [ 2 70]]


In [151]:
import pandas as pd

# Data based on model results
data = {
    'Model': ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'XGBoost'],
    'Train Accuracy (%)': [97.653, 98.122, 99.531, 99.341],
    'Test Accuracy (%)': [95.105, 97.203, 97.902, 96.491],
    'Mean CV Accuracy (%)': [92.246, 94.837, 94.605, 96.703],
    'Std CV Accuracy (%)': [1.780, 1.192, 0.913, 0.983],
    'Precision Class 0': [0.90, 0.96, 0.98, 0.95],
    'Recall Class 0': [0.98, 0.96, 0.96, 0.95],
    'Precision Class 1': [0.99, 0.98, 0.98, 0.97],
    'Recall Class 1': [0.93, 0.98, 0.99, 0.97],
    'Errors (Confusion Matrix)': [7, 4, 3, 4],
    'Rank': ['4th Place: Decision Tree (Simple baseline)',
             '3rd Place: Random Forest (Good stability)',
             '2nd Place: Gradient Boosting (Highest accuracy but slight overfitting)',
             '1st Place: XGBoost (Best balance)']
}

# Creating a DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,Model,Train Accuracy (%),Test Accuracy (%),Mean CV Accuracy (%),Std CV Accuracy (%),Precision Class 0,Recall Class 0,Precision Class 1,Recall Class 1,Errors (Confusion Matrix),Rank
0,Decision Tree,97.653,95.105,92.246,1.78,0.9,0.98,0.99,0.93,7,4th Place: Decision Tree (Simple baseline)
1,Random Forest,98.122,97.203,94.837,1.192,0.96,0.96,0.98,0.98,4,3rd Place: Random Forest (Good stability)
2,Gradient Boosting,99.531,97.902,94.605,0.913,0.98,0.96,0.98,0.99,3,2nd Place: Gradient Boosting (Highest accuracy...
3,XGBoost,99.341,96.491,96.703,0.983,0.95,0.95,0.97,0.97,4,1st Place: XGBoost (Best balance)
