In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Column names for the header-less WDBC file: an id, the diagnosis label, then
# ten measurements, each repeated with the suffixes _mean, _se and _worst.
measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dimension',
]
columns = ["id", "diagnosis"] + [
    f"{measurement}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for measurement in measurements
]

# The file has no header row; the regex separator swallows any whitespace
# around the commas (regex separators need the python parsing engine).
df = pd.read_csv(
    'breast+cancer+wisconsin+diagnostic/wdbc.txt',
    sep=r"\s*,\s*",
    engine="python",
    header=None,
)
df.columns = columns

# Persist the named table (still including "id") before trimming it for modelling.
df.to_csv('breast_cancer_data.csv', index=False)

# The id column is dropped from the working frame.
df = df.drop(columns="id")
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Data processing: work on a separate frame so `df` keeps its original
# 'B'/'M' labels, and encode the target as B -> 0, M -> 1.
data = df.assign(diagnosis=df["diagnosis"].map({'B': 0, 'M': 1}))

# Separate the target from the feature matrix.
Y = data["diagnosis"]
X = data.drop(columns="diagnosis")

# Every remaining column is passed through a StandardScaler.
numerical_cols = list(X.columns)
preprocessor = ColumnTransformer([('num', StandardScaler(), numerical_cols)])

# Train/test partitions as (train fraction, test fraction, label).
partitions = [
    (0.2, 0.8, "20/80"),
    (0.5, 0.5, "50/50"),
    (0.8, 0.2, "80/20"),
]

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import numpy as np

# Cross-validation, training and testing.
# For every train/test partition the experiment is repeated three times with
# different seeds; per-trial metrics are stored and then averaged per partition.


def _mean_metric(trials, key):
    """Return the mean of `key` across a list of per-trial result dicts."""
    return np.mean([trial[key] for trial in trials])


# Hyperparameter grid; it is identical for every trial, so it is built once.
# The "classifier__" prefix routes each setting to the pipeline's 'classifier' step.
parameter_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 4, 5],
    'classifier__learning_rate': [0.1, 0.2],
    'classifier__subsample': [0.8, 1.0],
}

results = []
for training_size, testing_size, partition_name in partitions:
    # Only testing_size drives the split; training_size is not used below.
    print(f"\n{'='*50}")
    print(f"Partition: {partition_name} (Train/Test)")
    print(f"{'='*50}")

    current_partition_results = []
    # Three trials per partition, each with its own seed (42, 43, 44).
    for trial_number in range(3):
        print(f"Trial number: {trial_number+1}")
        seed = 42 + trial_number

        # Stratified split, then a scaler + XGBoost pipeline seeded like the split.
        X_training, X_testing, Y_training, Y_testing = train_test_split(
            X,
            Y,
            test_size=testing_size,
            random_state=seed,
            stratify=Y,
        )
        classifier = XGBClassifier(random_state=seed, eval_metric='logloss')
        xg_boost_pipeline = Pipeline([
            ('Preprocessor', preprocessor),
            ('classifier', classifier),
        ])

        # 5-fold grid search on the training portion, scored by accuracy.
        grid_search = GridSearchCV(
            estimator=xg_boost_pipeline,
            param_grid=parameter_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=0,
        )
        grid_search.fit(X_training, Y_training)
        optimum_model = grid_search.best_estimator_

        # Label predictions from the best model on both portions.
        training_prediction = optimum_model.predict(X_training)
        testing_prediction = optimum_model.predict(X_testing)

        # Accuracy on the training and the held-out portion.
        training_accuracy = accuracy_score(Y_training, training_prediction)
        testing_accuracy = accuracy_score(Y_testing, testing_prediction)

        # Record of this trial; these keys become the columns of the results table.
        trial_result = {
            "Partition name": partition_name,
            "Trial number": trial_number+1,
            "Training accuracy": training_accuracy,
            "Testing accuracy": testing_accuracy,
            "Cross validation accuracy": grid_search.best_score_,
            "Best parameters": grid_search.best_params_,
        }

        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"training_Accuracy: {training_accuracy:.4f}")
        print(f"testing_accuracy: {testing_accuracy:.4f}")
        print(f"cv_accuracy: {grid_search.best_score_:.4f}")

        current_partition_results.append(trial_result)

    # Average training, testing and cross-validation accuracy over the trials.
    avg_training_accuracy = _mean_metric(current_partition_results, "Training accuracy")
    avg_testing_accuracy = _mean_metric(current_partition_results, "Testing accuracy")
    avg_cv_accuracy = _mean_metric(current_partition_results, "Cross validation accuracy")

    average_accuracy_summary = {
        "Partition name": partition_name,
        "Partition results": current_partition_results,
        "Average training accuracy": avg_training_accuracy,
        "Average testing accuracy": avg_testing_accuracy,
        "Average cross validation accuracy": avg_cv_accuracy,
    }
    results.append(average_accuracy_summary)


Partition: 20/80 (Train/Test)
Trial number: 1
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}
training_Accuracy: 1.0000
testing_accuracy: 0.9759
cv_accuracy: 0.9126
Trial number: 2
Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}
training_Accuracy: 1.0000
testing_accuracy: 0.9364
cv_accuracy: 0.9206
Trial number: 3
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}
training_Accuracy: 1.0000
testing_accuracy: 0.9452
cv_accuracy: 0.9379

Partition: 50/50 (Train/Test)
Trial number: 1
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 5, 'classifier__n_estimators': 200, 'classifier__subsample': 0.8}
training_Accuracy: 1.0000
testing_accuracy: 0.9754
cv_accuracy: 0.9612
Trial number: 2
Be

In [9]:
# Print the per-partition averages and export every individual trial to CSV.
# Expects `results` to be a list of per-partition summary dicts, each holding
# the averaged accuracies plus the list of trial records under "Partition results".
print(f"\n{'='*60}")
print("Final summary - XGBoost on Breast Cancer Dataset")
print(f"{'='*60}")

for result in results:
    print(f"Partition: {result['Partition name']}")
    # Single-quoted keys inside the double-quoted f-string: reusing the outer
    # quote character is a SyntaxError on Python versions before 3.12.
    print(f"  Avg Train Accuracy: {result['Average training accuracy']:.4f}")
    print(f"  Avg CV Accuracy: {result['Average cross validation accuracy']:.4f}")
    print(f"  Avg Test Accuracy: {result['Average testing accuracy']:.4f}")
    print()


# Flatten the per-partition trial lists into one list of records (one row per trial).
compiled_results = []
for result in results:
    compiled_results.extend(result["Partition results"])

xgboost_results_csv = pd.DataFrame(compiled_results)
xgboost_results_csv.to_csv("xgboost_results.csv", index=False)
print("results saved to xgboost_results.csv successfully!")


Final summary - XGBoost on Breast Cancer Dataset
Partition: 20/80
  Avg Train Accuracy: 1.0000
  Avg CV Accuracy: 0.9237
  Avg Test Accuracy: 0.9525

Partition: 50/50
  Avg Train Accuracy: 1.0000
  Avg CV Accuracy: 0.9613
  Avg Test Accuracy: 0.9708

Partition: 80/20
  Avg Train Accuracy: 1.0000
  Avg CV Accuracy: 0.9692
  Avg Test Accuracy: 0.9649

results saved to xgboost_results.csv successfully!
