In [25]:
import pandas as pd

df = pd.read_csv("breast_cancer_data.csv", index_col=False)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

df.drop("id", axis = 1, inplace = True) #uneccessary for classification.
df["diagnosis"] = df["diagnosis"].map({'M': 1, 'B' : 0}) #one hot encoding the target variable.

#splitting features columns from the target variable column.
X = df.drop("diagnosis", axis =1)
Y = df["diagnosis"]

#making a transformer that scales all numerical values.
numerical_cols = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers = [("num", StandardScaler(), numerical_cols)])

partitions = [
    (0.2, 0.8, "20/80"),
    (0.5, 0.5, "50/50"),
    (0.8, 0.2, "80/20"),
]

KeyError: "['id'] not found in axis"

In [3]:
#starting code for random forest classification.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import numpy as np
import warnings
warnings.filterwarnings("ignore")

results=[]
for training_size, testing_size, partition_name in partitions:
    print(f"\n{'='*50}")
    print(f"Partition: {partition_name} (Train/Test)")
    print(f"\n{'='*50}")

    current_partition_results = []
    for trial in range(3):
        print(f"Trial: {trial+1}")

        #splitting training and testing data and building a pipeline for processing data and classification.
        X_training, X_testing, Y_training, Y_testing = train_test_split(X, Y, stratify = Y, test_size=testing_size, random_state=42+trial)
        random_forest_pipeline = Pipeline([("preprocessor", preprocessor),
                                          ("classifier", RandomForestClassifier(random_state=42 + trial))])

        #Hyperparameters of random forest classifier.
        parameter_grid = {
            'classifier__n_estimators': [50, 100, 200],  # Number of trees
            'classifier__max_depth': [None, 5, 10, 15, 25, 30],  # Maximum depth of trees
            'classifier__min_samples_split': [2, 5, 10],  # Minimum samples to split a node
            'classifier__min_samples_leaf': [1, 2, 4],  # Minimum samples at a leaf node
        }

        #tuning hyperparameters
        grid_search = GridSearchCV(random_forest_pipeline, parameter_grid, n_jobs = -1, cv = 5, verbose = 0,scoring="accuracy")
        grid_search.fit(X_training, Y_training)
        optimum_model = grid_search.best_estimator_

        #getting predictions from the best model with optimum hyperparameters.
        training_prediction = optimum_model.predict(X_training)
        testing_prediction = optimum_model.predict(X_testing)

        #calculating accuracies for training, cross validation and testing.
        training_accuracy = accuracy_score(Y_training, training_prediction)
        testing_accuracy = accuracy_score(Y_testing, testing_prediction)
        print(f"Best Hyperparameters: {grid_search.best_params_}")
        print(f"Training Accuracy: {training_accuracy:.4f}")
        print(f"Testing Accuracy: {testing_accuracy:.4f}")
        print(f"Cross validation accuracy: {grid_search.best_score_:.4f}")

        trial_results = {
            "Partition name": partition_name,
            "Trial number": trial+1,
            "Training accuracy": training_accuracy,
            "Testing accuracy": testing_accuracy,
            "Cross validation accuracy":grid_search.best_score_,
            "Best parameters":grid_search.best_params_
        }
        current_partition_results.append(trial_results)

    #calculating average accuracies
    avg_training_accuracy = np.mean([r["Training accuracy"] for r in current_partition_results])
    avg_testing_accuracy = np.mean([r["Testing accuracy"] for r in current_partition_results])
    avg_cross_validation_accuracy = np.mean([r["Cross validation accuracy"] for r in current_partition_results])

    avg_accuracy_summary = {
        "Partition name": partition_name,
        "Partition results": current_partition_results,
        "Average training accuracy": avg_training_accuracy,
        "Average testing accuracy": avg_testing_accuracy,
        "Average cross validation accuracy": avg_cross_validation_accuracy,
    }
    results.append(avg_accuracy_summary)


Partition: 20/80 (Train/Test)

Trial: 1
Best Hyperparameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Training Accuracy: 1.0000
Testing Accuracy: 0.9605
Cross validation accuracy: 0.9123
Trial: 2
Best Hyperparameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Training Accuracy: 1.0000
Testing Accuracy: 0.9364
Cross validation accuracy: 0.9826
Trial: 3
Best Hyperparameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
Training Accuracy: 0.9912
Testing Accuracy: 0.9386
Cross validation accuracy: 0.9202

Partition: 50/50 (Train/Test)

Trial: 1
Best Hyperparameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}
Training Accura

In [4]:
#printing results of rf classifier and storing them into a csv file.
print(f"\n{'='*60}")
print("Final summary - random forest on Breast Cancer Dataset")
print(f"{'='*60}")

for result in results:
    print(f"Partition: {result['Partition name']}")
    print(f"  Avg Train Accuracy: {result["Average training accuracy"]:.4f}")
    print(f"  Avg CV Accuracy: {result['Average cross validation accuracy']:.4f}")
    print(f"  Avg Test Accuracy: {result['Average testing accuracy']:.4f}")
    print()

compiled_results = []
for result in results:
    for partition in result["Partition results"]:
        compiled_results.append(partition)

random_forest_results_csv = pd.DataFrame(compiled_results)
random_forest_results_csv.to_csv("random_forest_results.csv", index=False)
print("results saved to random_forest_results.csv successfully!")


Final summary - random forest on Breast Cancer Dataset
Partition: 20/80
  Avg Train Accuracy: 0.9971
  Avg CV Accuracy: 0.9383
  Avg Test Accuracy: 0.9452

Partition: 50/50
  Avg Train Accuracy: 0.9977
  Avg CV Accuracy: 0.9601
  Avg Test Accuracy: 0.9591

Partition: 80/20
  Avg Train Accuracy: 0.9985
  Avg CV Accuracy: 0.9648
  Avg Test Accuracy: 0.9561

results saved to random_forest_results.csv successfully!
