In [1]:
# === Import Libraries ===
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from google.colab import files
uploaded = files.upload()

# Load the Dataset
file_name = 'Cancer_Data.xlsx'
df = pd.read_excel(file_name)

# Encode diagnosis (M = 1, B = 0)
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])

X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split and Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# === Classifiers Dictionary ===
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss'),
    "AdaBoost": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP Classifier": MLPClassifier(max_iter=1000)
}

# === Training and Evaluation ===
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    results.append({
        'Model': name,
        'Train Accuracy': accuracy_score(y_train, train_preds),
        'Test Accuracy': accuracy_score(y_test, test_preds),
        'Precision': classification_report(y_test, test_preds, output_dict=True)['1']['precision'],
        'Recall': classification_report(y_test, test_preds, output_dict=True)['1']['recall'],
        'F1 Score': classification_report(y_test, test_preds, output_dict=True)['1']['f1-score']
    })

results_df = pd.DataFrame(results)
print(results_df)


Saving Cancer_Data.xlsx to Cancer_Data.xlsx
                 Model  Train Accuracy  Test Accuracy  Precision    Recall  \
0  Logistic Regression        0.986813       0.973684   0.976190  0.953488   
1                  SVM        0.989011       0.982456   1.000000  0.953488   
2        Decision Tree        1.000000       0.938596   0.909091  0.930233   
3        Random Forest        1.000000       0.964912   0.975610  0.930233   
4              XGBoost        1.000000       0.956140   0.952381  0.930233   
5             AdaBoost        1.000000       0.964912   0.953488  0.953488   
6          Naive Bayes        0.934066       0.964912   0.975610  0.930233   
7       MLP Classifier        1.000000       0.973684   0.976190  0.953488   

   F1 Score  
0  0.964706  
1  0.976190  
2  0.919540  
3  0.952381  
4  0.941176  
5  0.953488  
6  0.952381  
7  0.964706  


Hyperparameters Tuning using Random Forest:

In [2]:
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier()
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=10, cv=5, verbose=1, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

print("Best Hyperparameters:", random_search.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20}
