In [None]:
#Loading and preprocessing
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# 1. Load the breast-cancer dataset bundled with scikit-learn
cancer = load_breast_cancer()
X = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
y = cancer.target  # class labels: 0 = Malignant, 1 = Benign

# 2. Hold out 30% of the rows for testing; stratify=y keeps the class ratio
#    the same in both splits, and random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# 3. Standardize the features. The scaler is fitted on the training split
#    only and then applied unchanged to both splits, so no test-set
#    statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Data loaded and preprocessed. Training shape: {X_train_scaled.shape}, Testing shape: {X_test_scaled.shape}")


Data loaded and preprocessed. Training shape: (398, 30), Testing shape: (171, 30)


In [None]:
#Classification and Algorithm Implementation
# Function to train and evaluate models
def evaluate_model(name, model, X_train, y_train, X_test, y_test, show_report=False):
    """Fit ``model`` on the training split and score it on the test split.

    Prints a header line and the test accuracy (4 decimal places) and,
    when requested, the classification report for the test predictions.

    Parameters
    ----------
    name : str
        Label used in the printed output and echoed back in the return value.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place by this call.
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the labels the
        predictions are compared against.
    show_report : bool, default False
        When true, also print ``classification_report(y_test, y_pred)``.
        Off by default so existing callers see unchanged output.

    Returns
    -------
    tuple
        ``(accuracy, name)``, where ``accuracy`` is the value returned by
        ``accuracy_score(y_test, y_pred)``.
    """
    print(f"--- Training {name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}\n")
    if show_report:
        # Opt-in detailed metrics; replaces the previously commented-out
        # line so the extra output can be enabled without editing the code.
        print(classification_report(y_test, y_pred))
    return accuracy, name

# Candidate classifiers, keyed by the display name used in the report.
# Insertion order is the order in which they are trained below.
models = dict([
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42, n_estimators=100)),
    ("SVM (RBF Kernel)", SVC(random_state=42)),
    ("k-NN (k=5)", KNeighborsClassifier(n_neighbors=5)),
])

# Train and score every candidate, collecting one record per model.
# All models are fed the standardized features: Logistic Regression, SVM and
# k-NN use the scaled data, and the tree-based models can use either, so a
# single input keeps the comparison uniform.
results = []
for name, model in models.items():
    acc, model_name = evaluate_model(
        name,
        model,
        X_train=X_train_scaled,
        y_train=y_train,
        X_test=X_test_scaled,
        y_test=y_test,
    )
    results.append({'Algorithm': model_name, 'Accuracy': acc})


--- Training Logistic Regression ---
Accuracy: 0.9883

--- Training Decision Tree ---
Accuracy: 0.9181

--- Training Random Forest ---
Accuracy: 0.9357

--- Training SVM (RBF Kernel) ---
Accuracy: 0.9766

--- Training k-NN (k=5) ---
Accuracy: 0.9591



In [None]:
#Model comparison
# Build the comparison table from the collected records, highest accuracy first
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)

print("\n--- Model Performance Comparison (Accuracy Score) ---")
print(results_df.to_markdown(index=False))

# The table is sorted in descending order, so the first row is the best
# performer and the last row is the worst.
best_model, worst_model = results_df.iloc[0], results_df.iloc[-1]

print(f"\nBest Performing Algorithm: {best_model['Algorithm']} with Accuracy: {best_model['Accuracy']:.4f}")
print(f"Worst Performing Algorithm: {worst_model['Algorithm']} with Accuracy: {worst_model['Accuracy']:.4f}")



--- Model Performance Comparison (Accuracy Score) ---
| Algorithm           |   Accuracy |
|:--------------------|-----------:|
| Logistic Regression |   0.988304 |
| SVM (RBF Kernel)    |   0.976608 |
| k-NN (k=5)          |   0.959064 |
| Random Forest       |   0.935673 |
| Decision Tree       |   0.918129 |

Best Performing Algorithm: Logistic Regression with Accuracy: 0.9883
Worst Performing Algorithm: Decision Tree with Accuracy: 0.9181
