In [5]:
# Breast Cancer Classification Assignment



# 1️⃣ Load and Preprocess the Breast Cancer Dataset

from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="target")

# Check for missing values
print("Missing values:\n", X.isnull().sum().sum())

# Feature scaling
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 🔍 Preprocessing Explanation:
# - Checked for missing values (none found)
# - Standardized features using StandardScaler to ensure all values are on the same scale

# 2️⃣ Classification Algorithms

# Initialize results dictionary
results = {}

# 1. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
results['Logistic Regression'] = accuracy_score(y_test, y_pred_lr)

# 2. Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
results['Decision Tree'] = accuracy_score(y_test, y_pred_dt)

# 3. Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results['Random Forest'] = accuracy_score(y_test, y_pred_rf)

# 4. Support Vector Machine
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
results['SVM'] = accuracy_score(y_test, y_pred_svm)

# 5. k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
results['k-NN'] = accuracy_score(y_test, y_pred_knn)

# 3️⃣ Model Comparison

# Print model accuracies
for model, acc in results.items():
    print(f"{model}: Accuracy = {acc:.4f}")

# Identify best and worst performing models
best_model = max(results, key=results.get)
worst_model = min(results, key=results.get)

print(f"\n✅ Best Performing Model: {best_model} ({results[best_model]:.4f})")
print(f"❌ Worst Performing Model: {worst_model} ({results[worst_model]:.4f})")

# 4️⃣ Summary


print("\nClassification Report (Best Model):")
if best_model == "Logistic Regression":
    print(classification_report(y_test, y_pred_lr))
elif best_model == "Decision Tree":
    print(classification_report(y_test, y_pred_dt))
elif best_model == "Random Forest":
    print(classification_report(y_test, y_pred_rf))
elif best_model == "SVM":
    print(classification_report(y_test, y_pred_svm))
else:
    print(classification_report(y_test, y_pred_knn))


Missing values:
 0
Logistic Regression: Accuracy = 0.9737
Decision Tree: Accuracy = 0.9474
Random Forest: Accuracy = 0.9649
SVM: Accuracy = 0.9737
k-NN: Accuracy = 0.9474

✅ Best Performing Model: Logistic Regression (0.9737)
❌ Worst Performing Model: Decision Tree (0.9474)

Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

