In [1]:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


In [2]:

# Load the scikit-learn breast cancer dataset: features go into a DataFrame
# labelled with the dataset's own feature names, the target stays an array.
data = load_breast_cancer()
y = data.target
X = pd.DataFrame(data.data, columns=data.feature_names)

# Sanity check for missing values; any NaN found is replaced by the mean of
# its column.
if X.isna().any().any():
    print("Missing values detected, imputing with mean...")
    X.fillna(X.mean(), inplace=True)
else:
    print("No missing values detected.")


No missing values detected.


In [3]:

# Split BEFORE scaling. Fitting StandardScaler on the full dataset lets the
# mean/variance of the test rows influence the training features (data
# leakage) and makes the held-out scores optimistic. The row assignment only
# depends on the number of samples and random_state, so this produces the same
# train/test rows as splitting the pre-scaled array did.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell referring to X_scaled still works: the full
# feature matrix, scaled with the training-set statistics.
X_scaled = scaler.transform(X)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (455, 30)
Shape of X_test: (114, 30)
Shape of y_train: (455,)
Shape of y_test: (114,)


In [4]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Logistic-regression baseline: fit on the training split, predict the
# held-out split. max_iter=1000 raises the solver's iteration cap.
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")


Confusion Matrix:
[[41  2]
 [ 1 70]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Accuracy: 0.97


In [None]:
Logistic Regression is a linear model that estimates the probability of a binary outcome (e.g., malignant vs. benign) using the logistic function. It computes the log-odds of the target variable as a linear combination of the input features.
Suitability:
Works well for linearly separable datasets.
Fast and interpretable.
Suitable for the breast cancer dataset because it is relatively small and well-structured, and linear relationships are often sufficient for good performance.


In [6]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Single decision tree with default settings (no depth limit or pruning);
# random_state is pinned so repeated runs build the same tree.
dt_classifier = DecisionTreeClassifier(random_state=42)
y_pred = dt_classifier.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Emit the three evaluation sections in one call, one item per line.
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    f"\nAccuracy: {accuracy:.2f}",
    sep="\n",
)


Confusion Matrix:
[[40  3]
 [ 3 68]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


Accuracy: 0.95


In [None]:
Decision Trees split the data into subsets based on feature values, aiming to maximize information gain or reduce impurity (e.g., Gini index or entropy). Each decision boundary corresponds to a feature threshold.

Captures non-linear relationships in the data.
Easy to interpret.
Suitable for the breast cancer dataset as it can handle both categorical and continuous data, though it might overfit without pruning or constraints.


In [7]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Random forest of 100 trees; random_state fixes the randomness used while
# building the ensemble so the run is repeatable.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    f"\nAccuracy: {accuracy:.2f}",
    sep="\n",
)


Confusion Matrix:
[[40  3]
 [ 1 70]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114


Accuracy: 0.96


In [None]:
Random Forest is an ensemble of decision trees. Each tree is trained on a random subset of the data and features, and predictions are made by majority voting. This reduces overfitting compared to a single decision tree.
Robust to overfitting.
Handles non-linear relationships and feature interactions well.
Suitable for the breast cancer dataset because it balances accuracy and generalization, even if the data is noisy or slightly imbalanced.

In [8]:

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Support vector classifier with a linear kernel, trained on the training
# split and evaluated on the held-out split.
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# Per-class error breakdown first, then the precision/recall/F1 summary.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred), "\nClassification Report:", sep="\n")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")


Confusion Matrix:
[[41  2]
 [ 3 68]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        43
           1       0.97      0.96      0.96        71

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114


Accuracy: 0.96


In [None]:
SVMs find a hyperplane in a high-dimensional space that best separates the classes. It maximizes the margin between the hyperplane and the nearest data points (support vectors). With kernels, SVM can handle non-linear separations.
Suitability:
Effective in high-dimensional spaces.
Works well for datasets with clear class separability.
Suitable for the breast cancer dataset due to its structured and well-separated features, especially when using a linear or RBF kernel.

In [9]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# k-nearest-neighbours classifier that votes over the 5 closest training
# points.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
y_pred = knn_classifier.fit(X_train, y_train).predict(X_test)

# Compute the headline metric up front, then print the full evaluation.
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print(f"\nAccuracy: {accuracy:.2f}")


Confusion Matrix:
[[40  3]
 [ 3 68]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


Accuracy: 0.95


In [None]:
k-NN classifies a data point based on the majority class of its nearest neighbors (using a distance metric like Euclidean distance). No training phase is required beyond storing the data.
Suitability:
Simple and intuitive.
Performs well on smaller datasets where the feature space is not too complex.
Suitable for the breast cancer dataset as the size is manageable and feature scaling ensures effective distance-based comparisons.

In [10]:

from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


import pandas as pd

# The five classifiers compared in this notebook, keyed by the label that
# becomes the row name in the summary table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', random_state=42),
    "k-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
}

# Train each model on the training split and collect accuracy plus the
# support-weighted precision / recall / F1 on the held-out split.
results = {}
for model_name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted = report["weighted avg"]
    f1_score = weighted["f1-score"]

    results[model_name] = dict(
        zip(
            ("Accuracy", "F1-Score", "Precision", "Recall"),
            (accuracy, f1_score, weighted["precision"], weighted["recall"]),
        )
    )

# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)


                        Accuracy  F1-Score  Precision    Recall
Logistic Regression     0.973684  0.973621   0.973719  0.973684
Decision Tree           0.947368  0.947368   0.947368  0.947368
Random Forest           0.964912  0.964738   0.965205  0.964912
Support Vector Machine  0.956140  0.956237   0.956488  0.956140
k-Nearest Neighbors     0.947368  0.947368   0.947368  0.947368


In [None]:
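Best Performing Algorithm is Logistic Regression.
Logistic Regression achieved the highest accuracy (97.4%) and the best weighted precision, recall, and F1-score (all about 0.974) in the results table above, making it the most effective model on this test split. A linear decision boundary appears to be sufficient for these standardized features.
Random Forest was a close second (96.5% accuracy). It is robust to overfitting and captures non-linear relationships effectively, but that extra flexibility did not translate into better scores than the linear model here.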


Worst Performing Algorithm is Decision Tree Classifier (tied with k-Nearest Neighbors):
Decision Tree performed the worst, most likely due to its tendency to overfit, particularly on small datasets like this one.
Lowest accuracy (94.7%) and F1-score (94.7%), the same scores as k-Nearest Neighbors. Likely overfitted or less generalizable compared to Random Forest, which benefits from ensemble methods.
