1. Loading and Preprocessing (2 marks)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the breast cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data,columns=data.feature_names)
y = pd.Series(data.target)

In [3]:
# Check any missing values
print(X.isnull().sum())
print("Total missing value: ",X.isnull().sum().sum())

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64
Total missing value:  0


In [4]:
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# Spliting dataset for train and test
X_train,X_test,y_train,y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state=42)

2. Classification Algorithm Implementation (5 marks)

In [6]:
# feature selection
from sklearn.feature_selection import SelectKBest, f_classif
feature_names = data.feature_names

def perfom_feature_selection(X_train,X_test,y_train,k=10):
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    selected_feature_mask = selector.get_support()
    selected_features = feature_names[selected_feature_mask]
    
    feature_scores = pd.DataFrame({
        'Feature' : feature_names,
        'Score' : selector.scores_
        })
    feature_scores = feature_scores.sort_values('Score', ascending =False)
    return X_train_selected,X_test_selected,selected_features, feature_scores

In [7]:
# Perform feature selection
k_features = 10
X_train_selected,X_test_selected,selected_features,feature_scores = perfom_feature_selection(X_train,X_test,y_train, k =k_features)

In [8]:
# printing selected features
print("\nTop 10 Selected Features:")
print("-" * 30)
for i, feature in enumerate(selected_features, 1):
    score = feature_scores[feature_scores['Feature'] == feature]['Score'].values[0]
    print(f"{i}. {feature}: {score:.2f}")


Top 10 Selected Features:
------------------------------
1. mean radius: 482.23
2. mean perimeter: 522.49
3. mean area: 423.65
4. mean concavity: 396.66
5. mean concave points: 695.18
6. worst radius: 645.35
7. worst perimeter: 681.26
8. worst area: 495.79
9. worst concavity: 331.33
10. worst concave points: 746.49


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize and train the model
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(X_train,y_train)
logreg_pred = logreg.predict(X_test)

logreg_accuracy = accuracy_score(y_test,logreg_pred)

Logistic Regression is a statistical method for predicting binary classes. The outcome is modeled by a logistic function, making it suitable for this binary classification task.

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the model
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
dtree_pred = dtree.predict(X_test)

dtree_accuracy = accuracy_score(y_test,dtree_pred)

Decision Trees utilize a tree-like model of decisions. They are interpretable and can capture non-linear relationships without requiring scaling.

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test,rf_pred)

Random Forest is an ensemble learning method that builds multiple decision trees and merges them to get a more accurate and stable prediction.

In [18]:
from sklearn.svm import SVC

# Initialize and train the model
svm = SVC()
svm.fit(X_train,y_train)
svm_pred = svm.predict(X_test)

svm_accuracy = accuracy_score(y_test,svm_pred)

SVM constructs a hyperplane in a high-dimensional space to separate classes. It is effective for high-dimensional datasets and can model complex boundaries.

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the model  
knn = KNeighborsClassifier()  
knn.fit(X_train, y_train)  
knn_pred = knn.predict(X_test)  

knn_accuracy = accuracy_score(y_test, knn_pred)  

k-NN classifies instances based on their closest training examples. It works well with small datasets and is intuitive, but may struggle with larger datasets and more complex boundaries.

3. Model Comparison (2 marks)

In [25]:
# Store accuracies in a dictionary for comparison  
accuracies = {  
    'Logistic Regression': logreg_accuracy,  
    'Decision Tree': dtree_accuracy,  
    'Random Forest': rf_accuracy,  
    'SVM': svm_accuracy,  
    'k-NN': knn_accuracy,  
}  

# Display the accuracy for each model  
for model, accuracy in accuracies.items():  
    print(f"{model}: {accuracy:.4f}")  
print("\n")
# Identify the best and worst performing algorithms  
best_model = max(accuracies, key=accuracies.get)  
worst_model = min(accuracies, key=accuracies.get)  

print(f"Best Model: {best_model} with accuracy: {accuracies[best_model]:.4f}")  
print(f"Worst Model: {worst_model} with accuracy: {accuracies[worst_model]:.4f}")  

Logistic Regression: 0.9737
Decision Tree: 0.9386
Random Forest: 0.9649
SVM: 0.9737
k-NN: 0.9474


Best Model: Logistic Regression with accuracy: 0.9737
Worst Model: Decision Tree with accuracy: 0.9386
