In [13]:
# Step 1: Import necessary libraries

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

In [14]:
# Step 2: Load the dataset

# The returned object exposes the feature matrix (.data), the labels (.target)
# and the column names (.feature_names), all of which are used below.
data = load_breast_cancer()

# Step 3: Convert to pandas DataFrame

# Build the feature frame and append the label column in one expression.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

In [15]:
# Step 4: Check for missing values

# Flag every null cell, then total the flags per column.
null_mask = df.isnull()
missing = null_mask.sum()
print("Missing values in each column:\n", missing)


Missing values in each column:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [None]:
# There are no missing values in the breast cancer dataset.

In [21]:
# Step 6: Feature Scaling (Standardization)

# Only the feature columns are standardized; the label column is set aside first.
feature_frame = df.drop(columns='target')
scaler = StandardScaler()
scaler.fit(feature_frame)
X_scaled = scaler.transform(feature_frame)


# Step 7: Create a new DataFrame for the scaled features

# Re-wrap the scaled array with the original column names and re-attach the label.
X_scaled_df = pd.DataFrame(X_scaled, columns=data.feature_names).assign(
    target=df['target'].values
)

# Preview the scaled data
X_scaled_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015,0
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119,0
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391,0
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501,0
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971,0


In [None]:
# Explanation of each step

# CHECKING FOR MISSING VALUES
# We use df.isnull().sum() to count the missing entries in each column.

# In this dataset, there are no missing values, so no imputation is needed.



# FEATURE SCALING (Standardization)
# We use StandardScaler() to rescale every feature to mean = 0 and standard deviation = 1.

# This is important because:

# Many machine learning models (e.g., SVM, KNN, Logistic Regression) are sensitive to the scale of features.

# The Breast Cancer dataset has features like mean area and mean smoothness on very different scales.

# Scaling helps gradient-based and distance-based models converge faster and often improves their performance.

In [None]:
# Qn 2

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


In [27]:
# Split features and target
#
# X and y keep their original meaning (the fully scaled feature frame and the
# label column) so that any cell relying on them continues to work.
X = X_scaled_df.drop('target', axis=1)
y = X_scaled_df['target']

# Train-Test Split
#
# The split is performed on the *unscaled* features. X_scaled_df was
# standardized with statistics computed over all rows, so splitting it would
# let the test rows influence the scaling used for training (data leakage).
# With the same random_state and the same number of rows, the partition is the
# same as splitting X would give.
X_raw = df.drop(columns='target')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the held-out rows. A separate scaler instance is used so the `scaler` fitted
# in the preprocessing cell is not modified.
split_scaler = StandardScaler()
X_train = pd.DataFrame(
    split_scaler.fit_transform(X_train_raw),
    columns=X_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_raw.columns,
    index=X_test_raw.index,
)

In [None]:
# Initialize and Train Classifiers

In [31]:
# Define all models
# Display name -> unfitted estimator, in the order they are reported.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Support Vector Machine", SVC()),
    ("k-Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
])

# Train and evaluate each model
for name in models:
    model = models[name]
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=data.target_names)
    # One print call, newline-separated: header, accuracy, then the per-class report.
    print(f"\n* {name}", f"Accuracy: {acc:.4f}", report, sep="\n")


* Logistic Regression
Accuracy: 0.9737
              precision    recall  f1-score   support

   malignant       0.98      0.95      0.96        43
      benign       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


* Decision Tree
Accuracy: 0.9474
              precision    recall  f1-score   support

   malignant       0.93      0.93      0.93        43
      benign       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


* Random Forest
Accuracy: 0.9649
              precision    recall  f1-score   support

   malignant       0.98      0.93      0.95        43
      benign       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0

In [33]:
# ALGORITHM EXPLANATION

# 1 Logistic Regression
#How it works: Uses the logistic (sigmoid) function to model the probability that an input belongs to a certain class. It's a linear model used for binary classification.

#Why suitable: This dataset is binary (malignant vs. benign), and logistic regression is often a strong baseline for classification tasks.


# 2 Decision Tree Classifier
# How it works: Splits the dataset into branches using rules based on feature values. Each branch ends in a decision (leaf node).

# Why suitable: Captures non-linear relationships, handles both numerical and categorical data, and is interpretable.


# 3 Random Forest Classifier
# How it works: An ensemble of decision trees trained on random subsets of data and features. Final prediction is a majority vote.

# Why suitable: Reduces overfitting, improves accuracy, and handles feature interactions well. Very effective for structured/tabular data like this.


# 4 Support Vector Machine (SVM)
# How it works: Finds the decision boundary that maximizes the margin between the two classes; a kernel function allows that boundary to be non-linear.

# Why suitable: Works well on small-to-medium datasets with many numeric features, and benefits from the feature scaling applied earlier (we used StandardScaler).




# 5  k-Nearest Neighbors (k-NN)
# How it works: Classifies a data point based on the majority class of its k nearest neighbors.

# Why suitable: Simple and effective for small datasets. Performance depends on the distance metric, which works better when features are scaled (we used StandardScaler).







In [None]:
# Qn 3

In [35]:
from sklearn.metrics import accuracy_score, classification_report

# Predict once per fitted model, keyed by the model's display name.
predictions = {label: clf.predict(X_test) for label, clf in models.items()}

# Dictionary to store accuracy results
accuracy_results = {
    label: accuracy_score(y_test, predicted)
    for label, predicted in predictions.items()
}

# Evaluate all models
for name, model in models.items():
    y_pred = predictions[name]
    acc = accuracy_results[name]
    report = classification_report(y_test, y_pred, target_names=data.target_names)
    print(f"\n* {name}")
    print(f"Accuracy: {acc:.4f}")
    print(report)


* Logistic Regression
Accuracy: 0.9737
              precision    recall  f1-score   support

   malignant       0.98      0.95      0.96        43
      benign       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


* Decision Tree
Accuracy: 0.9474
              precision    recall  f1-score   support

   malignant       0.93      0.93      0.93        43
      benign       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


* Random Forest
Accuracy: 0.9649
              precision    recall  f1-score   support

   malignant       0.98      0.93      0.95        43
      benign       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0

In [None]:
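# Best Performing Model (of the results shown above): Logistic Regression
# Accuracy: ~97% (0.9737)
# Random Forest is a close second at ~96% (0.9649). Check accuracy_results for the
# SVM and k-NN scores before drawing a final conclusion.

# Why Random Forest still performs strongly:

# Combines multiple decision trees, reducing overfitting.

# Handles complex interactions between features.

# Performs robustly even with minor data noise.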




# Worst Performing Model (of the results shown above): Decision Tree Classifier
# Accuracy: ~95% (0.9474)

# Why it performed worst:

# Single decision trees can overfit easily, especially without pruning or regularization.

# Less stable compared to ensemble methods (like Random Forest).

