In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.datasets import load_breast_cancer

# Fetch the breast cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# Pull out the feature labels and the class labels, then show both
features, target = data.feature_names, data.target_names
print("Feature Names:", features)
print("Target Names:", target)

Feature Names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Target Names: ['malignant' 'benign']


In [5]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names
df = pd.DataFrame(data=data.data, columns=features)

# Count the missing entries in every column (isna is the same check as isnull)
print(df.isna().sum())

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64


In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature: learn the per-feature statistics, then apply them
scaler = StandardScaler()
scaler.fit(data.data)
data_scaled = scaler.transform(data.data)

In [9]:
# Sanity check: per-feature mean and standard deviation of the standardized matrix
print("Mean after scaling:", data_scaled.mean(axis=0))
print("Standard deviation after scaling:", data_scaled.std(axis=0))

Mean after scaling: [-3.16286735e-15 -6.53060890e-15 -7.07889127e-16 -8.79983452e-16
  6.13217737e-15 -1.12036918e-15 -4.42138027e-16  9.73249991e-16
 -1.97167024e-15 -1.45363120e-15 -9.07641468e-16 -8.85349205e-16
  1.77367396e-15 -8.29155139e-16 -7.54180940e-16 -3.92187747e-16
  7.91789988e-16 -2.73946068e-16 -3.10823423e-16 -3.36676596e-16
 -2.33322442e-15  1.76367415e-15 -1.19802625e-15  5.04966114e-16
 -5.21317026e-15 -2.17478837e-15  6.85645643e-16 -1.41265636e-16
 -2.28956670e-15  2.57517109e-15]
Standard deviation after scaling: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [None]:
# Preprocessing steps for the breast cancer dataset
#
# 1. Load the dataset:
#    First we load the breast cancer dataset that ships with scikit-learn.
# 2. Check for missing values:
#    This dataset contains no missing values, so no imputation is needed.
# 3. Feature scaling:
#    The features in this dataset are on very different scales, so we apply feature scaling.
#    Standardization rescales each feature to a mean of 0 and a standard deviation of 1.
#    This lets the model treat all features equally and typically makes training faster
#    and more accurate for scale-sensitive models such as logistic regression, SVM and k-NN.



In [23]:
# Logistic Regression
# Logistic Regression predicts which of two categories a sample belongs to (here: malignant or benign).
# It is a simple model that works well when the relationship between the features and the outcome is linear.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* data first so the test rows stay unseen during preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse its statistics for the test rows.
# Standardizing the whole dataset before splitting would leak test-set means and
# variances into training and make the reported accuracy optimistic.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Initialize and train the Logistic Regression model
# (max_iter is raised well above the default so the solver has room to converge)
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = log_reg.predict(X_test)

# Evaluate the model: fraction of held-out samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)

Logistic Regression Accuracy: 0.9736842105263158


In [13]:
# Decision Tree Classifier
#
# A decision tree works like a flowchart.
# For the breast cancer dataset it can separate the tumor types based on feature values.

from sklearn.tree import DecisionTreeClassifier

# Build the tree and fit it on the training split in one step (fit returns the estimator)
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
y_pred_dt = dt_classifier.predict(X_test)

# Fraction of held-out samples classified correctly
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print("Decision Tree Accuracy:", accuracy_dt)

Decision Tree Accuracy: 0.9473684210526315


In [15]:
# Random Forest Classifier
#
# A random forest is like having many decision trees working together.
# It is usually more accurate and reliable than a single tree because combining trees
# reduces the risk of overfitting. It works well for datasets with complex patterns.

from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree forest and fit it on the training split (fit returns the estimator)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
y_pred_rf = rf_classifier.predict(X_test)

# Fraction of held-out samples classified correctly
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 0.9649122807017544


In [17]:
# Support Vector Machine (SVM)
#
# An SVM tries to draw the line or boundary that best divides the data into two groups.
# If the data is complicated, an SVM can also use curved boundaries to separate the groups.
# It does well when the two groups can be divided clearly, and it copes with
# high-dimensional data, which is the case for the breast cancer dataset.

from sklearn.svm import SVC

# Build a linear-kernel SVM and fit it on the training split (fit returns the estimator)
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
y_pred_svm = svm_classifier.predict(X_test)

# Fraction of held-out samples classified correctly
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

SVM Accuracy: 0.956140350877193


In [19]:
# k-Nearest Neighbors (k-NN)
#
# k-NN looks at the 'k' closest points to a new point and predicts the majority class among them.
# For breast cancer, samples with similar cell features likely belong to the same class,
# so this method can be effective.

from sklearn.neighbors import KNeighborsClassifier

# Build a 5-neighbor classifier and fit it on the training split (fit returns the estimator)
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict labels for the held-out split
y_pred_knn = knn_classifier.predict(X_test)

# Fraction of held-out samples classified correctly
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print("k-NN Accuracy:", accuracy_knn)

k-NN Accuracy: 0.9473684210526315


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Compare all five classifiers on one shared train/test split.

# Split the *unscaled* data first so the test rows stay unseen during preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse its statistics for the test rows.
# Standardizing the whole dataset before splitting would leak test-set information
# into training and bias every accuracy below upward.
comparison_scaler = StandardScaler()
X_train = comparison_scaler.fit_transform(X_train_raw)
X_test = comparison_scaler.transform(X_test_raw)

# Initialize the models (same hyperparameters as the individual cells)
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "k-NN": KNeighborsClassifier(n_neighbors=5)
}

# Dictionary to store accuracy of each model, keyed by display name
accuracy_scores = {}

# Train each model, make predictions, and calculate accuracy
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[model_name] = accuracy
    print(f"{model_name} Accuracy: {accuracy:.4f}")

# Identify the best and worst performing models.
# Several models can reach exactly the same accuracy on a small test set, so collect
# every model that ties instead of silently reporting only the first one.
best_score = max(accuracy_scores.values())
worst_score = min(accuracy_scores.values())
best_models = [name for name, score in accuracy_scores.items() if score == best_score]
worst_models = [name for name, score in accuracy_scores.items() if score == worst_score]

# Keep the single-name variables: the first tied model in insertion order is the same
# value that max()/min() over the dict would return.
best_model = best_models[0]
worst_model = worst_models[0]

print(f"\nBest Performing Model: {', '.join(best_models)} with Accuracy: {accuracy_scores[best_model]:.4f}")
print(f"Worst Performing Model: {', '.join(worst_models)} with Accuracy: {accuracy_scores[worst_model]:.4f}")

Logistic Regression Accuracy: 0.9737
Decision Tree Accuracy: 0.9474
Random Forest Accuracy: 0.9649
SVM Accuracy: 0.9561
k-NN Accuracy: 0.9474

Best Performing Model: Logistic Regression with Accuracy: 0.9737
Worst Performing Model: Decision Tree with Accuracy: 0.9474
