# Breast Cancer Classification with PCA + Logistic Regression
# -----------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



# 2. Load the data
# wdbc.data ships without a header row, so the column names are supplied here.
# Layout: id, diagnosis, then ten cell-nucleus measurements repeated for three
# summary statistics (mean, se, worst) -- 32 columns in total.
base_measurements = (
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry", "fractal_dimension",
)
stat_suffixes = ("mean", "se", "worst")

# The outer loop runs over the statistic so all *_mean columns come first,
# then all *_se, then all *_worst.
col_names = ["id", "diagnosis"] + [
    f"{measurement}_{suffix}"
    for suffix in stat_suffixes
    for measurement in base_measurements
]

df = pd.read_csv("wdbc.data", names=col_names, header=None)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [2]:
# 3. Drop the ID column and convert diagnosis to 0/1
# This cell is written to be safe to re-run: a plain drop would raise KeyError
# once 'id' is gone, and a plain .map({'M': 1, 'B': 0}) would silently turn
# already-encoded 0/1 values into NaN on a second execution.
label_codes = {'M': 1, 'B': 0}

# errors='ignore' makes the drop a no-op when 'id' was already removed.
df = df.drop(columns=['id'], errors='ignore')

# Translate raw labels; anything not in label_codes (including values already
# encoded by an earlier run) passes through unchanged so it can be checked below.
encoded_diagnosis = df['diagnosis'].map(lambda label: label_codes.get(label, label))

# Fail loudly instead of carrying NaN / unknown labels into the model.
unexpected = encoded_diagnosis[~encoded_diagnosis.isin([0, 1])]
if not unexpected.empty:
    raise ValueError(
        f"Unexpected diagnosis labels: {sorted(unexpected.astype(str).unique())}"
    )

df['diagnosis'] = encoded_diagnosis.astype('int64')

In [3]:
# 4. Separate features and target
# X holds every column except the label; y is the label column itself.
target_col = 'diagnosis'
y = df[target_col]
X = df.drop(target_col, axis=1)

In [4]:
# 5. Split into train and test sets
# 80/20 split, stratified on y so both sets keep the same class balance;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# 6. Standardize features
# Scaling statistics are learned from the training split only and then
# applied to both splits, so no test-set information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# 7. Apply PCA (retain 95% variance)
# The projection is fitted on the scaled training data only; the test data
# is projected with the already-fitted components.
pca = PCA(random_state=42, n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Report how much the feature space shrank and how much variance was kept.
n_original_features = X.shape[1]
n_reduced_features = X_train_pca.shape[1]
variance_retained = pca.explained_variance_ratio_.sum()

print(f"Original feature count: {n_original_features}")
print(f"Reduced feature count after PCA: {n_reduced_features}")
print("Explained variance ratio (cumulative):", variance_retained)

Original feature count: 30
Reduced feature count after PCA: 10
Explained variance ratio (cumulative): 0.9520691014391003


In [7]:
# 8. Train Logistic Regression on PCA features
# Hyper-parameters are gathered in one place so they are easy to spot and tune.
logreg_params = {"max_iter": 500, "random_state": 42}
clf = LogisticRegression(**logreg_params)
clf.fit(X_train_pca, y_train)

In [8]:
# 9. Predict and evaluate
def _print_split_metrics(split_name, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    split_name : str
        Text inserted into the printed headings (e.g. "Train" or "Test").
    y_true : array-like
        Ground-truth labels encoded as 0 (benign) / 1 (malignant).
    y_pred : array-like
        Predicted labels using the same encoding.
    """
    # Pin the label order explicitly so the display names and the matrix
    # rows/columns always line up with the 0/1 encoding, even if a class
    # happens to be absent from a split.
    class_labels = [0, 1]
    class_names = ['Benign', 'Malignant']

    print(f"\nAccuracy on {split_name} Set: {accuracy_score(y_true, y_pred) * 100:.2f}%")
    print(
        f"\nClassification Report {split_name}:\n",
        classification_report(y_true, y_pred, labels=class_labels, target_names=class_names),
    )
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred, labels=class_labels))


y_train_pred = clf.predict(X_train_pca)
y_test_pred = clf.predict(X_test_pca)

# Train metrics first, then test, so over-fitting shows up as a gap between them.
_print_split_metrics("Train", y_train, y_train_pred)
_print_split_metrics("Test", y_test, y_test_pred)


Accuracy on Train Set: 98.68%

Classification Report Train:
               precision    recall  f1-score   support

      Benign       0.98      1.00      0.99       285
   Malignant       0.99      0.97      0.98       170

    accuracy                           0.99       455
   macro avg       0.99      0.98      0.99       455
weighted avg       0.99      0.99      0.99       455


Confusion Matrix:
 [[284   1]
 [  5 165]]

Accuracy on Test Set: 97.37%

Classification Report Test:
               precision    recall  f1-score   support

      Benign       0.96      1.00      0.98        72
   Malignant       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114


Confusion Matrix:
 [[72  0]
 [ 3 39]]
