Create a Python Virtual Environment


In [52]:
# --- Imports ---

import pickle
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [54]:
# --- Load Dataset ---

dataset = fetch_ucirepo(id=15)   # Breast Cancer Wisconsin (Original)


In [55]:
print(dataset.data.features.columns.tolist())

['Clump_thickness', 'Uniformity_of_cell_size', 'Uniformity_of_cell_shape', 'Marginal_adhesion', 'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin', 'Normal_nucleoli', 'Mitoses']


In [60]:
# --- Identify Predictors and Target --


predictor_cols = [
    'Clump_thickness',
    'Uniformity_of_cell_size',
    'Uniformity_of_cell_shape',
    'Marginal_adhesion',
    'Single_epithelial_cell_size',
    'Bare_nuclei',
    'Bland_chromatin',
    'Normal_nucleoli',
    'Mitoses'
]

# --- Remap Labels: Benign=2 → 0, Malignant=4 → 1 ---

X = dataset.data.features[predictor_cols]
y = pd.Series(dataset.data.targets.values.ravel()).replace({2: 0, 4: 1}).values

print("Predictor Variables:", predictor_cols)
print("Target Variable: Class (0 = Benign, 1 = Malignant)")



Predictor Variables: ['Clump_thickness', 'Uniformity_of_cell_size', 'Uniformity_of_cell_shape', 'Marginal_adhesion', 'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin', 'Normal_nucleoli', 'Mitoses']
Target Variable: Class (0 = Benign, 1 = Malignant)


In [62]:
# --- Train/Test Split ---


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)



In [64]:
# --- Preprocessing Pipeline ---


preprocessor = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])




In [66]:
# --- Candidate Models ---


candidates = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "SVC_RBF": SVC(kernel="rbf", probability=True, random_state=42)
}



In [68]:
# --- Cross-validation to Compare ---


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = {}

for name, model in candidates.items():
    pipe = Pipeline([
        ("prep", preprocessor),
        ("clf", model)
    ])
    cv_acc = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="accuracy")
    scores[name] = cv_acc.mean()
    print(f"{name}: {cv_acc.mean():.4f} ± {cv_acc.std():.4f}")


LogisticRegression: 0.9678 ± 0.0175
RandomForest: 0.9678 ± 0.0091
SVC_RBF: 0.9678 ± 0.0107


In [70]:
# --- Pick best model ---

best_name = max(scores, key=scores.get)
best_model = candidates[best_name]
print(f"\nBest model: {best_name} with CV accuracy {scores[best_name]:.4f}")

# --- Retrain Best pipeline on Full Dataset ---
final_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", best_model)
])
final_pipe.fit(X, y)




Best model: LogisticRegression with CV accuracy 0.9678


In [72]:
# --- Evaluate on Holdout Test set ---


test_acc = final_pipe.score(X_test, y_test)
print(f"Holdout Test Accuracy: {test_acc:.4f}")



Holdout Test Accuracy: 0.9571


In [74]:
# --- Save Pipeline as Pickle ---

with open("breast_cancer_best_model.pkl", "wb") as f:
    pickle.dump(final_pipe, f)

print(" Model retrained on full dataset and saved as breast_cancer_best_model.pkl")


 Model retrained on full dataset and saved as breast_cancer_best_model.pkl
