**Install dependencies**

In [1]:
# Installing the required packages
!pip install -q scikit-learn pandas xgboost joblib
print("Dependencies installed")


Dependencies installed.


**Uploading dataset**

In [2]:
# Uploading the dataset
from pathlib import Path
file_candidates = list(Path.cwd().glob("*breast*.*")) + list(Path.cwd().glob("*.data")) + list(Path.cwd().glob("*.csv"))
if file_candidates:
    print("Found candidate data files in working directory:")
    for f in file_candidates:
        print(" -", f.name)
else:
    try:
        from google.colab import files
        print("No dataset found. Please upload your dataset file (e.g., 'breast-cancer-wisconsin.data' or 'Breast_Cancer_Data.csv').")
        uploaded = files.upload()
        for fn in uploaded:
            print("Uploaded:", fn)
    except Exception:
        raise RuntimeError("No dataset found. Please upload the dataset file to the Colab environment.")


Found candidate data files in working directory:
 - breast-cancer-wisconsin.data
 - breast-cancer-wisconsin.data


**Creating data loader data_prep.py**

In [3]:
%%bash
cat > data_prep.py <<'PY'
"""
data_prep.py

Robust loader for the UCI "Breast Cancer Wisconsin (Original)" dataset.
- Accepts common file names or prompts upload in Colab
- Drops ID column
- Handles '?' missing values
- Maps class 2->0 (benign), 4->1 (malignant)
- Standardizes features (StandardScaler)
- Returns X_train, X_test, y_train, y_test
"""
import os
import glob
from pathlib import Path
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

DEFAULT_FILENAMES = [
    "Breast_Cancer_Data.csv",
    "breast-cancer-wisconsin.data",
    "breast-cancer-wisconsin.data.txt",
    "breast-cancer-wisconsin.data.csv",
    "breast-cancer-wisconsin.data"
]

def _find_file(provided_path=None):
    if provided_path:
        p = Path(provided_path)
        if p.exists():
            return str(p)

    for name in DEFAULT_FILENAMES:
        p = Path.cwd() / name
        if p.exists():
            return str(p)

    for p in Path.cwd().glob("*breast*.*"):
        return str(p)

    return None

def _prompt_upload():
    try:
        from google.colab import files
        print("Please upload your dataset file (e.g. 'breast-cancer-wisconsin.data').")
        uploaded = files.upload()
        for fn in uploaded:
            return fn
    except Exception:
        raise FileNotFoundError("Dataset not found in working directory and upload prompt unavailable.")

def load_data(path=None, test_size=0.25, random_state=42, standardize=True):
    fp = _find_file(path)
    if not fp:
        fp = _prompt_upload()
        if not fp:
            raise FileNotFoundError("Dataset file could not be located or uploaded.")

    try:
        df = pd.read_csv(fp, header=None, sep=',', engine='python')
    except Exception:
        with open(fp, 'r', encoding='utf-8', errors='ignore') as f:
            lines = [ln.rstrip('\n') for ln in f if ln.strip() and ln.strip()[0].isdigit()]
        if not lines:
            raise ValueError(f"No numeric data lines found in file {fp}. Please ensure you uploaded the correct dataset.")
        cleaned = StringIO("\n".join(lines))
        try:
            df = pd.read_csv(cleaned, header=None, sep=',', engine='python')
        except Exception:
            cleaned.seek(0)
            df = pd.read_csv(cleaned, header=None, delim_whitespace=True, engine='python')

    if df.shape[1] < 11:
        raise ValueError(f"Parsed data has {df.shape[1]} columns; expected at least 11. Check dataset format.")
    if df.shape[1] > 11:
        df = df.iloc[:, :11]

    df.columns = [
        "Sample_code_number",
        "Clump_Thickness",
        "Uniformity_Cell_Size",
        "Uniformity_Cell_Shape",
        "Marginal_Adhesion",
        "Single_Epithelial_Cell_Size",
        "Bare_Nuclei",
        "Bland_Chromatin",
        "Normal_Nucleoli",
        "Mitoses",
        "Class"
    ]

    df = df.drop(columns=["Sample_code_number"])

    df.replace('?', np.nan, inplace=True)
    df = df.apply(pd.to_numeric, errors='coerce')

    before = len(df)
    df = df.dropna().reset_index(drop=True)
    after = len(df)

    X = df.iloc[:, :-1]
    y = df.iloc[:, -1].map({2: 0, 4: 1}).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_test, y_train.values, y_test.values, before, after

if __name__ == "__main__":
    X_train, X_test, y_train, y_test, before, after = load_data()
    print(f"Loaded data. Total rows before dropna: {before}, after dropna: {after}")
    print("Train/test shapes:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)
PY


**Creating logistic_regression.py**

In [4]:
%%bash
cat > logistic_regression.py <<'PY'
from data_prep import load_data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = LogisticRegression(max_iter=2000, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== Logistic Regression ===")
    print(f"Rows (before dropna): {before}, (after dropna): {after}")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "logistic_regression.pkl")

if __name__ == '__main__':
    run()
PY


**Creating knn.py (k = 5)**

In [6]:
%%bash
cat > knn.py <<'PY'
from data_prep import load_data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== K-Nearest Neighbors (k=5) ===")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "knn_k5.pkl")

if __name__ == '__main__':
    run()
PY


**Creating svm_linear.py (linear kernel)**

In [7]:
%%bash
cat > svm_linear.py <<'PY'
from data_prep import load_data
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = SVC(kernel='linear', random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== SVM (linear kernel) ===")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "svm_linear.pkl")

if __name__ == '__main__':
    run()
PY


**Creating svm_rbf.py (RBF kernel)**

In [8]:
%%bash
cat > svm_rbf.py <<'PY'
from data_prep import load_data
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = SVC(kernel='rbf', random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== SVM (RBF kernel) ===")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "svm_rbf.pkl")

if __name__ == '__main__':
    run()
PY


**Creating naive_bayes.py**

In [9]:
%%bash
cat > naive_bayes.py <<'PY'
from data_prep import load_data
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = GaussianNB()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== Gaussian Naive Bayes ===")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "naive_bayes.pkl")

if __name__ == '__main__':
    run()
PY


**Creating decision_tree.py**

In [10]:
%%bash
cat > decision_tree.py <<'PY'
from data_prep import load_data
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== Decision Tree ===")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "decision_tree.pkl")

if __name__ == '__main__':
    run()
PY


**Creating random_forest.py (n_estimators = 10)**

In [11]:
%%bash
cat > random_forest.py <<'PY'
from data_prep import load_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = RandomForestClassifier(n_estimators=10, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== Random Forest (n_estimators=10) ===")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "random_forest.pkl")

if __name__ == '__main__':
    run()
PY


**Creating xgboost_model.py**

In [12]:
%%bash
cat > xgboost_model.py <<'PY'
from data_prep import load_data
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

def run():
    X_train, X_test, y_train, y_test, before, after = load_data()
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("=== XGBoost ===")
    print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print("Confusion Matrix:")
    print(cm)
    joblib.dump(model, "xgboost_model.pkl")

if __name__ == '__main__':
    run()
PY


**Executing all model scripts (runs each .py and prints metrics)**

In [13]:
%%bash
echo "Running Logistic Regression..."
python3 logistic_regression.py || exit 1
echo
echo "Running KNN (k=5)..."
python3 knn.py || exit 1
echo
echo "Running SVM (linear)..."
python3 svm_linear.py || exit 1
echo
echo "Running SVM (rbf)..."
python3 svm_rbf.py || exit 1
echo
echo "Running Gaussian Naive Bayes..."
python3 naive_bayes.py || exit 1
echo
echo "Running Decision Tree..."
python3 decision_tree.py || exit 1
echo
echo "Running Random Forest..."
python3 random_forest.py || exit 1
echo
echo "Running XGBoost..."
python3 xgboost_model.py || exit 1
echo
echo "All models executed. Pickle files saved in working directory."


Running Logistic Regression...
=== Logistic Regression ===
Rows (before dropna): 699, (after dropna): 683
Accuracy: 0.9591 (95.91%)
Confusion Matrix:
[[106   5]
 [  2  58]]

Running KNN (k=5)...
=== K-Nearest Neighbors (k=5) ===
Accuracy: 0.9532 (95.32%)
Confusion Matrix:
[[106   5]
 [  3  57]]

Running SVM (linear)...
=== SVM (linear kernel) ===
Accuracy: 0.9591 (95.91%)
Confusion Matrix:
[[106   5]
 [  2  58]]

Running SVM (rbf)...
=== SVM (RBF kernel) ===
Accuracy: 0.9649 (96.49%)
Confusion Matrix:
[[106   5]
 [  1  59]]

Running Gaussian Naive Bayes...
=== Gaussian Naive Bayes ===
Accuracy: 0.9591 (95.91%)
Confusion Matrix:
[[106   5]
 [  2  58]]

Running Decision Tree...
=== Decision Tree ===
Accuracy: 0.9591 (95.91%)
Confusion Matrix:
[[105   6]
 [  1  59]]

Running Random Forest...
=== Random Forest (n_estimators=10) ===
Accuracy: 0.9591 (95.91%)
Confusion Matrix:
[[106   5]
 [  2  58]]

Running XGBoost...
=== XGBoost ===
Accuracy: 0.9708 (97.08%)
Confusion Matrix:
[[106   5]
 [

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


** Creating results_summary.csv with accuracy & confusion matrix for each model**

In [14]:
import pandas as pd
from data_prep import load_data
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

X_train, X_test, y_train, y_test, _, _ = load_data()

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, random_state=42),
    "KNN_k5": KNeighborsClassifier(n_neighbors=5),
    "SVM_linear": SVC(kernel='linear', random_state=42),
    "SVM_rbf": SVC(kernel='rbf', random_state=42),
    "GaussianNB": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest_n10": RandomForestClassifier(n_estimators=10, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

rows = []
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    rows.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        "Accuracy_pct": round(acc*100, 2),
        "Confusion_Matrix": str(cm.tolist())
    })

df_results = pd.DataFrame(rows)
df_results.to_csv("results_summary.csv", index=False)
print("Created results_summary.csv. Preview:")
print(df_results)


Created results_summary.csv. Preview:
                Model  Accuracy  Accuracy_pct     Confusion_Matrix
0  LogisticRegression    0.9591         95.91  [[106, 5], [2, 58]]
1              KNN_k5    0.9532         95.32  [[106, 5], [3, 57]]
2          SVM_linear    0.9591         95.91  [[106, 5], [2, 58]]
3             SVM_rbf    0.9649         96.49  [[106, 5], [1, 59]]
4          GaussianNB    0.9591         95.91  [[106, 5], [2, 58]]
5        DecisionTree    0.9591         95.91  [[105, 6], [1, 59]]
6    RandomForest_n10    0.9591         95.91  [[106, 5], [2, 58]]
7             XGBoost    0.9708         97.08  [[106, 5], [0, 60]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
