<a href="https://colab.research.google.com/github/Tanveer-heir/BreastCancer_Prediction/blob/main/Breast_Cancer_Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the breast-cancer dataset bundled with scikit-learn and wrap it in
# pandas containers: `y` carries the target labels, `X` the feature matrix
# with one named column per feature.
data = load_breast_cancer()
y = pd.Series(data=data.target)
X = pd.DataFrame(data=data.data, columns=data.feature_names)


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. Stratifying on `y` keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [4]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with an iteration cap of 5000.
# NOTE(review): the features are fed in unscaled; the large cap is presumably
# there to let the solver converge -- confirm.
log_reg = LogisticRegression(max_iter=5000)

# Fit on the training split only. The call is left as the cell's final
# expression so the fitted estimator is still displayed.
log_reg.fit(X=X_train, y=y_train)


In [5]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Scores for the held-out split: column index 1 of predict_proba is the
# probability of the second class in `log_reg.classes_`, which ROC-AUC needs.
y_proba = log_reg.predict_proba(X_test)[:, 1]
# Hard class labels, used for accuracy.
y_pred = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("ROC-AUC:", roc_auc_score(y_true=y_test, y_score=y_proba))


Accuracy: 0.9649122807017544
ROC-AUC: 0.9953703703703703


In [6]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated ROC-AUC of the logistic-regression baseline on the
# full dataset (one score per fold).
cv_scores = cross_val_score(log_reg, X, y, scoring="roc_auc", cv=5)

# Per-fold scores followed by their mean and spread.
print("CV ROC-AUC scores:", cv_scores)
print("Mean:", cv_scores.mean())
print("Std Dev:", cv_scores.std())


CV ROC-AUC scores: [0.99377661 0.99344907 0.99801587 0.97949735 0.99765258]
Mean: 0.9924782978664407
Std Dev: 0.006761225651887172


In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Weak learner for boosting: a decision tree limited to depth 1.
base_learner = DecisionTreeClassifier(max_depth=1)

# AdaBoost ensemble of 100 such trees with a learning rate of 0.1; the seed
# keeps the result repeatable.
ada = AdaBoostClassifier(
    estimator=base_learner,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)


In [8]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated ROC-AUC for the AdaBoost ensemble.
ada_cv_scores = cross_val_score(ada, X, y, scoring="roc_auc", cv=5)

# Per-fold scores followed by their mean and spread.
print("AdaBoost CV ROC-AUC:", ada_cv_scores)
print("Mean:", ada_cv_scores.mean())
print("Std Dev:", ada_cv_scores.std())


AdaBoost CV ROC-AUC: [0.98591549 0.98624304 0.99867725 0.97685185 0.99832327]
Mean: 0.989202181218231
Std Dev: 0.008307354479237203


In [10]:
# A deliberately higher-capacity AdaBoost configuration: depth-3 trees,
# 200 estimators and a learning rate of 1.0.
ada_overfit = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    learning_rate=1.0,
    n_estimators=200,
    random_state=42,
)

# Same evaluation protocol as the other models: 5-fold ROC-AUC.
ada_overfit_cv_scores = cross_val_score(ada_overfit, X, y, scoring="roc_auc", cv=5)

print("Ada Overfit Boost CV ROC-AUC:", ada_overfit_cv_scores)
print("Mean:", ada_overfit_cv_scores.mean())
print("Std Dev:", ada_overfit_cv_scores.std())

Ada Overfit Boost CV ROC-AUC: [0.9950868  0.99181133 1.         0.99239418 0.99765258]
Mean: 0.995388979007551
Std Dev: 0.003106287105752688


In [11]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 150 depth-3 trees with a learning rate of 0.05, seeded
# for repeatability.
gb = GradientBoostingClassifier(
    max_depth=3,
    learning_rate=0.05,
    n_estimators=150,
    random_state=42,
)


In [12]:
# 5-fold cross-validated ROC-AUC for the gradient-boosting model.
gb_cv_scores = cross_val_score(gb, X, y, scoring="roc_auc", cv=5)

# Per-fold scores followed by their mean and spread.
print("Gradient Boosting CV ROC-AUC:", gb_cv_scores)
print("Mean:", gb_cv_scores.mean())
print("Std Dev:", gb_cv_scores.std())


Gradient Boosting CV ROC-AUC: [0.99148379 0.98689813 0.99834656 0.9837963  0.99865862]
Mean: 0.9918366789886607
Std Dev: 0.005967896487918233


In [13]:
# Variant of the gradient-boosting model with fewer trees (100) and a larger
# learning rate (0.1); tree depth and seed are unchanged.
gb_fast = GradientBoostingClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)

# 5-fold ROC-AUC; prints the mean and the standard deviation on one line.
scores = cross_val_score(
    gb_fast,
    X,
    y,
    scoring="roc_auc",
    cv=5,
)
print(scores.mean(), scores.std())


0.9911753249904249 0.006003145675279236


In [14]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting: up to 200 boosting iterations, trees
# capped at depth 6, learning rate 0.05, seeded for repeatability.
hgb = HistGradientBoostingClassifier(
    max_iter=200,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
)


In [15]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated ROC-AUC for the histogram gradient-boosting model.
hgb_cv_scores = cross_val_score(hgb, X, y, scoring="roc_auc", cv=5)

# Per-fold scores followed by their mean and spread.
print("HistGradientBoosting CV ROC-AUC:", hgb_cv_scores)
print("Mean:", hgb_cv_scores.mean())
print("Std Dev:", hgb_cv_scores.std())


HistGradientBoosting CV ROC-AUC: [0.99181133 0.99148379 0.99900794 0.98776455 0.99765258]
Mean: 0.9935440376973295
Std Dev: 0.004180519183348017


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Fresh, unfitted members for the ensemble built from them.
# Note: `log_reg` and `hgb` rebind names already used earlier in the notebook.

# Linear member: logistic regression with an iteration cap of 5000.
log_reg = LogisticRegression(max_iter=5000)

# Boosted-tree member: histogram gradient boosting (depth <= 6, 200 iterations).
hgb = HistGradientBoostingClassifier(
    max_iter=200,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
)

# AdaBoost member: 200 depth-2 trees with a learning rate of 1.0.
ada_best = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=2),
    learning_rate=1.0,
    n_estimators=200,
    random_state=42,
)


In [23]:
# Soft-voting ensemble over the three members: with voting="soft" the
# members' predicted class probabilities are combined rather than their
# hard labels.
voting_clf = VotingClassifier(
    voting="soft",
    estimators=[("lr", log_reg), ("ada", ada_best), ("hgb", hgb)],
)


In [24]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated ROC-AUC for the soft-voting ensemble.
voting_scores = cross_val_score(voting_clf, X, y, scoring="roc_auc", cv=5)

# Per-fold scores followed by their mean and spread.
print("Voting Classifier CV ROC-AUC:", voting_scores)
print("Mean:", voting_scores.mean())
print("Std Dev:", voting_scores.std())


Voting Classifier CV ROC-AUC: [0.99443171 0.99574189 0.99966931 0.99074074 0.99865862]
Mean: 0.9958484542049886
Std Dev: 0.0031812967274774924


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier


In [26]:
# (name, estimator) pairs serving as the first-level models of the stack.
# Hyper-parameters mirror the members of the voting ensemble.
base_estimators = [
    # Linear model.
    ("lr", LogisticRegression(max_iter=5000)),
    # AdaBoost over depth-2 trees.
    (
        "ada",
        AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=2),
            learning_rate=1.0,
            n_estimators=200,
            random_state=42,
        ),
    ),
    # Histogram-based gradient boosting.
    (
        "hgb",
        HistGradientBoostingClassifier(
            max_iter=200,
            max_depth=6,
            learning_rate=0.05,
            random_state=42,
        ),
    ),
]


In [27]:
# Second-level model: a logistic regression with an iteration cap of 5000,
# later passed to the stacking classifier as its final estimator.
meta_learner = LogisticRegression(max_iter=5000)


In [28]:
# Stacked ensemble: the base estimators feed `meta_learner`, which is trained
# on their cross-validated (cv=5) predictions.
stack_clf = StackingClassifier(
    final_estimator=meta_learner,
    estimators=base_estimators,
    # IMPORTANT: keep passthrough off so the meta-learner sees only the base
    # models' predictions, never the raw features.
    passthrough=False,
    cv=5,
)


In [29]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Outer 5-fold cross-validated ROC-AUC for the stacked ensemble (the stack
# runs its own inner cv=5 within each training fold).
stack_scores = cross_val_score(stack_clf, X, y, scoring="roc_auc", cv=5)

# Per-fold scores followed by their mean and spread.
print("Stacking Classifier CV ROC-AUC:", stack_scores)
print("Mean:", stack_scores.mean())
print("Std Dev:", stack_scores.std())


Stacking Classifier CV ROC-AUC: [0.99410416 0.99639699 0.99966931 0.99007937 0.99832327]
Mean: 0.9957146193266402
Std Dev: 0.0033840793298371475


In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Reference point for the ensembles: one decision tree with no depth limit
# (max_depth=None lets it grow fully), seeded for repeatability.
tree = DecisionTreeClassifier(random_state=42, max_depth=None)

# 5-fold cross-validated ROC-AUC of the single tree.
tree_scores = cross_val_score(tree, X, y, scoring="roc_auc", cv=5)

print("Single Tree Mean:", tree_scores.mean())
print("Single Tree Std:", tree_scores.std())


Single Tree Mean: 0.915921432990709
Single Tree Std: 0.01967191645462964


In [31]:
from sklearn.ensemble import BaggingClassifier

# Bagging over 100 unrestricted-depth decision trees; bootstrap=True draws
# each tree's training sample with replacement. Seeded for repeatability.
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=None),
    bootstrap=True,
    n_estimators=100,
    random_state=42,
)

# 5-fold cross-validated ROC-AUC, comparable with the single-tree numbers.
bag_scores = cross_val_score(bagging, X, y, scoring="roc_auc", cv=5)

print("Bagging Mean:", bag_scores.mean())
print("Bagging Std:", bag_scores.std())


Bagging Mean: 0.9874984835802065
Bagging Std: 0.011069432506366118
