# 🧠 Breast Cancer Diagnosis Classifier
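This notebook loads scikit-learn's built-in breast cancer dataset, uses `SelectKBest` (ANOVA F-test) to pick the 10 most discriminative features, and trains a Logistic Regression model to classify tumors as benign or malignant.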

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

In [None]:
# Step 2: Load dataset
# Build a single DataFrame: one column per feature name, plus the label in a 'target' column.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

In [None]:
# Step 3: Separate features and target
# y is the label column; X is every other column of df (df itself is left untouched).
y = df['target']
X = df.drop(columns=['target'])

In [None]:
# Step 4: Split the raw data BEFORE selecting features
# Splitting first keeps the test rows (and their labels) out of SelectKBest's scoring.
# Fitting the selector on the full dataset would leak test-set information into the
# choice of features and make the evaluation below look better than it really is.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 5: Select top 10 features using SelectKBest (fit on the training split only)
selector = SelectKBest(score_func=f_classif, k=10)
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)  # apply the columns chosen from the training split
mask = selector.get_support()  # boolean mask over X's columns: True for each kept feature
selected_features = X.columns[mask]
X_new = selector.transform(X)  # all rows reduced to the same columns; name kept in case it is used elsewhere
print("Top 10 Selected Features:")
print(selected_features.tolist())

In [None]:
# Step 6: Train Logistic Regression
# The iteration cap is raised to 10000 via max_iter before fitting on the training split.
model = LogisticRegression(max_iter=10000)
model.fit(X=X_train, y=y_train)

In [None]:
# Step 7: Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# f1_score uses pos_label=1 by default, so this is the F1 of the class encoded as 1
# (i.e. data.target_names[1]), not of the class encoded as 0.
print("F1 Score:", f1_score(y_test, y_pred))
# Pin the row/column order to the model's classes and show class names on the axes,
# so the matrix can be read without knowing the 0/1 encoding.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names[model.classes_])
disp.plot()
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Step 8: ROC Curve
# Use the predicted probability in column 1 of predict_proba as the score for each test row.
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}", color='blue')
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1) for reference
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Step 9: Feature Importance
# The model was trained on unscaled features, so raw coefficients are expressed
# per unit of each feature and are not comparable across features with different
# ranges. Multiplying each coefficient by that feature's standard deviation on the
# training split gives the change in log-odds per one standard deviation, which
# puts all bars on a common scale.
feature_std = np.asarray(X_train).std(axis=0)  # one value per selected feature, same column order as model.coef_
importance = pd.Series(model.coef_[0] * feature_std, index=selected_features)
importance.sort_values().plot(kind='barh', title='Feature Importance (Top 10 Features)')
plt.xlabel("Standardized Coefficient (coef * feature std)")
plt.show()