<a href="https://colab.research.google.com/github/SahilKadaskar/BreastCancerPredication/blob/main/brest_cancer_predication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:

# Breast Cancer Prediction using Hybrid Feature Selection and Random Forest
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Load clinical dataset
data = load_breast_cancer()
X_clinical = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Add synthetic demographic/lifestyle features
np.random.seed(42)
X_demo_life = pd.DataFrame({
    'age': np.random.randint(30, 80, size=X_clinical.shape[0]),
    'family_history': np.random.randint(0, 2, size=X_clinical.shape[0]),
    'menopausal_status': np.random.randint(0, 2, size=X_clinical.shape[0]),
    'alcohol_use': np.random.uniform(0, 5, size=X_clinical.shape[0]),
    'physical_activity': np.random.randint(0, 4, size=X_clinical.shape[0]),
    'BMI': np.random.uniform(18, 35, size=X_clinical.shape[0])
})

# Combine all features
X = pd.concat([X_clinical, X_demo_life], axis=1)

# Filter method: Select top 20 features using ANOVA F-test
filter_selector = SelectKBest(score_func=f_classif, k=20)
X_filtered = filter_selector.fit_transform(X, y)
selected_filter_features = X.columns[filter_selector.get_support()]

# Wrapper method: RFE with Logistic Regression to select top 10 from the filtered ones
model_lr = LogisticRegression(max_iter=1500)
rfe = RFE(model_lr, n_features_to_select=10)
X_rfe = rfe.fit_transform(X[selected_filter_features], y)
selected_rfe_features = selected_filter_features[rfe.get_support()]

# Prepare final dataset
X_final = X[selected_rfe_features]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42)

# Train Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
print("Selected Features:", list(selected_rfe_features))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93        63
           1       0.95      0.96      0.96       108

    accuracy                           0.95       171
   macro avg       0.94      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171

ROC AUC Score: 0.9918430335097002
Selected Features: ['mean radius', 'mean concavity', 'mean concave points', 'perimeter error', 'worst radius', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry']
