In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Step 1: Read the raw table from disk
df = pd.read_csv("breast_cancer_risk_data.csv")

# Step 2: Integer-encode every object-dtype column on a copy of the raw frame.
# Each fitted encoder is stored in `label_encoders` under its column name.
df_encoded = df.copy()
label_encoders = {}

object_columns = df_encoded.select_dtypes(include=['object']).columns
for col in object_columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Remove the 'Race/Ethnicity' column when the dataset has one; a missing
# column is silently tolerated.
df_encoded = df_encoded.drop(columns=["Race/Ethnicity"], errors="ignore")

# Step 3: Separate the "Cancer" target from the remaining feature columns
y = df_encoded["Cancer"]
X_full = df_encoded.drop(columns=["Cancer"])

# Stratified 80/20 split over all features (feature selection happens later)
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 4: Rank features by Random Forest importance, fitted on the training
# split only, and keep the five highest-ranked columns.
rf = RandomForestClassifier(random_state=42).fit(X_train_full, y_train)

feature_importances = pd.Series(rf.feature_importances_, index=X_full.columns)
ranked_importances = feature_importances.sort_values(ascending=False)
top_5_features = ranked_importances.head(5).index.tolist()

print(f"Top 5 features selected: {top_5_features}")

# Restrict both splits to the selected columns (same column order in each)
X_train = X_train_full.loc[:, top_5_features]
X_test = X_test_full.loc[:, top_5_features]

# Step 5: Train models using top 5 features
# Candidate classifiers, keyed by the display name used in the results report.
# Every estimator that accepts a seed is pinned to random_state=42 so reruns
# are reproducible.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: current xgboost ignores
    # it and warns 'Parameters: { "use_label_encoder" } are not used.'
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "KNN": KNeighborsClassifier(),
    # probability=True enables predict_proba on the fitted SVC
    "SVM": SVC(probability=True, random_state=42)
}

# Per-model metrics plus running "best so far" bookkeeping.
results = {}
best_accuracy, best_model, best_model_name = 0, None, ""

# Fit each candidate on the training split and score it on the test split.
# The winner is the first model whose test accuracy strictly exceeds every
# earlier one (ties keep the earlier model).
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    results[name] = dict(
        accuracy=acc,
        classification_report=classification_report(y_test, y_pred),
    )

    if acc > best_accuracy:
        best_accuracy, best_model, best_model_name = acc, model, name

# Step 6: Report accuracy (two decimals) and the full classification report
# for every model, in the order the models were trained.
for model_name, metrics in results.items():
    print(
        f"\nModel: {model_name}",
        f"Accuracy: {metrics['accuracy']:.2f}",
        "Classification Report:",
        metrics['classification_report'],
        sep="\n",
    )

# Step 7: Save the best model
# Compare against None explicitly instead of relying on estimator truthiness
# (ensemble estimators define __len__, so `if best_model:` is fragile).
# Only the fitted estimator is pickled: anything loading this file must feed it
# the same `top_5_features` columns, in the same order, encoded the same way.
if best_model is not None:
    with open("best_model.pkl", "wb") as f:
        pickle.dump(best_model, f)
    # Report the name of the model that actually won the comparison.
    print(f"\nBest model '{best_model_name}' saved as 'best_model.pkl'")

Top 5 features selected: ['Genetic_Mutation', 'Family_History', 'BMI', 'Age', 'Age_at_Menarche']


Parameters: { "use_label_encoder" } are not used.




Model: Random Forest
Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.78      0.74        90
           1       0.80      0.75      0.77       110

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200


Model: Logistic Regression
Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.81      0.78        90
           1       0.83      0.77      0.80       110

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200


Model: Decision Tree
Accuracy: 0.71
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.72      0.69        90
           1       0.75      0.70      0.73       1