In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Load the cleaned dataset and build the train/test matrices.
# Columns named '1'..'9' are used as predictors and column '10' as the target.
df = pd.read_csv('breast-cancer-wisconsin_cleaned.csv', header=0)
features = list(map(str, range(1, 10)))  # column labels '1' through '9'
X = df[features]
y = df['10']  # target column; the original author notes it is already binary (0/1)

# Hold out 25% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Learn the scaling statistics on the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 1: Default model results
# Fit each classifier with (mostly) default hyper-parameters and report
# Accuracy, F1 and ROC-AUC on the held-out test set.
print("=== Default Model Results ===")
default_models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'LinearSVM': SVC(kernel='linear', random_state=42, probability=True),
    'KernelSVM': SVC(kernel='rbf', random_state=42, probability=True),
    'NaiveBayes': GaussianNB(),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42, n_estimators=10),
    'XGBoost': XGBClassifier(random_state=42)
}

# Models in this set are trained on the standardized features; every other
# model is trained on the raw features.
default_scaled_models = {'LogisticRegression', 'KNN', 'LinearSVM', 'KernelSVM', 'XGBoost'}

default_results = {}
for name, model in default_models.items():
    # Pick the train/test matrices once so that fitting, class prediction and
    # probability prediction all see features in the same representation.
    use_scaled = name in default_scaled_models
    X_fit = X_train_scaled if use_scaled else X_train
    X_eval = X_test_scaled if use_scaled else X_test

    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)
    # Bug fix: probabilities used to be computed on X_test_scaled for every
    # model, including those fit on raw features (NaiveBayes, DecisionTree,
    # RandomForest), which made their ROC-AUC meaningless.
    y_prob = model.predict_proba(X_eval)[:, 1] if hasattr(model, 'predict_proba') else None
    default_results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_prob) if y_prob is not None else 'N/A'
    }
print(pd.DataFrame(default_results).T)

# Step 2: Parameter tuning with GridSearchCV (F1 scoring)
print("\n=== Parameter Tuning ===")

# Ratio of negative to positive training labels, passed to XGBoost as
# scale_pos_weight. Falls back to 1.0 when there are no positive labels so the
# division cannot fail.
n_positive = sum(y_train)
if n_positive > 0:
    scale_pos_weight = (len(y_train) - n_positive) / n_positive
else:
    scale_pos_weight = 1.0

# For each model name: the base estimator and the parameter grid to search.
tuning_models = {
    'LogisticRegression': {
        'model': LogisticRegression(random_state=42, class_weight='balanced'),
        'params': {
            'C': [0.0004],
            'max_iter': [65],
            'solver': ['lbfgs', 'liblinear'],
        },
    },
    'KNN': {
        'model': KNeighborsClassifier(n_neighbors=5),
        'params': {
            'weights': ['uniform'],
            'metric': ['euclidean'],
            'p': [1],
            'algorithm': ['auto'],
        },
    },
    'LinearSVM': {
        'model': SVC(kernel='linear', random_state=42, probability=True, class_weight='balanced'),
        'params': {
            'C': [0.005],
            'max_iter': [155],
        },
    },
    'KernelSVM': {
        'model': SVC(kernel='rbf', random_state=42, probability=True, class_weight='balanced'),
        'params': {
            'C': [1.75],
            'gamma': ['scale'],
        },
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-9],
        },
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
        'params': {
            'max_depth': [None],
            'min_samples_split': [2],
            'min_samples_leaf': [2],
            'criterion': ['entropy'],
            'max_leaf_nodes': [None],
            'min_impurity_decrease': [0.0],
        },
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=10),
        'params': {
            'max_depth': [5],
            'min_samples_split': [2],
            'min_samples_leaf': [2],
            'max_features': ['sqrt'],
            'bootstrap': [True],
        },
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight),
        'params': {
            'n_estimators': [49],
            'max_depth': [3],
            'learning_rate': [0.08],
            'colsample_bytree': [0.1],
            'subsample': [0.8],
            'gamma': [0],
            'reg_alpha': [0],
            'reg_lambda': [0],
        },
    },
}

# Run a 5-fold, F1-scored grid search for every configured model and keep the
# refit best estimator under the model's name.
best_models = {}
tuned_on_scaled = ['LogisticRegression', 'KNN', 'LinearSVM', 'KernelSVM', 'NaiveBayes', 'XGBoost']
for name, info in tuning_models.items():
    # Models listed in tuned_on_scaled are searched on standardized features;
    # the remaining ones are searched on the raw features.
    if name in tuned_on_scaled:
        search_matrix = X_train_scaled
    else:
        search_matrix = X_train

    grid = GridSearchCV(
        estimator=info['model'],
        param_grid=info['params'],
        cv=5,
        scoring='f1',
        n_jobs=-1,
    )
    grid.fit(search_matrix, y_train)

    best_models[name] = grid.best_estimator_
    print(f"{name} Best Params: {grid.best_params_}")

# Step 3: Rerun with tuned models
# Evaluate each tuned estimator on the held-out test set and report Accuracy,
# F1 and ROC-AUC.
print("\n=== Tuned Model Results ===")

# Models in this set were tuned on standardized features in the grid-search
# step; the rest (DecisionTree, RandomForest) were tuned on raw features.
tuned_scaled_models = {'LogisticRegression', 'KNN', 'LinearSVM', 'KernelSVM', 'NaiveBayes', 'XGBoost'}

tuned_results = {}
for name, model in best_models.items():
    # Evaluate on the same feature representation the model was trained on.
    X_eval = X_test_scaled if name in tuned_scaled_models else X_test
    y_pred = model.predict(X_eval)
    # Bug fix: probabilities used to be computed on X_test_scaled for every
    # model, so estimators tuned on raw features were scored on inputs in the
    # wrong scale and their ROC-AUC was not meaningful.
    y_prob = model.predict_proba(X_eval)[:, 1] if hasattr(model, 'predict_proba') else None
    tuned_results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_prob) if y_prob is not None else 'N/A'
    }
print(pd.DataFrame(tuned_results).T)

=== Default Model Results ===




                    Accuracy        F1   ROC-AUC
LogisticRegression  0.960000  0.936937  0.996729
KNN                 0.960000  0.936937  0.996432
LinearSVM           0.960000  0.936937  0.996878
KernelSVM           0.971429  0.955752  0.996729
NaiveBayes          0.960000  0.940171  0.162504
DecisionTree        0.954286  0.929825  0.500000
RandomForest        0.954286  0.929825  0.500000
XGBoost             0.942857  0.909091  0.993904

=== Parameter Tuning ===
LogisticRegression Best Params: {'C': 0.0004, 'max_iter': 65, 'solver': 'liblinear'}
KNN Best Params: {'algorithm': 'auto', 'metric': 'euclidean', 'p': 1, 'weights': 'uniform'}
LinearSVM Best Params: {'C': 0.005, 'max_iter': 155}
KernelSVM Best Params: {'C': 1.75, 'gamma': 'scale'}
NaiveBayes Best Params: {'var_smoothing': 1e-09}
DecisionTree Best Params: {'criterion': 'entropy', 'max_depth': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2}
RandomForest Best Params: {'bo

