In [14]:
# [1] Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# [2] Load dataset
df = pd.read_csv("C:\\Users\\maths\\Downloads\\cancer.csv")

# [3] Remove rows with missing values
df.dropna(inplace=True)

# [4] Select relevant features (age groups and lung cancer racial categories)
selected_features = [
    "Rates.Age.< 18", "Rates.Age.18-45", "Rates.Age.45-64", "Rates.Age.> 64",
    "Types.Lung.Race.White", "Types.Lung.Race.Black", "Types.Lung.Race.Hispanic"
]
X = df[selected_features]

# [5] Define binary target variable (1 if 'Total.Rate' is above 75th percentile)
df['Target'] = (df['Total.Rate'] >= df['Total.Rate'].quantile(0.75)).astype(int)
y = df['Target']

# [6] Standardize features for GaussianNB & BernoulliNB
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# [7] Apply PCA for GaussianNB & BernoulliNB (MultinomialNB cannot use negative values)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# [8] Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# [9] Train & evaluate GaussianNB and BernoulliNB
models = {
    "GaussianNB (PCA Applied)": GaussianNB(),
    "BernoulliNB (PCA Applied)": BernoulliNB(),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n===== {name} Performance =====")
    print(classification_report(y_test, y_pred, zero_division=1))
    print(f"AUC-ROC: {roc_auc_score(y_test, y_pred):.4f}")

# [10] Apply MinMaxScaler (0-1 range) for MultinomialNB
scaler_minmax = MinMaxScaler()
X_scaled_mnb = scaler_minmax.fit_transform(X)

# [11] Split dataset for MultinomialNB
X_train_mnb, X_test_mnb, y_train_mnb, y_test_mnb = train_test_split(X_scaled_mnb, y, test_size=0.2, random_state=42)

# [12] Train & evaluate MultinomialNB
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train_mnb, y_train_mnb)
y_pred_mnb = multinomial_nb.predict(X_test_mnb)

print("\n===== MultinomialNB (MinMax Scaled) Performance =====")
print(classification_report(y_test_mnb, y_pred_mnb, zero_division=1))
print(f"AUC-ROC: {roc_auc_score(y_test_mnb, y_pred_mnb):.4f}")



===== GaussianNB (PCA Applied) Performance =====
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.75      1.00      0.86         3

    accuracy                           0.91        11
   macro avg       0.88      0.94      0.90        11
weighted avg       0.93      0.91      0.91        11

AUC-ROC: 0.9375

===== BernoulliNB (PCA Applied) Performance =====
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.75      1.00      0.86         3

    accuracy                           0.91        11
   macro avg       0.88      0.94      0.90        11
weighted avg       0.93      0.91      0.91        11

AUC-ROC: 0.9375

===== MultinomialNB (MinMax Scaled) Performance =====
              precision    recall  f1-score   support

           0       0.73      1.00      0.84         8
           1       1.00      0.00      0.00      