In [1]:
import os
!pip install pandas numpy scikit-learn imbalanced-learn mlxtend tensorflow

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from mlxtend.frequent_patterns import apriori, association_rules
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, Input
import tensorflow.keras.backend as K



**CORRECTED CODE**

In [2]:
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"  # Prevents memory allocation issues
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Force CPU if GPU issues exist

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected. Running on CPU.")

No GPU detected. Running on CPU.


In [3]:
# Load dataset from UCI URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data"
column_names = ['Class', 'Age', 'Menopause', 'Tumor-size', 'Inv-nodes', 'Node-caps', 'Deg-malig',
                'Breast', 'Breast-quad', 'Irradiat']

print("Loading dataset...")
data = pd.read_csv(url, names=column_names, na_values='?')
print("Dataset loaded successfully. Shape:", data.shape)

# Handle Missing Values
print("Handling missing values...")
data.fillna(data.mode().iloc[0], inplace=True)
print("Missing values filled.")

# Encode Target Variable
print("Encoding target variable...")
target_encoder = LabelEncoder()
data['Class'] = target_encoder.fit_transform(data['Class'])
print("Target variable encoded.")


Loading dataset...
Dataset loaded successfully. Shape: (286, 10)
Handling missing values...
Missing values filled.
Encoding target variable...
Target variable encoded.


In [4]:
# Separate Features and Target
X = data.drop(columns=['Class'])
y = data['Class']

# One-Hot Encode Categorical Variables
categorical_features = X.columns.tolist()
preprocessor = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])
X_encoded = preprocessor.fit_transform(X).toarray()  # Convert sparse matrix to dense array

# Convert Encoded Features to DataFrame
X_encoded = pd.DataFrame(X_encoded, columns=preprocessor.get_feature_names_out())
print("Feature encoding completed. Shape:", X_encoded.shape)

# Balance Dataset using SMOTE
print("Applying SMOTE for data balancing...")
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_smote, y_smote = smote.fit_resample(X_encoded, y)
print("SMOTE applied. Balanced dataset shape:", X_smote.shape)


Feature encoding completed. Shape: (286, 41)
Applying SMOTE for data balancing...
SMOTE applied. Balanced dataset shape: (402, 41)


In [5]:
def build_generator(input_dim, output_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64),
        LeakyReLU(negative_slope=0.2),
        Dense(128),
        LeakyReLU(negative_slope=0.2),
        Dense(output_dim, activation='tanh')
    ])
    return model

def build_discriminator(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128),
        LeakyReLU(negative_slope=0.2),
        Dense(64),
        LeakyReLU(negative_slope=0.2),
        Dense(1, activation='sigmoid')
    ])
    return model

print("Building GAN models...")
generator = build_generator(X_smote.shape[1], X_smote.shape[1])
discriminator = build_discriminator(X_smote.shape[1])
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("GAN models built successfully.")


Building GAN models...
GAN models built successfully.


In [6]:
def train_gan(epochs=1000, batch_size=8):  # Reduced batch size for memory efficiency
    tf.keras.backend.clear_session()  # Reset session before training
    print("Starting GAN training...")

    for epoch in range(epochs):
        noise = np.random.normal(0, 1, (batch_size, X_smote.shape[1]))
        generated_data = generator(noise, training=True)

        real_indices = np.random.randint(0, X_smote.shape[0], batch_size)
        real_data = X_smote.iloc[real_indices].values.astype(np.float32)  # Ensure TensorFlow compatible format
        labels_real = np.ones((batch_size, 1))
        labels_fake = np.zeros((batch_size, 1))

        d_loss_real = discriminator.train_on_batch(real_data, labels_real)
        d_loss_fake = discriminator.train_on_batch(generated_data, labels_fake)

        gan_loss = discriminator.train_on_batch(generated_data, labels_real)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}: D Loss Real: {d_loss_real[0]}, D Loss Fake: {d_loss_fake[0]}, GAN Loss: {gan_loss}")

print("Training GAN...")
train_gan()
print("GAN training completed.")


Training GAN...
Starting GAN training...
Epoch 0: D Loss Real: 0.6460530757904053, D Loss Fake: 0.7137609720230103, GAN Loss: [array(0.6901722, dtype=float32), array(0.5833333, dtype=float32)]
Epoch 100: D Loss Real: 0.49100086092948914, D Loss Fake: 0.4916430413722992, GAN Loss: [array(0.49239215, dtype=float32), array(0.6563531, dtype=float32)]
Epoch 200: D Loss Real: 0.4809909462928772, D Loss Fake: 0.4813328683376312, GAN Loss: [array(0.4817236, dtype=float32), array(0.65588725, dtype=float32)]
Epoch 300: D Loss Real: 0.47655606269836426, D Loss Fake: 0.4768238961696625, GAN Loss: [array(0.4770474, dtype=float32), array(0.65614617, dtype=float32)]
Epoch 400: D Loss Real: 0.4740772843360901, D Loss Fake: 0.47426488995552063, GAN Loss: [array(0.47444808, dtype=float32), array(0.6557564, dtype=float32)]
Epoch 500: D Loss Real: 0.47233477234840393, D Loss Fake: 0.4724773168563843, GAN Loss: [array(0.47263187, dtype=float32), array(0.65568864, dtype=float32)]
Epoch 600: D Loss Real: 0.4

In [7]:
print("Performing Association Rule Mining using Apriori algorithm...")

data_apriori = X_smote.astype(bool)  # Convert to True/False (1/0)

frequent_itemsets = apriori(data_apriori, min_support=0.15, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.75)

print("Association Rule Mining completed. Rules found:", rules.shape[0])

Performing Association Rule Mining using Apriori algorithm...
Association Rule Mining completed. Rules found: 388


In [8]:
print("Training Random Forest Classifier with GridSearchCV...")

pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {'classifier__n_estimators': [100, 200], 'classifier__max_depth': [10, 20]}
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, stratify=y_smote, test_size=0.2, random_state=42)
grid.fit(X_train, y_train)

y_pred = grid.best_estimator_.predict(X_test)
y_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))


Training Random Forest Classifier with GridSearchCV...
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.85      0.82        41
           1       0.84      0.78      0.81        40

    accuracy                           0.81        81
   macro avg       0.82      0.81      0.81        81
weighted avg       0.82      0.81      0.81        81

ROC-AUC Score: 0.9030487804878049


In [9]:
def predict_cancer(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)

    # Apply the same preprocessing used for training
    input_encoded = preprocessor.transform(input_df).toarray()
    input_encoded = pd.DataFrame(input_encoded, columns=preprocessor.get_feature_names_out())

    # Ensure all features match training data
    missing_cols = set(X_smote.columns) - set(input_encoded.columns)
    for col in missing_cols:
        input_encoded[col] = 0  # Add missing columns with zero

    input_encoded = input_encoded[X_smote.columns]  # Ensure correct column order

    # Make prediction
    prediction = grid.best_estimator_.predict(input_encoded)
    prediction_proba = grid.best_estimator_.predict_proba(input_encoded)[:, 1]

    result = "Cancer Detected" if prediction[0] == 1 else "No Cancer Detected"
    confidence = prediction_proba[0]

    print("Prediction:", result, "| Confidence:", confidence)
    return result, confidence


EXAMPLE INPUTS

In [10]:
# Example Usage
example_input = {
    'Age': '40-49', 'Menopause': 'premeno', 'Tumor-size': '30-34', 'Inv-nodes': '0-2',
    'Node-caps': 'no', 'Deg-malig': 2, 'Breast': 'left', 'Breast-quad': 'left_low', 'Irradiat': 'no'
}

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: No Cancer Detected | Confidence: 0.30892742021745634
Prediction: No Cancer Detected, Confidence: 0.31


In [11]:
example_input = {
    'Age': '50-59', 'Menopause': 'ge40', 'Tumor-size': '50-54', 'Inv-nodes': '6-8',
    'Node-caps': 'yes', 'Deg-malig': 3,'Breast': 'right', 'Breast-quad': 'right_low', 'Irradiat': 'yes'
}

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: Cancer Detected | Confidence: 0.812765444015444
Prediction: Cancer Detected, Confidence: 0.81


In [12]:
example_input = {
    'Age': '60-69', 'Menopause': 'ge-40', 'Tumor-size': '15-19', 'Inv-nodes': '0-2',
    'Node-caps': 'no', 'Deg-malig': 2,'Breast': 'right', 'Breast-quad': 'left_up', 'Irradiat': 'no'
}
#60-69,ge40,15-19,0-2,no,2,right,left_up,no

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: No Cancer Detected | Confidence: 0.21254086798890662
Prediction: No Cancer Detected, Confidence: 0.21


In [13]:
example_input = {
    'Age': '30-39', 'Menopause': 'premeno', 'Tumor-size': '20-24', 'Inv-nodes': '3-5',
    'Node-caps': 'yes', 'Deg-malig': 2,'Breast': 'left', 'Breast-quad': 'left_low', 'Irradiat': 'no'
}
#30-39,premeno,20-24,3-5,yes,2,left,left_low,no

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: Cancer Detected | Confidence: 0.7175413678379087
Prediction: Cancer Detected, Confidence: 0.72


In [14]:
#30-39,premeno,0-4,0-2,no,2,right,central,no

example_input = {
    'Age': '30-39', 'Menopause': 'premeno', 'Tumor-size': '0-4', 'Inv-nodes': '0-2',
    'Node-caps': 'no', 'Deg-malig': 2,'Breast': 'right', 'Breast-quad': 'central', 'Irradiat': 'no'
}

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: No Cancer Detected | Confidence: 0.4068955988071309
Prediction: No Cancer Detected, Confidence: 0.41


In [15]:
#50-59,ge40,40-44,6-8,yes,3,left,left_low,yes
example_input = {
    'Age': '50-59', 'Menopause': 'ge40', 'Tumor-size': '40-44', 'Inv-nodes': '6-8',
    'Node-caps': 'yes', 'Deg-malig': 3,'Breast': 'left', 'Breast-quad': 'left_low', 'Irradiat': 'yes'
}

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: Cancer Detected | Confidence: 0.951515444015444
Prediction: Cancer Detected, Confidence: 0.95


**FULL CODE**

In [16]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, LeakyReLU
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import GridSearchCV

os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"  # Prevents memory allocation issues
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Force CPU if GPU issues exist

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

# Load dataset from UCI URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data"
column_names = ['Class', 'Age', 'Menopause', 'Tumor-size', 'Inv-nodes', 'Node-caps', 'Deg-malig',
                'Breast', 'Breast-quad', 'Irradiat']
data = pd.read_csv(url, names=column_names, na_values='?')

# Handle Missing Values
data.fillna(data.mode().iloc[0], inplace=True)

# Encode Target Variable
target_encoder = LabelEncoder()
data['Class'] = target_encoder.fit_transform(data['Class'])  # Encode target variable

# Separate Features and Target
X = data.drop(columns=['Class'])
y = data['Class']

# One-Hot Encode Categorical Variables
categorical_features = X.columns.tolist()
preprocessor = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])
X_encoded = preprocessor.fit_transform(X).toarray()  # Convert sparse matrix to dense array

# Convert Encoded Features to DataFrame
X_encoded = pd.DataFrame(X_encoded, columns=preprocessor.get_feature_names_out())

# Balance Dataset using SMOTE
smote = SMOTE(sampling_strategy='minority')
X_smote, y_smote = smote.fit_resample(X_encoded, y)

# GAN for Data Augmentation
def build_generator(input_dim, output_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64),
        LeakyReLU(negative_slope=0.2),
        Dense(128),
        LeakyReLU(negative_slope=0.2),
        Dense(output_dim, activation='tanh')
    ])
    return model

def build_discriminator(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128),
        LeakyReLU(negative_slope=0.2),
        Dense(64),
        LeakyReLU(negative_slope=0.2),
        Dense(1, activation='sigmoid')
    ])
    return model

generator = build_generator(X_smote.shape[1], X_smote.shape[1])
discriminator = build_discriminator(X_smote.shape[1])
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train GAN with TensorFlow context
def train_gan(epochs=800, batch_size=8):  # Reduced batch size for memory efficiency
    tf.keras.backend.clear_session()  # Reset session before training
    for epoch in range(epochs):
        noise = np.random.normal(0, 1, (batch_size, X_smote.shape[1]))
        generated_data = generator(noise, training=True)

        real_indices = np.random.randint(0, X_smote.shape[0], batch_size)
        real_data = X_smote.iloc[real_indices].values.astype(np.float32)  # Ensure TensorFlow compatible format
        labels_real = np.ones((batch_size, 1))
        labels_fake = np.zeros((batch_size, 1))

        d_loss_real = discriminator.train_on_batch(real_data, labels_real)
        d_loss_fake = discriminator.train_on_batch(generated_data, labels_fake)

        gan_loss = discriminator.train_on_batch(generated_data, labels_real)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}: D Loss Real: {d_loss_real[0]}, D Loss Fake: {d_loss_fake[0]}, GAN Loss: {gan_loss}")

train_gan()

# Association Rule Mining with Apriori
data_apriori = X_smote.astype(bool)  # Convert to True/False (1/0)

frequent_itemsets = apriori(data_apriori, min_support=0.15, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.75)

# Enhanced Classification Model with Random Forest & Grid Search
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {'classifier__n_estimators': [100, 200], 'classifier__max_depth': [10, 20]}
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, stratify=y_smote, test_size=0.2, random_state=42)
grid.fit(X_train, y_train)

# Model Evaluation
y_pred = grid.best_estimator_.predict(X_test)
y_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

# Prediction Function
def predict_cancer(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)

    # Apply the same preprocessing used for training
    input_encoded = preprocessor.transform(input_df).toarray()
    input_encoded = pd.DataFrame(input_encoded, columns=preprocessor.get_feature_names_out())

    # Ensure all features match training data
    missing_cols = set(X_smote.columns) - set(input_encoded.columns)
    for col in missing_cols:
        input_encoded[col] = 0  # Add missing columns with zero

    input_encoded = input_encoded[X_smote.columns]  # Ensure correct column order

    # Make prediction
    prediction = grid.best_estimator_.predict(input_encoded)
    prediction_proba = grid.best_estimator_.predict_proba(input_encoded)[:, 1]

    result = "Cancer Detected" if prediction[0] == 1 else "No Cancer Detected"
    confidence = prediction_proba[0]

    return result, confidence

Epoch 0: D Loss Real: 0.5645670890808105, D Loss Fake: 0.6543660759925842, GAN Loss: [array(0.661758, dtype=float32), array(0.5833333, dtype=float32)]
Epoch 100: D Loss Real: 0.480855792760849, D Loss Fake: 0.48154687881469727, GAN Loss: [array(0.48227942, dtype=float32), array(0.65470296, dtype=float32)]
Epoch 200: D Loss Real: 0.4741106927394867, D Loss Fake: 0.4744380712509155, GAN Loss: [array(0.47485518, dtype=float32), array(0.65526533, dtype=float32)]
Epoch 300: D Loss Real: 0.4712628126144409, D Loss Fake: 0.471564918756485, GAN Loss: [array(0.47176713, dtype=float32), array(0.6551772, dtype=float32)]
Epoch 400: D Loss Real: 0.4696388840675354, D Loss Fake: 0.46982988715171814, GAN Loss: [array(0.47001478, dtype=float32), array(0.65544474, dtype=float32)]
Epoch 500: D Loss Real: 0.46857750415802, D Loss Fake: 0.4687296152114868, GAN Loss: [array(0.46888095, dtype=float32), array(0.6552728, dtype=float32)]
Epoch 600: D Loss Real: 0.46783554553985596, D Loss Fake: 0.4679729342460

In [17]:
#30-39,premeno,0-4,0-2,no,2,right,central,no

example_input = {
    'Age': '30-39', 'Menopause': 'premeno', 'Tumor-size': '0-4', 'Inv-nodes': '0-2',
    'Node-caps': 'no', 'Deg-malig': 2,'Breast': 'right', 'Breast-quad': 'central', 'Irradiat': 'no'
}

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: No Cancer Detected, Confidence: 0.48


In [18]:
#50-59,ge40,40-44,6-8,yes,3,left,left_low,yes
example_input = {
    'Age': '50-59', 'Menopause': 'ge40', 'Tumor-size': '40-44', 'Inv-nodes': '6-8',
    'Node-caps': 'yes', 'Deg-malig': 3,'Breast': 'left', 'Breast-quad': 'left_low', 'Irradiat': 'yes'
}

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")

Prediction: Cancer Detected, Confidence: 0.98


recurrent and non-recurrent


In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.combine import SMOTETomek
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, LeakyReLU
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import GridSearchCV

os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"  # Prevents memory allocation issues
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Force CPU if GPU issues exist

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

# Load dataset from UCI URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data"
column_names = ['Class', 'Age', 'Menopause', 'Tumor-size', 'Inv-nodes', 'Node-caps', 'Deg-malig',
                'Breast', 'Breast-quad', 'Irradiat']
data = pd.read_csv(url, names=column_names, na_values='?')

# Handle Missing Values
data.fillna(data.mode().iloc[0], inplace=True)

# Encode Target Variable for Recurrent vs. Non-Recurrent Classification
target_encoder = LabelEncoder()
data['Class'] = target_encoder.fit_transform(data['Class'])  # 0: Non-Recurrent, 1: Recurrent

# Separate Features and Target
X = data.drop(columns=['Class'])
y = data['Class']

# One-Hot Encode Categorical Variables
categorical_features = X.columns.tolist()
preprocessor = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])
X_encoded = preprocessor.fit_transform(X).toarray()  # Convert sparse matrix to dense array

# Convert Encoded Features to DataFrame
X_encoded = pd.DataFrame(X_encoded, columns=preprocessor.get_feature_names_out())

# Balance Dataset using SMOTETomek
smote_tomek = SMOTETomek(sampling_strategy='auto')
X_smote, y_smote = smote_tomek.fit_resample(X_encoded, y)

# Enhanced Classification Model with Random Forest & Grid Search
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {'classifier__n_estimators': [100, 200, 300], 'classifier__max_depth': [10, 20, 30]}
grid = GridSearchCV(pipeline, param_grid, cv=10, scoring='f1', n_jobs=-1)
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, stratify=y_smote, test_size=0.2, random_state=42)
grid.fit(X_train, y_train)

# Model Evaluation
y_pred = grid.best_estimator_.predict(X_test)
y_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

# Prediction Function
def predict_cancer(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)

    # Apply the same preprocessing used for training
    input_encoded = preprocessor.transform(input_df).toarray()
    input_encoded = pd.DataFrame(input_encoded, columns=preprocessor.get_feature_names_out())

    # Ensure all features match training data
    missing_cols = set(X_smote.columns) - set(input_encoded.columns)
    for col in missing_cols:
        input_encoded[col] = 0  # Add missing columns with zero

    input_encoded = input_encoded[X_smote.columns]  # Ensure correct column order

    # Make prediction
    prediction = grid.best_estimator_.predict(input_encoded)
    prediction_proba = grid.best_estimator_.predict_proba(input_encoded)[:, 1]

    result = "Recurrent Breast Cancer" if prediction[0] == 1 else "Non-Recurrent Breast Cancer"
    confidence = prediction_proba[0]

    return result, confidence


              precision    recall  f1-score   support

           0       0.82      0.82      0.82        40
           1       0.82      0.82      0.82        40

    accuracy                           0.82        80
   macro avg       0.82      0.82      0.82        80
weighted avg       0.82      0.82      0.82        80

ROC-AUC Score: 0.9099999999999999


In [3]:
# Example: Non-Recurrent Case
example_input_1 = {
    'Age': '40-49', 'Menopause': 'premeno', 'Tumor-size': '30-34', 'Inv-nodes': '0-2',
    'Node-caps': 'no', 'Deg-malig': 2, 'Breast': 'left', 'Breast-quad': 'left_low', 'Irradiat': 'no'
}

# Example: Recurrent Case
example_input_2 = {
    'Age': '50-59', 'Menopause': 'ge40', 'Tumor-size': '50-54', 'Inv-nodes': '6-8',
    'Node-caps': 'yes', 'Deg-malig': 3, 'Breast': 'right', 'Breast-quad': 'right_up', 'Irradiat': 'yes'
}

# Making Predictions
result_1, confidence_1 = predict_cancer(example_input_1)
print(f"Prediction: {result_1}, Confidence: {confidence_1:.2f}")

result_2, confidence_2 = predict_cancer(example_input_2)
print(f"Prediction: {result_2}, Confidence: {confidence_2:.2f}")


Prediction: Non-Recurrent Breast Cancer, Confidence: 0.18
Prediction: Recurrent Breast Cancer, Confidence: 0.65


In [4]:
#no-recurrence-events,60-69,ge40,30-34,0-2,no,2,left,left_low,yes
example_input_3 = {
    'Age': '60-69', 'Menopause': 'ge40', 'Tumor-size': '30-34', 'Inv-nodes': '0-2',
    'Node-caps': 'no', 'Deg-malig': 2, 'Breast': 'left', 'Breast-quad': 'left_low', 'Irradiat': 'yes'
}

# Making Predictions
result_1, confidence_1 = predict_cancer(example_input_1)
print(f"Prediction: {result_1}, Confidence: {confidence_1:.2f}")

Prediction: Non-Recurrent Breast Cancer, Confidence: 0.18


In [5]:
#recurrence-events,40-49,premeno,30-34,3-5,no,2,right,left_up,no

example_input_4 = {
    'Age': '40-49', 'Menopause': 'premeno', 'Tumor-size': '30-34', 'Inv-nodes': '3-5',
    'Node-caps': 'no', 'Deg-malig': 2, 'Breast': 'right', 'Breast-quad': 'left_up', 'Irradiat': 'no'
}

# Making Predictions
result_1, confidence_1 = predict_cancer(example_input_4)
print(f"Prediction: {result_1}, Confidence: {confidence_1:.2f}")

Prediction: Recurrent Breast Cancer, Confidence: 0.74


In [6]:
#recurrence-events,30-39,premeno,40-44,0-2,no,1,left,left_up,no


example_input = {
    'Age': '30-39', 'Menopause': 'premeno', 'Tumor-size': '40-44', 'Inv-nodes': '0-2',
    'Node-caps': 'no', 'Deg-malig': 1,'Breast': 'left', 'Breast-quad': 'left_up', 'Irradiat': 'no'
}

result, confidence = predict_cancer(example_input)
print(f"Prediction: {result}, Confidence: {confidence:.2f}")


Prediction: Recurrent Breast Cancer, Confidence: 0.76
