# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, precision_recall_curve, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = '/content/Lip feature.xlsx'  # Replace with the actual path
df = pd.read_excel(file_path)

# Separate features and target
target_column = 'Survived >5 Years'
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify categorical and numerical columns
categorical_columns = ['Sex(Female)', 'Sex(Male)', 'Diagnostic mapping', 
                       'Marital status mapping', 'AVA mapping', 'Race maping']
numerical_columns = [col for col in X.columns if col not in categorical_columns]

# Split BEFORE fitting any transformer so that the encoder categories and the
# scaling statistics are learned from the training rows only (no test-set
# leakage). The split arguments are unchanged, so the same rows end up in the
# train and test partitions as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing: numerical columns pass through, categorical columns are
# one-hot encoded. handle_unknown='ignore' keeps transform() from raising when
# the test rows contain a category that never occurred in the training rows
# (combining it with drop= needs a scikit-learn release that supports it).
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numerical_columns),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_columns)
])

# Standardize features (with_mean=False so a sparse encoder output is accepted)
scaler = StandardScaler(with_mean=False)

# Fit on the training rows only, then apply the fitted transforms to the test rows
X_train = scaler.fit_transform(preprocessor.fit_transform(X_train_raw))
X_test = scaler.transform(preprocessor.transform(X_test_raw))

# Full-dataset views, kept under their original names for any later cell that
# reads them; they are produced by the train-fitted transformers.
X_processed = preprocessor.transform(X)
X_scaled = scaler.transform(X_processed)

# Address class imbalance (oversample the training partition only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Compute class weights, keyed by the actual class label rather than by the
# position of the label in the sorted unique array.
class_labels = np.unique(y_train)
balanced_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(class_labels, balanced_weights)}

# Define the neural network: eight Dense -> BatchNormalization -> Dropout
# stages followed by a single sigmoid unit for binary classification.
n_features = X_train_resampled.shape[1]
dropout_rate = 0.3
l2_penalty = 0.01

# (units, apply L2 kernel regularization?) for each hidden stage, in order.
hidden_spec = [
    (512, True),
    (1024, True),
    (512, False),
    (256, False),
    (128, False),
    (64, False),
    (32, False),
    (16, False),
]

network_layers = [tf.keras.layers.InputLayer(input_shape=(n_features,))]
for hidden_units, use_l2 in hidden_spec:
    # Only the first two stages carry an L2 penalty; the rest use the Dense
    # default of no kernel regularizer.
    kernel_penalty = tf.keras.regularizers.l2(l2_penalty) if use_l2 else None
    network_layers.extend([
        tf.keras.layers.Dense(hidden_units, activation='relu', kernel_regularizer=kernel_penalty),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(dropout_rate),
    ])

# Output layer
network_layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))  # Binary classification

model = tf.keras.Sequential(network_layers)

# Compile the model: Adam optimizer, binary cross-entropy loss, and accuracy
# plus area under the precision-recall curve as tracked metrics.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
pr_auc_metric = tf.keras.metrics.AUC(name='auc_pr', curve='PR')
model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', pr_auc_metric],
)

# Train the model
#
# Keras' validation_split takes the LAST fraction of the arrays it is given,
# before any shuffling. On SMOTE output the trailing rows are the synthetic
# minority samples, so that validation set would not resemble real data.
# Instead, hold out a stratified validation set from the real training rows
# and oversample only the rows that are actually used for fitting.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
X_fit_resampled, y_fit_resampled = SMOTE(random_state=42).fit_resample(X_fit, y_fit)

# NOTE(review): class_weights is applied on top of SMOTE's balancing, so the
# minority class is up-weighted twice - confirm that this is intended.
history = model.fit(
    X_fit_resampled, y_fit_resampled,
    epochs=200,
    batch_size=64,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    verbose=1
)

# Evaluate the model on the held-out test set
evaluation_results = model.evaluate(X_test, y_test, verbose=0)

# Unpack the metrics in compile order: loss, accuracy, AUC-PR
test_loss, test_accuracy, test_auc_pr = evaluation_results[:3]

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test AUC-PR: {test_auc_pr}")

# Predict and tune threshold
# NOTE(review): the threshold is chosen on the same test set that is scored
# below, so the reported F1 is optimistic; prefer tuning on validation data.
y_pred_probs = model.predict(X_test)
y_scores = np.ravel(y_pred_probs)  # one probability per sample, 1-D
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-7)
# precision/recall carry one extra trailing point that has no threshold, so
# only the first len(thresholds) F1 values are candidates.
optimal_threshold = thresholds[np.argmax(f1_scores[:-1])]

# Predict using optimal threshold. precision_recall_curve defines each point
# as "score >= threshold", so the comparison must be inclusive to reproduce
# the F1 that was selected above.
y_pred = (y_scores >= optimal_threshold).astype(int)

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

def _plot_metric(train_key, val_key, train_label, val_label, title, y_label):
    """Draw the train and validation curves of one metric on the current axes."""
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()


# First figure: loss (left panel) and accuracy (right panel) side by side
paired_panels = [
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Training & Validation Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'Training & Validation Accuracy', 'Accuracy'),
]
plt.figure(figsize=(12, 6))
for panel_index, panel_args in enumerate(paired_panels, start=1):
    plt.subplot(1, 2, panel_index)
    _plot_metric(*panel_args)

plt.tight_layout()
plt.show()

# Second figure: training & validation AUC-PR on a single set of axes
plt.figure(figsize=(12, 6))
_plot_metric('auc_pr', 'val_auc_pr', 'Train AUC-PR', 'Validation AUC-PR',
             'Training & Validation AUC-PR', 'AUC-PR')
plt.show()


# Captured cell output: ModuleNotFoundError: No module named 'pandas'
# (pandas was not installed in the environment that executed this cell)