In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from imblearn.combine import SMOTEENN
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
import shap
import json
# Set random seeds for reproducibility (NumPy and TensorFlow global generators)
np.random.seed(42)
tf.random.set_seed(42)

# Load the dataset
file_path = '/content/Gestational Diabetic Dat Set.csv'
data = pd.read_csv(file_path)

# Rename the target column for simplicity
data.rename(columns={"Class Label(GDM /Non GDM)": "Target"}, inplace=True)

# Drop unnecessary columns; errors='ignore' tolerates a file that lacks this column
data = data.drop(columns=["Case Number"], errors='ignore')

# Handle missing values by replacing them with the column mean.
# numeric_only=True keeps the mean restricted to numeric columns: without it,
# pandas >= 2.0 raises TypeError as soon as any column holds strings.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Display dataset summary
print("Dataset Overview:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()
print("\nFirst 5 Rows:")
print(data.head())
# Generate visualizations during preprocessing

# Histograms for feature distributions.
# One histogram per numeric column (20 bins each), saved to
# "feature_distributions.png".
data.hist(figsize=(16, 12), bins=20, color='skyblue', edgecolor='black')
plt.suptitle('Feature Distributions', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # keep the top 5% of the canvas free for the suptitle
plt.savefig("feature_distributions.png")
plt.show()
# Close the figure explicitly: outside an inline notebook backend plt.show()
# does not release it, and this cell creates well over 20 figures in total.
plt.close()

# Pair plot for relationships.
# Pairwise scatter plots coloured by "Target", with KDE curves on the diagonal,
# saved to "pair_plot.png".
sns.pairplot(data, hue="Target", palette="Set2", diag_kind="kde")
plt.savefig("pair_plot.png")
plt.show()
plt.close()

# Correlation heatmap.
# Annotated heatmap (coolwarm colour map) of the pairwise correlations,
# saved to "correlation_heatmap.png".
plt.figure(figsize=(12, 10))
# Correlate numeric columns only: DataFrame.corr() raises on string columns in
# pandas >= 2.0, while older versions silently dropped them. Selecting the
# numeric columns first gives the same matrix on every version.
correlation_matrix = data.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap")
plt.savefig("correlation_heatmap.png")
plt.show()
plt.close()

# Box plots to detect outliers.
# One box plot per float64/int64 column, grouped by "Target" ("Set1" palette);
# each plot is saved as "box_plot_<feature>.png".
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
for feature in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=data, x="Target", y=feature, palette="Set1")
    plt.title(f"Box Plot of {feature} by Target")
    plt.savefig(f"box_plot_{feature}.png")
    plt.show()
    plt.close()  # one figure per feature; release it before the next iteration

# Analyze the Target column distribution.
# The statements that follow print how many rows belong to each class of
# "Target" before any resampling (SMOTE-ENN) is applied, using value_counts(),
# to show how imbalanced the dataset is.
print("\nTarget Class Distribution (Before SMOTE-ENN):")
class_counts = data['Target'].value_counts()
# Report every label that is actually present instead of indexing with the
# literals 0 and 1: that lookup raises KeyError (or falls back to positional
# access) when the target holds string labels or when one class is absent.
# For integer 0/1 labels the printed lines are the same as before.
for class_label, class_count in class_counts.sort_index().items():
    print(f"Class {class_label}: {class_count}")
# Feature-target split
X = data.drop(columns=["Target"])
y = data["Target"]

# Encode target if it's categorical
if y.dtypes == 'object':
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Split BEFORE scaling, resampling and feature selection.
# Fitting those steps on the full dataset leaks test information into training,
# and resampling before the split puts SMOTE-generated synthetic rows (and
# ENN-edited neighbourhoods) into the test set, which inflates every reported
# metric. The held-out rows below are untouched, real observations.
# stratify=y keeps the original class ratio in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
y_test = np.asarray(y_test)

# Standardize the features for better model performance.
# The mean/std are learned from the training partition only and then applied
# unchanged to the test partition.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Apply SMOTE-ENN to balance the training partition only
smote_enn = SMOTEENN(random_state=42)
X_smote_enn, y_smote_enn = smote_enn.fit_resample(X_train_scaled, y_train_raw)
y_smote_enn = np.asarray(y_smote_enn)

# Feature selection (ANOVA F-score), fitted on the resampled training data.
# k is capped at the number of available columns so that a dataset with fewer
# than 10 features does not make SelectKBest raise.
selector = SelectKBest(score_func=f_classif, k=min(10, X_smote_enn.shape[1]))
X_smote_enn = selector.fit_transform(X_smote_enn, y_smote_enn)

# Analyze the Target column distribution after SMOTE-ENN (training partition)
print("\nTarget Class Distribution (After SMOTE-ENN):")
class_counts_smote_enn = pd.Series(y_smote_enn).value_counts()
for class_label, class_count in class_counts_smote_enn.sort_index().items():
    print(f"Class {class_label}: {class_count}")

# Final train/test matrices for the scikit-learn models
X_train, y_train = X_smote_enn, y_smote_enn
X_test = selector.transform(X_test_scaled)

# Reshape data for Conv1D: (samples, steps, channels) with a single channel
X_smote_enn_cnn = X_smote_enn[..., np.newaxis]
X_train_cnn = X_smote_enn_cnn
X_test_cnn = X_test[..., np.newaxis]
# Build the CNN model
# Layer stack, in order:
#   Conv1D(128 filters, kernel 3, ReLU) -> MaxPooling1D(2)
#   Conv1D(64 filters, kernel 3, ReLU)  -> MaxPooling1D(2)
#   Flatten -> Dense(128, ReLU) -> Dropout(0.4) -> Dense(1, sigmoid)
# The single sigmoid unit yields one probability per sample for the binary
# target. Each selected feature is fed as one step of a single-channel sequence.
n_steps = X_train_cnn.shape[1]
cnn_layers = [
    Conv1D(128, kernel_size=3, activation='relu', input_shape=(n_steps, 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
]
cnn_model = Sequential(cnn_layers)

# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
cnn_optimizer = Adam(learning_rate=0.001)
cnn_model.compile(
    optimizer=cnn_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train CNN model; the test split is passed as validation data, so its
# loss/accuracy are reported after every epoch
cnn_history = cnn_model.fit(
    X_train_cnn,
    y_train,
    validation_data=(X_test_cnn, y_test),
    epochs=150,
    batch_size=32,
    verbose=1,
)

# Persist the trained network in HDF5 format
cnn_model.save("cnn_model.h5")

# Hyperparameter tuning for SVM
# Grid search over C, kernel and gamma with 5-fold cross-validation, scored by
# accuracy. probability=True makes the SVM expose predict_proba, which the
# soft-voting ensemble needs.
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}
svm = SVC(probability=True, random_state=42)
grid_svm = GridSearchCV(estimator=svm, param_grid=svm_params, cv=5, scoring='accuracy')
grid_svm.fit(X_train, y_train)
svm_best = grid_svm.best_estimator_

# Train Random Forest model with hyperparameter tuning
# Grid search over the number of trees and the maximum tree depth, again with
# 5-fold cross-validation and accuracy scoring.
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
}
rf_base = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(estimator=rf_base, param_grid=rf_params, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)
rf_best = grid_rf.best_estimator_

# Gradient Boosting Classifier
# Fixed configuration (no grid search): 100 boosting stages, learning rate 0.1,
# trees of depth 3, random_state=42 for reproducibility.
gbm_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbm_model.fit(X_train, y_train)
# Combine models using VotingClassifier
# Soft voting averages the class probabilities predicted by the tuned SVM, the
# tuned Random Forest and the Gradient Boosting model.
base_estimators = [
    ('svm', svm_best),
    ('rf', rf_best),
    ('gbm', gbm_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model (mean accuracy on the test split)
ensemble_accuracy = ensemble_model.score(X_test, y_test)
print(f"\nEnsemble Model Accuracy: {ensemble_accuracy:.4f}")

# Predictions
# The CNN emits sigmoid probabilities, thresholded at 0.5 to obtain labels.
# Only ensemble_pred is consumed by the code shown below; the per-model
# predictions are kept as module-level names.
cnn_probabilities = cnn_model.predict(X_test_cnn)
cnn_pred = (cnn_probabilities > 0.5).astype("int32")
svm_pred = svm_best.predict(X_test)
rf_pred = rf_best.predict(X_test)
gbm_pred = gbm_model.predict(X_test)
ensemble_pred = ensemble_model.predict(X_test)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, ensemble_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig("confusion_matrix_ensemble.png")
plt.show()

# Classification Report
# Built twice: as a dict for the JSON dump and as text for the console.
report = classification_report(y_test, ensemble_pred, output_dict=True)
print("\nClassification Report:")
print(classification_report(y_test, ensemble_pred))

# Save classification report
with open("classification_report_ensemble.json", "w") as report_file:
    json.dump(report, report_file)

# ROC-AUC Curve
# The statements that follow take the ensemble's predicted probabilities for
# the test set, derive the false/true positive rates with roc_curve, compute
# the area under the curve with auc(), and save the plot as
# "roc_curve_ensemble.png".
# ensemble_model has already been fitted on (X_train, y_train) by the
# "Train the ensemble model" step earlier in this cell. The second, identical
# fit() that used to sit here retrained the SVM, the Random Forest and the
# Gradient Boosting model for exactly the same result, so it is dropped.

# Generate predicted probabilities for the test set (column 1 = positive class)
ensemble_pred_prob = ensemble_model.predict_proba(X_test)[:, 1]

# Calculate FPR, TPR, and AUC for the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, ensemble_pred_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")  # diagonal = chance-level classifier
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.savefig("roc_curve_ensemble.png")
plt.show()
# Explainability using SHAP
# A TreeExplainer computes SHAP values of the tuned Random Forest (rf_best) for
# the test set, and a summary plot visualizes each feature's contribution.
explainer = shap.TreeExplainer(rf_best)
shap_values = explainer.shap_values(X_test)

# X_test only holds the columns kept by SelectKBest, so the labels must be
# filtered with the selector's support mask. Passing every original column
# name (data.columns[:-1]) attached the wrong names to the plotted features and
# also assumed that "Target" is the last column.
feature_columns = data.drop(columns=["Target"]).columns
selected_feature_names = feature_columns[selector.get_support()].tolist()
shap.summary_plot(shap_values, X_test, feature_names=selected_feature_names)




In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
# SMOTEENN is a class in imblearn.combine. A bare "import SMOTEENN" looks for a
# top-level module of that name and fails, and this cell calls
# SMOTEENN(random_state=42) as a class.
from imblearn.combine import SMOTEENN
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
import shap
import json
# Set random seeds for reproducibility (NumPy and TensorFlow global generators)
np.random.seed(42)
tf.random.set_seed(42)

# Load the dataset
file_path = 'Gestational Diabetic Dat Set (1)-1.csv'
data = pd.read_csv(file_path)

# Rename the target column for simplicity
data.rename(columns={"Class Label(GDM /Non GDM)": "Target"}, inplace=True)

# Drop unnecessary columns; errors='ignore' tolerates a file that lacks this column
data = data.drop(columns=["Case Number"], errors='ignore')

# Handle missing values by replacing them with the column mean.
# numeric_only=True keeps the mean restricted to numeric columns: without it,
# pandas >= 2.0 raises TypeError as soon as any column holds strings.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Display dataset summary
print("Dataset Overview:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()
print("\nFirst 5 Rows:")
print(data.head())
# Generate visualizations during preprocessing

# Histograms for feature distributions.
# One histogram per numeric column (20 bins each), saved to
# "feature_distributions.png".
data.hist(figsize=(16, 12), bins=20, color='skyblue', edgecolor='black')
plt.suptitle('Feature Distributions', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # keep the top 5% of the canvas free for the suptitle
plt.savefig("feature_distributions.png")
plt.show()
# Close the figure explicitly: outside an inline notebook backend plt.show()
# does not release it, and this cell creates well over 20 figures in total.
plt.close()

# Pair plot for relationships.
# Pairwise scatter plots coloured by "Target", with KDE curves on the diagonal,
# saved to "pair_plot.png".
sns.pairplot(data, hue="Target", palette="Set2", diag_kind="kde")
plt.savefig("pair_plot.png")
plt.show()
plt.close()

# Correlation heatmap.
# Annotated heatmap (coolwarm colour map) of the pairwise correlations,
# saved to "correlation_heatmap.png".
plt.figure(figsize=(12, 10))
# Correlate numeric columns only: DataFrame.corr() raises on string columns in
# pandas >= 2.0, while older versions silently dropped them. Selecting the
# numeric columns first gives the same matrix on every version.
correlation_matrix = data.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap")
plt.savefig("correlation_heatmap.png")
plt.show()
plt.close()

# Box plots to detect outliers.
# One box plot per float64/int64 column, grouped by "Target" ("Set1" palette);
# each plot is saved as "box_plot_<feature>.png".
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
for feature in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=data, x="Target", y=feature, palette="Set1")
    plt.title(f"Box Plot of {feature} by Target")
    plt.savefig(f"box_plot_{feature}.png")
    plt.show()
    plt.close()  # one figure per feature; release it before the next iteration

# Analyze the Target column distribution.
# The statements that follow print how many rows belong to each class of
# "Target" before any resampling (SMOTE-ENN) is applied, using value_counts(),
# to show how imbalanced the dataset is.
print("\nTarget Class Distribution (Before SMOTE-ENN):")
class_counts = data['Target'].value_counts()
# Report every label that is actually present instead of indexing with the
# literals 0 and 1: that lookup raises KeyError (or falls back to positional
# access) when the target holds string labels or when one class is absent.
# For integer 0/1 labels the printed lines are the same as before.
for class_label, class_count in class_counts.sort_index().items():
    print(f"Class {class_label}: {class_count}")
# Feature-target split
X = data.drop(columns=["Target"])
y = data["Target"]

# Encode target if it's categorical
if y.dtypes == 'object':
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Split BEFORE scaling, resampling and feature selection.
# Fitting those steps on the full dataset leaks test information into training,
# and resampling before the split puts SMOTE-generated synthetic rows (and
# ENN-edited neighbourhoods) into the test set, which inflates every reported
# metric. The held-out rows below are untouched, real observations.
# stratify=y keeps the original class ratio in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
y_test = np.asarray(y_test)

# Standardize the features for better model performance.
# The mean/std are learned from the training partition only and then applied
# unchanged to the test partition.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Apply SMOTE-ENN to balance the training partition only
smote_enn = SMOTEENN(random_state=42)
X_smote_enn, y_smote_enn = smote_enn.fit_resample(X_train_scaled, y_train_raw)
y_smote_enn = np.asarray(y_smote_enn)

# Feature selection (ANOVA F-score), fitted on the resampled training data.
# k is capped at the number of available columns so that a dataset with fewer
# than 10 features does not make SelectKBest raise.
selector = SelectKBest(score_func=f_classif, k=min(10, X_smote_enn.shape[1]))
X_smote_enn = selector.fit_transform(X_smote_enn, y_smote_enn)

# Analyze the Target column distribution after SMOTE-ENN (training partition)
print("\nTarget Class Distribution (After SMOTE-ENN):")
class_counts_smote_enn = pd.Series(y_smote_enn).value_counts()
for class_label, class_count in class_counts_smote_enn.sort_index().items():
    print(f"Class {class_label}: {class_count}")

# Final train/test matrices for the scikit-learn models
X_train, y_train = X_smote_enn, y_smote_enn
X_test = selector.transform(X_test_scaled)

# Reshape data for Conv1D: (samples, steps, channels) with a single channel
X_smote_enn_cnn = X_smote_enn[..., np.newaxis]
X_train_cnn = X_smote_enn_cnn
X_test_cnn = X_test[..., np.newaxis]
# Build the CNN model
# Layer stack, in order:
#   Conv1D(128 filters, kernel 3, ReLU) -> MaxPooling1D(2)
#   Conv1D(64 filters, kernel 3, ReLU)  -> MaxPooling1D(2)
#   Flatten -> Dense(128, ReLU) -> Dropout(0.4) -> Dense(1, sigmoid)
# The single sigmoid unit yields one probability per sample for the binary
# target. Each selected feature is fed as one step of a single-channel sequence.
n_steps = X_train_cnn.shape[1]
cnn_layers = [
    Conv1D(128, kernel_size=3, activation='relu', input_shape=(n_steps, 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
]
cnn_model = Sequential(cnn_layers)

# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
cnn_optimizer = Adam(learning_rate=0.001)
cnn_model.compile(
    optimizer=cnn_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train CNN model; the test split is passed as validation data, so its
# loss/accuracy are reported after every epoch
cnn_history = cnn_model.fit(
    X_train_cnn,
    y_train,
    validation_data=(X_test_cnn, y_test),
    epochs=150,
    batch_size=32,
    verbose=1,
)

# Persist the trained network in HDF5 format
cnn_model.save("cnn_model.h5")

# Hyperparameter tuning for SVM
# Grid search over C, kernel and gamma with 5-fold cross-validation, scored by
# accuracy. probability=True makes the SVM expose predict_proba, which the
# soft-voting ensemble needs.
svm_params = {
    'C': [0.1, 1, 15],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}
svm = SVC(probability=True, random_state=42)
grid_svm = GridSearchCV(estimator=svm, param_grid=svm_params, cv=5, scoring='accuracy')
grid_svm.fit(X_train, y_train)
svm_best = grid_svm.best_estimator_

# Train Random Forest model with hyperparameter tuning
# Grid search over the number of trees and the maximum tree depth, again with
# 5-fold cross-validation and accuracy scoring.
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
}
rf_base = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(estimator=rf_base, param_grid=rf_params, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)
rf_best = grid_rf.best_estimator_

# Gradient Boosting Classifier
# Fixed configuration (no grid search): 100 boosting stages, learning rate 0.1,
# trees of depth 3, random_state=42 for reproducibility.
gbm_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbm_model.fit(X_train, y_train)
# Combine models using VotingClassifier
# Soft voting averages the class probabilities predicted by the tuned SVM, the
# tuned Random Forest and the Gradient Boosting model.
base_estimators = [
    ('svm', svm_best),
    ('rf', rf_best),
    ('gbm', gbm_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model (mean accuracy on the test split)
ensemble_accuracy = ensemble_model.score(X_test, y_test)
print(f"\nEnsemble Model Accuracy: {ensemble_accuracy:.4f}")

# Predictions
# The CNN emits sigmoid probabilities, thresholded at 0.5 to obtain labels.
# Only ensemble_pred is consumed by the code shown below; the per-model
# predictions are kept as module-level names.
cnn_probabilities = cnn_model.predict(X_test_cnn)
cnn_pred = (cnn_probabilities > 0.5).astype("int32")
svm_pred = svm_best.predict(X_test)
rf_pred = rf_best.predict(X_test)
gbm_pred = gbm_model.predict(X_test)
ensemble_pred = ensemble_model.predict(X_test)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, ensemble_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig("confusion_matrix_ensemble.png")
plt.show()

# Classification Report
# Built twice: as a dict for the JSON dump and as text for the console.
report = classification_report(y_test, ensemble_pred, output_dict=True)
print("\nClassification Report:")
print(classification_report(y_test, ensemble_pred))

# Save classification report
with open("classification_report_ensemble.json", "w") as report_file:
    json.dump(report, report_file)

# ROC-AUC Curve
# The statements that follow take the ensemble's predicted probabilities for
# the test set, derive the false/true positive rates with roc_curve, compute
# the area under the curve with auc(), and save the plot as
# "roc_curve_ensemble.png".
# ensemble_model has already been fitted on (X_train, y_train) by the
# "Train the ensemble model" step earlier in this cell. The second, identical
# fit() that used to sit here retrained the SVM, the Random Forest and the
# Gradient Boosting model for exactly the same result, so it is dropped.

# Generate predicted probabilities for the test set (column 1 = positive class)
ensemble_pred_prob = ensemble_model.predict_proba(X_test)[:, 1]

# Calculate FPR, TPR, and AUC for the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, ensemble_pred_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")  # diagonal = chance-level classifier
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.savefig("roc_curve_ensemble.png")
plt.show()
# Explainability using SHAP
# A TreeExplainer computes SHAP values of the tuned Random Forest (rf_best) for
# the test set, and a summary plot visualizes each feature's contribution.
explainer = shap.TreeExplainer(rf_best)
shap_values = explainer.shap_values(X_test)

# X_test only holds the columns kept by SelectKBest, so the labels must be
# filtered with the selector's support mask. Passing every original column
# name (data.columns[:-1]) attached the wrong names to the plotted features and
# also assumed that "Target" is the last column.
feature_columns = data.drop(columns=["Target"]).columns
selected_feature_names = feature_columns[selector.get_support()].tolist()
shap.summary_plot(shap_values, X_test, feature_names=selected_feature_names)




In [3]:
import pickle

In [4]:
# Serialize the tuned SVM estimator to svm_best.pkl
with open('svm_best.pkl', 'wb') as svm_file:
    pickle.dump(svm_best, svm_file)

In [5]:
# Serialize the tuned Random Forest estimator to rf_best.pkl
with open('rf_best.pkl', 'wb') as rf_file:
    pickle.dump(rf_best, rf_file)

In [6]:
# Serialize the fitted soft-voting ensemble to ensemble_model.pkl
with open('ensemble_model.pkl', 'wb') as ensemble_file:
    pickle.dump(ensemble_model, ensemble_file)