In [7]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import shap
import matplotlib.pyplot as plt

# Step 1: Load and preprocess the dataset
data = pd.read_csv('diabetes_prediction_dataset.csv')

# Integer-encode the string-valued columns. One fitted LabelEncoder is kept
# per column in `label_encoders` so the mapping can be looked up later.
categorical_columns = ['gender', 'smoking_history']
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    data[col] = le.fit_transform(data[col])

# Separate the target column from the feature columns.
y = data['diabetes']  # Labels
X = data.drop(columns=['diabetes'])  # Features

# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

class CustomDataset(Dataset):
    """In-memory dataset pairing float32 feature rows with int64 labels.

    Args:
        X: Feature table. A pandas DataFrame/Series, or any array-like that
            ``torch.tensor`` accepts (NumPy array, nested list).
        y: Labels aligned row-for-row with ``X``; same accepted types.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(self._as_array(X), dtype=torch.float32)
        self.y = torch.tensor(self._as_array(y), dtype=torch.long)
        # Fail here rather than with an IndexError in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    @staticmethod
    def _as_array(values):
        """Return the NumPy view of a pandas object; pass anything else through."""
        return values.to_numpy() if hasattr(values, "to_numpy") else values

    def __len__(self):
        """Number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Step 2: Wrap each split in a Dataset, then in a DataLoader.
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

# Only the training batches are shuffled; evaluation keeps the original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Step 3: Define the model
class ClassificationModel(nn.Module):
    """Fully connected classifier: four 1024-unit ReLU layers plus an output layer.

    The forward pass ends in ``LogSoftmax`` over dim 1, so the module returns
    log-probabilities rather than raw logits.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of classes (size of the output layer).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_dim = 1024
        # Attribute names fc1..fc5 are the state_dict keys; keep them stable so
        # saved checkpoints continue to load.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        self.fc5 = nn.Linear(hidden_dim, output_dim)
        # Despite the attribute name, this layer is a *log*-softmax.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, output_dim)`` log-probabilities."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.relu(hidden_layer(x))
        return self.softmax(self.fc5(x))

# Seed torch so weight initialisation and DataLoader shuffling are repeatable
# across "Restart & Run All".
torch.manual_seed(42)

input_dim = X_train.shape[1]
output_dim = len(data['diabetes'].unique())
model = ClassificationModel(input_dim, output_dim)

# Define loss function and optimizer.
# ClassificationModel.forward already ends in LogSoftmax, so the matching
# criterion is NLLLoss. CrossEntropyLoss would apply log-softmax a second time
# (it expects raw logits). If LogSoftmax is ever removed from the model, switch
# this back to CrossEntropyLoss.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Implement learning rate scheduler: halve the learning rate every 10 epochs.
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

num_epochs = 1

# Training loop: per-epoch history of mean batch loss and held-out accuracy.
train_losses = []
test_accuracies = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses (each batch counts equally).
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Evaluate the model on the test set
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            # Predicted class = index of the largest log-probability.
            predicted = outputs.argmax(dim=1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    accuracy = correct / total
    test_accuracies.append(accuracy)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%')

    # Update learning rate
    scheduler.step()

# Persist only the weights; the architecture is rebuilt from code when loading.
torch.save(model.state_dict(), 'testmodel.pth')

# Adding SHAP for interpretation

# Rebuild the architecture and restore the weights written by torch.save,
# then switch to inference mode.
# NOTE(review): torch.load unpickles the file. That is fine for the checkpoint
# this notebook wrote itself, but do not point it at an untrusted file.
model = ClassificationModel(input_dim, output_dim)
model.load_state_dict(torch.load('testmodel.pth'))
model.eval()

# float32 tensor copies of the train and test feature tables.
X_train_tensor, X_test_tensor = (
    torch.tensor(split.to_numpy(), dtype=torch.float32)
    for split in (X_train, X_test)
)

# Create a function to wrap the PyTorch model
def model_wrapper(x):
    """Run the module-level ``model`` on array-like ``x`` and return a NumPy array.

    ``x`` is converted to a float32 tensor, the forward pass runs with gradient
    tracking disabled, and the model output is returned via ``.numpy()``. This
    gives SHAP a plain array-in/array-out callable.
    """
    with torch.no_grad():
        prediction = model(torch.tensor(x, dtype=torch.float32))
    return prediction.numpy()

# Create an explainer for the model using the KernelExplainer.
# A 10-row sample of the training features serves as the background set.
explainer = shap.KernelExplainer(model_wrapper, shap.sample(X_train_tensor.numpy(), 10))

# Calculate SHAP values for the test dataset.
# NOTE(review): nsamples=10 is a very small perturbation budget; the
# "Regressors in active set degenerate" warnings in the recorded output
# presumably stem from it. Consider raising it (and explaining fewer rows).
shap_values = explainer.shap_values(X_test_tensor.numpy(), nsamples=10)

# Pick the attributions for output index 1 so they match expected_value[1].
# Older shap releases return a list with one (n_rows, n_features) array per
# output; newer ones return one (n_rows, n_features, n_outputs) array.
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
else:
    shap_values_class1 = shap_values[:, :, 1]

# Load the JS runtime used by interactive SHAP plots elsewhere in the notebook.
shap.initjs()

# Force plot for the first test row. Base value, SHAP values and features must
# all refer to the same output (1) and the same row (0). matplotlib=True draws
# a real figure, so plt.show() below has something to render; show=False
# leaves the rendering to that call.
shap.force_plot(
    explainer.expected_value[1],
    shap_values_class1[0],
    X_test.iloc[0],
    matplotlib=True,
    show=False,
)

# Display the plots
plt.show()

Epoch [1/1], Loss: 0.2880, Test Accuracy: 92.96%


  0%|          | 0/10000 [00:00<?, ?it/s]

Regressors in active set degenerate. Dropping a regressor, after 3 iterations, i.e. alpha=1.594e-03, with an active set of 3 regressors, and the smallest cholesky pivot element being 5.960e-08. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 3 iterations, i.e. alpha=1.594e-03, with an active set of 3 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 4 iterations, i.e. alpha=1.116e-03, with an active set of 4 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 4 iterations, i.e. alpha=4.778e-04, with an active set of 4 regressors, and the smallest cholesky pivot element being 5.960e-08. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a 

In [8]:
# Save the stacked force plot for the whole test set as standalone HTML.
# The SHAP values must belong to the same output as the base value
# (expected_value[1]); the previous shap_values[0] paired the class-1 base value
# with class-0 attributions. Both shap return layouts are handled: a list of
# per-output arrays, or one (n_rows, n_features, n_outputs) array.
# NOTE(review): shap warns that this plot is slow for many thousands of rows;
# slicing both arguments to the same row subset would speed it up.
class1_shap_values = shap_values[1] if isinstance(shap_values, list) else shap_values[:, :, 1]
shap.save_html("shap_visualizations.html", shap.force_plot(explainer.expected_value[1], class1_shap_values, X_test))

shap.plots.force is slow for many thousands of rows, try subsampling your data.
