In [None]:
# Attach Google Drive to the Colab runtime; later cells read the CSV data
# from, and write model checkpoints to, paths under /content/drive/MyDrive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Identifiers of the Google Sheet used by later cells:
#   sheet_id       -> spreadsheet key handed to gc.open_by_key(...)
#   worksheet_name -> tab title handed to sh.worksheet(...)
sheet_id, worksheet_name = (
    "10owNxo07iXD2Al0IFtifrNBAjeoyWpHSOV18ycatZ8U",
    "Sheet1",
)


The code defines the Google Sheets ID (sheet_id) and the worksheet name (worksheet_name) that later cells use to open the results spreadsheet.

In [None]:
!pip install --upgrade gspread
!pip install pandas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import andrews_curves
import seaborn as sns
import numpy as np
from google.colab import auth
import gspread
from google.auth import default
import os
from google.colab import files
import torch
import torch.nn as nn
import torch.optim as optim
from imblearn.over_sampling import ADASYN
from sklearn.impute import SimpleImputer
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import random

# Locations of the pre-split training and validation CSVs on the mounted Drive.
path_train = '/content/drive/MyDrive/Data/train_data.csv'
path_val = '/content/drive/MyDrive/Data/val_data.csv'

# Read both splits (training first, then validation) into DataFrames.
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (path_train, path_val))

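The code imports the libraries used throughout the notebook and loads the training and validation datasets from Google Drive into pandas DataFrames (train_df and val_df) so they can be prepared for the model.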

In [None]:
# Authenticate the Colab user and build an authorised gspread client from the
# default Google credentials.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Open the tracking spreadsheet and the worksheet that experiment results are
# appended to (see run_model_experiment).
sh = gc.open_by_key(sheet_id)
worksheet = sh.worksheet(worksheet_name)

# Fetch the existing rows exactly once. The original cell also called
# sh.worksheets() and worksheet.get_all_records() with the results discarded,
# which only cost extra Sheets API round-trips.
df = pd.DataFrame(worksheet.get_all_records())

The code retrieves all records from the worksheet, and converts them into a pandas DataFrame (df).

In [None]:
# Column selectors: `y` names the label column, `X` is every other column
# (Index.difference returns them sorted).
y = ['diabetes']
X = train_df.columns.difference(y)

# Input features of the training split, with a preview.
X_train = train_df[X]
print('X_train, our input variables:', X_train.head(), '', sep='\n')

# Label column of the training split (kept as a one-column DataFrame), with a preview.
y_train = train_df[y]
print('y_train, our output variable:', y_train.head(), sep='\n')

X_train, our input variables:
   HbA1c_level   age  blood_glucose_level    bmi  gender  heart_disease  \
0          4.0  36.0                  145  17.06     1.0              0   
1          6.1  58.0                  159  37.83     0.0              0   
2          5.0  43.0                  160  39.20     0.0              0   
3          5.7  67.0                  159  28.39     1.0              0   
4          6.0   5.0                   80  27.32     0.0              0   

   hypertension  smoking_history_current  smoking_history_ever  \
0             0                        0                     0   
1             0                        0                     0   
2             0                        0                     0   
3             1                        0                     0   
4             0                        0                     0   

   smoking_history_former  smoking_history_never  smoking_history_not current  
0                       0                 

The code prepares the training data by specifying which columns are the input features (every column except 'diabetes') and which column is the output label ('diabetes').

In [None]:
# Learn per-column means from the training features, then fill the missing
# training values with them. `imputer` stays fitted for later reuse.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)

In [None]:
# Oversample the training split with ADASYN; the fixed random_state keeps the
# resampling repeatable across runs.
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X=X_train_imputed, y=y_train)

In [None]:
# Apply the same feature/label column selectors to the validation DataFrame.
X_val, y_val = val_df[X], val_df[y]

In [None]:
# Standardise the features using statistics learned from the (oversampled)
# training data only.
scaler = StandardScaler()
X_adasyn_scaled = scaler.fit_transform(X_adasyn)

# The validation features must go through the same preprocessing as the
# training features. Previously they skipped the mean imputation, so any NaN in
# val_df would flow straight into the model and turn the validation loss into
# NaN. Reuse the imputer fitted on the training data (never refit on validation
# data), then scale. This also hands the scaler a plain array, matching how it
# was fitted.
X_val_imputed = imputer.transform(X_val)
X_val_scaled = scaler.transform(X_val_imputed)

The code uses a StandardScaler to standardize the features of two datasets, X_adasyn and X_val. The fit_transform method scales the training data (X_adasyn), while the transform method scales the validation data (X_val) using the parameters learned from the training data.

In [None]:
# Turn the scaled feature arrays and the label frames into float32 tensors.
# Order of the targets on the left matches the order of the sources below.
X_adasyn_tensor, y_adasyn_tensor, X_val_tensor, y_val_tensor = (
    torch.tensor(source, dtype=torch.float32)
    for source in (X_adasyn_scaled, y_adasyn.values, X_val_scaled, y_val.values)
)

The code converts the scaled feature arrays (X_adasyn_scaled, X_val_scaled) and the label columns (y_adasyn, y_val) into PyTorch tensors (X_adasyn_tensor, y_adasyn_tensor, X_val_tensor, y_val_tensor) with a data type of float32.

In [None]:
class DiabetesPredictionModel(nn.Module):
    """1-D convolutional feature extractor followed by a fully connected head.

    The network expects input of shape (batch, 1, input_size) and returns one
    raw logit per sample, shape (batch, 1). No sigmoid is applied here, so pair
    it with a logits-based loss such as nn.BCEWithLogitsLoss and apply
    torch.sigmoid (or threshold the logit at 0) to obtain class predictions.

    Args:
        input_size: number of features per sample (length of the 1-D signal).
    """

    def __init__(self, input_size):
        super().__init__()
        # Two Conv -> BatchNorm -> ReLU -> MaxPool stages; each pooling halves
        # the length, and Flatten yields (batch, 64 * pooled_length).
        self.conv_stack = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Flatten()
        )

        # Calculate the output size of the convolutional layers
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, input_size)
            conv_output_size = self._get_conv_output_size(dummy_input)

        # MLP head with dropout, ending in a single logit.
        self.linear_stack = nn.Sequential(
            nn.Linear(conv_output_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
        )

    def _get_conv_output_size(self, x):
        """Return the flattened feature count conv_stack produces for `x`.

        The probe runs in eval mode: in training mode BatchNorm would update
        its running mean/variance (and batch counter) from the dummy input,
        and torch.no_grad() does not prevent that. The previous training flag
        is restored afterwards.
        """
        was_training = self.conv_stack.training
        self.conv_stack.eval()
        try:
            with torch.no_grad():
                features = self.conv_stack(x)
        finally:
            self.conv_stack.train(was_training)
        return features.view(features.size(0), -1).size(1)

    def forward(self, x):
        """Map input of shape (batch, 1, input_size) to logits of shape (batch, 1)."""
        x = self.conv_stack(x)
        logits = self.linear_stack(x)
        return logits

The code defines a neural network model (DiabetesPredictionModel) using PyTorch's nn.Module. It consists of convolutional layers followed by batch normalization, ReLU activation, and max pooling. The convolutional layers are then flattened, and the output is passed through fully connected layers with ReLU activation, dropout, and a final linear layer for binary classification. The model is designed for predicting diabetes based on input data with a specified size.

In [None]:
# Dataset wrapper shared by the training and validation loaders
class DiabetesDataset(Dataset):
    """Pairs a feature container with a label container, sample by sample.

    Both containers only need to support indexing; the dataset length is taken
    from the feature container.
    """

    def __init__(self, X, y):
        # Store references only; nothing is copied or converted here.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, i.e. the length of the feature container."""
        features = self.X
        return len(features)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        sample = self.X[index]
        label = self.y[index]
        return sample, label

The code creates a PyTorch dataset class (DiabetesDataset) for managing input features (X) and labels (y). It includes methods for accessing individual items and obtaining the dataset's length.

In [None]:
# Make sure the Drive folders for saved models exist. run_model_experiment
# writes its checkpoints to .../MyDrive/Models, which the original cell never
# created (it only created NN_Models), so torch.save could fail on a fresh
# Drive. NN_Models is still created in case other code relies on it.
for checkpoint_dir in ('/content/drive/MyDrive/NN_Models', '/content/drive/MyDrive/Models'):
    os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# Wrap the tensors in datasets. Conv1d consumes (batch, channels, length), so a
# singleton channel axis is inserted at position 1 of each feature tensor.
adasyn_dataset = DiabetesDataset(torch.unsqueeze(X_adasyn_tensor, dim=1), y_adasyn_tensor)
val_dataset = DiabetesDataset(torch.unsqueeze(X_val_tensor, dim=1), y_val_tensor)

# Mini-batch loaders: training batches are reshuffled every epoch, validation
# batches keep their original order.
adasyn_loader = DataLoader(dataset=adasyn_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64)

The code sets up PyTorch datasets (adasyn_dataset and val_dataset) and corresponding data loaders (adasyn_loader and val_loader). The input features are adjusted to include an additional dimension for input channels using unsqueeze(1). The DataLoader is configured with batch sizes and optional shuffling for the training dataset.

In [None]:
# Number of experiments launched per execution of this cell; the run loop at
# the bottom iterates range(max_list_length).
max_list_length = 5

# Define the run_model_experiment function
def run_model_experiment(experiment_number):
    """Train one model with random hyperparameters and log its validation metrics.

    Draws a learning rate plus ReduceLROnPlateau factor/patience at random,
    trains a fresh DiabetesPredictionModel for a fixed number of epochs,
    checkpoints the weights with the lowest validation loss, evaluates that
    checkpoint on the validation set, prints the metrics and appends one row
    to the Google Sheets worksheet.

    Relies on notebook globals defined in earlier cells: X_adasyn_tensor,
    X_val_tensor, y_val, adasyn_loader, val_loader, adasyn_dataset,
    val_dataset and worksheet.

    Args:
        experiment_number: identifier used in the checkpoint file name and as
            the first column of the row appended to the worksheet.
    """
    # Generate random learning rate, factor, and patience values
    lr = random.uniform(1e-5, 1e-3)
    hyp1 = random.uniform(0.05, 0.9)  # ReduceLROnPlateau `factor`
    hyp2 = random.randint(5, 15)      # ReduceLROnPlateau `patience`

    # The checkpoint folder may not exist yet; create it here so torch.save
    # cannot fail on a missing directory.
    checkpoint_path = f'/content/drive/MyDrive/Models/model_{experiment_number}.pt'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    # Instantiate the model
    input_size = X_adasyn_tensor.shape[1]
    model = DiabetesPredictionModel(input_size)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    # `verbose` is deprecated on PyTorch LR schedulers, so it is not passed.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=hyp1, patience=hyp2)

    num_epochs = 5
    best_val_loss = float('inf')
    checkpoint_saved = False
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for inputs, targets in adasyn_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            # Flatten both sides to (batch,): unlike a bare squeeze(), this
            # behaves the same for any batch size and for (N,) or (N, 1) labels.
            loss = criterion(outputs.reshape(-1), targets.reshape(-1))
            loss.backward()

            optimizer.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            train_loss += loss.item() * inputs.size(0)
        train_loss /= len(adasyn_dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs.reshape(-1), targets.reshape(-1))
                val_loss += loss.item() * inputs.size(0)
        val_loss /= len(val_dataset)

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.2f}, Val Loss: {val_loss:.2f}")

    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path))
    else:
        # No epoch improved on +inf (e.g. the validation loss was NaN). Do not
        # load a stale file left by an earlier run with the same number.
        print(f"Warning: no checkpoint was saved for experiment {experiment_number}; "
              "evaluating the final-epoch weights.")

    model.eval()
    with torch.no_grad():
        val_logits = model(X_val_tensor.unsqueeze(1)).reshape(-1)
    # The model emits raw logits (it is trained with BCEWithLogitsLoss), so they
    # must pass through a sigmoid before the 0.5 probability cut-off. Comparing
    # the logits themselves with 0.5 would correspond to a ~0.62 threshold.
    y_pred_val = (torch.sigmoid(val_logits) > 0.5).long().numpy()

    val_accuracy = float(round(accuracy_score(y_val, y_pred_val) * 100, 1))

    # Calculate diabetes prediction accuracy (recall on the positive class).
    # Fixing the label order keeps the matrix 2x2 even if one class is absent.
    cm_val = confusion_matrix(y_val, y_pred_val, labels=[0, 1])
    true_positives = cm_val[1][1]
    actual_positives = cm_val[1][1] + cm_val[1][0]
    if actual_positives:
        diabetes_accuracy_val = float(round(true_positives / actual_positives * 100, 1))
    else:
        # Recall is undefined without positive validation samples.
        diabetes_accuracy_val = float('nan')

    print("Val Accuracy:", val_accuracy)

    print("Val Diabetes Accuracy:", diabetes_accuracy_val)

    # Format lr and accuracies for Google Sheets
    lr_str = "{:.2E}".format(lr)
    val_accuracy_str = str(val_accuracy) + '%'
    diabetes_accuracy_val_str = str(diabetes_accuracy_val) + '%'

    # Add experiment details to Google Sheets
    worksheet.append_row([experiment_number, lr_str, hyp1, hyp2, val_accuracy_str, diabetes_accuracy_val_str])

# Rebuild the loaders so this cell can be re-run on its own.
adasyn_loader = DataLoader(dataset=adasyn_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64)

# Run experiments: max_list_length consecutive experiment numbers, starting at
# the value below.
experiment_number = 42  # change after every run
last_experiment_number = experiment_number + max_list_length
while experiment_number < last_experiment_number:
    run_model_experiment(experiment_number)
    experiment_number += 1


The code defines a function (run_model_experiment) that runs one machine learning experiment with a randomly drawn set of hyperparameters (learning rate, scheduler factor, and scheduler patience). It trains a diabetes prediction model using PyTorch, monitors the validation loss, and saves the best model. The experiment is repeated several times, and the results of each run, including the accuracy metrics, are printed and appended to the Google Sheet named 'CNN Model tracking'.