In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler,LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [53]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 17 from the UCI ML repository. The metadata saved under this
# cell names it 'Breast Cancer Wisconsin (Diagnostic)': 569 instances, 30 features.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature table and target table. The later `print(y)` output shows y as a
# 569 x 1 frame with a single 'Diagnosis' column, i.e. y is NOT a 1-D Series.
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets


# Print the dataset's `variables` attribute.
# NOTE(review): the output saved under this cell is the dataset metadata dict and the
# execution count is empty, so the cell was probably edited after its last run - re-run to refresh.
print(breast_cancer_wisconsin_diagnostic.variables)

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

In [17]:
# Inspect the raw targets: per the saved output, one 'Diagnosis' column of 'M'/'B' string labels.
print(y)

    Diagnosis
0           M
1           M
2           M
3           M
4           M
..        ...
564         M
565         M
566         M
567         M
568         B

[569 rows x 1 columns]


In [None]:
class MyDataset(Dataset):
    """Map-style dataset pairing tabular feature rows with integer class labels.

    Args:
        X: Two-dimensional feature table (pandas DataFrame or array-like), one
            row per sample. Every value must be convertible to float32.
        y: Class labels, one per row of ``X``. A Series, a 1-D array, a list or
            a single-column DataFrame / ``(n, 1)`` array are all accepted; the
            labels are flattened to 1-D before being encoded.
        transform: Optional callable applied to each feature tensor before it
            is returned by ``__getitem__``.

    Attributes:
        X: The feature table exactly as it was passed in.
        y: 1-D integer array of encoded labels (``label_encoder`` output).
        label_encoder: ``LabelEncoder`` fitted on the labels of this dataset.
        transform: The ``transform`` callable (or ``None``).

    Raises:
        ValueError: If ``X`` and ``y`` do not describe the same number of samples.
    """

    def __init__(self, X, y, transform=None):
        self.X = X
        self.transform = transform

        # Snapshot the features as one float32 array up front so that
        # __getitem__ only slices an array instead of paying for a pandas
        # `.iloc` row lookup on every sample.
        self._features = np.asarray(X, dtype=np.float32)

        # LabelEncoder expects 1-D input; handing it a single-column DataFrame
        # makes scikit-learn emit a DataConversionWarning, so flatten first.
        labels = np.asarray(y).ravel()
        if len(labels) != len(self._features):
            raise ValueError(
                f"X has {len(self._features)} rows but y has {len(labels)} labels"
            )

        self.label_encoder = LabelEncoder()
        self.y = self.label_encoder.fit_transform(labels)

    def __len__(self):
        """Return the number of samples."""
        return len(self._features)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at position ``idx``.

        ``features`` is a fresh float32 tensor (after ``transform``, if one was
        given) and ``label`` is a scalar int64 tensor holding the encoded class.
        """
        # torch.tensor copies, so an in-place transform cannot corrupt the cache.
        X = torch.tensor(self._features[idx], dtype=torch.float32)

        # self.y is the encoder's integer array, so positional indexing is enough.
        y = torch.tensor(self.y[idx], dtype=torch.long)

        if self.transform:
            X = self.transform(X)

        return X, y


class CSVDataModule:
    """DataModule for loading CSV-like tabular data (like UCI datasets).

    Splits a feature table and its labels into train/validation parts,
    standardises the features with statistics from the training part only, and
    exposes both parts as PyTorch ``DataLoader`` objects.

    Args:
        X: Feature table as a pandas DataFrame (its ``columns`` are reused).
        y: Class labels, one per row of ``X`` (Series, 1-D array, or
            single-column DataFrame).
        batch_size: Batch size used by both loaders.
        test_size: Forwarded to ``train_test_split`` as the validation share.
        transform: Optional callable forwarded to every ``MyDataset``.
        random_state: Forwarded to ``train_test_split``. Set an int to make the
            split reproducible; ``None`` (default) keeps it random.
        stratify: When true, the split preserves the class proportions of
            ``y``. Defaults to ``False`` (plain random split).

    Attributes:
        train_data, val_data: Datasets created by ``setup()`` (``None`` before).
        scaler: ``StandardScaler`` fitted on the training features by ``setup()``.
        label_encoder: ``LabelEncoder`` fitted on all labels by ``setup()``;
            shared by both datasets.
    """

    def __init__(self, X, y, batch_size=64, test_size=0.2, transform=None,
                 random_state=None, stratify=False):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.test_size = test_size
        self.transform = transform
        self.random_state = random_state
        self.stratify = stratify
        self.train_data = None
        self.val_data = None
        self.scaler = None
        self.label_encoder = None

    def setup(self):
        """Split, scale and encode the data; populate ``train_data``/``val_data``."""
        # Flatten once (a single-column DataFrame is a common input) and fit ONE
        # encoder on every label, so a class id means the same thing in both splits.
        labels = np.asarray(self.y).ravel()
        self.label_encoder = LabelEncoder().fit(labels)

        X_train, X_val, y_train, y_val = train_test_split(
            self.X,
            labels,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=labels if self.stratify else None,
        )

        # Fit the scaler on the training rows only, so no validation statistics
        # leak into training; keep it so new data can be scaled the same way.
        self.scaler = StandardScaler()
        X_train = pd.DataFrame(
            self.scaler.fit_transform(X_train),
            columns=X_train.columns,
            index=X_train.index,
        )
        X_val = pd.DataFrame(
            self.scaler.transform(X_val),
            columns=X_val.columns,
            index=X_val.index,
        )

        self.train_data = self._build_dataset(X_train, y_train)
        self.val_data = self._build_dataset(X_val, y_val)

    def _build_dataset(self, features, labels):
        """Wrap one split in a ``MyDataset`` that uses the shared label encoding."""
        dataset = MyDataset(features, labels, transform=self.transform)
        # MyDataset fits an encoder on whatever labels it receives. If a split
        # happens to miss a class, that per-split encoder would assign different
        # ids, so overwrite it with the encoder fitted on the full label set.
        dataset.label_encoder = self.label_encoder
        dataset.y = self.label_encoder.transform(labels)
        return dataset

    def train_dataloader(self):
        """Return the shuffled training ``DataLoader``.

        Raises:
            RuntimeError: If ``setup()`` has not been called yet.
        """
        if self.train_data is None:
            raise RuntimeError("Call setup() before requesting the train DataLoader.")
        return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the validation ``DataLoader`` (fixed order).

        Raises:
            RuntimeError: If ``setup()`` has not been called yet.
        """
        if self.val_data is None:
            raise RuntimeError("Call setup() before requesting the validation DataLoader.")
        return DataLoader(self.val_data, batch_size=self.batch_size, shuffle=False)

In [39]:
# Build the data module with its default settings (batch_size=64, test_size=0.2)
# and create the train/validation datasets.
csv_data_module = CSVDataModule(X,y)
csv_data_module.setup()

# Loaders used by the training and evaluation cells further down.
train_loader = csv_data_module.train_dataloader()
val_loader = csv_data_module.val_dataloader()

# Sanity check: print the feature/label batch shapes for one pass over the training loader.
for batch_X, batch_y in train_loader:
    print(batch_X.shape, batch_y.shape)  

torch.Size([64, 30]) torch.Size([64])
torch.Size([64, 30]) torch.Size([64])
torch.Size([64, 30]) torch.Size([64])
torch.Size([64, 30]) torch.Size([64])
torch.Size([64, 30]) torch.Size([64])
torch.Size([64, 30]) torch.Size([64])
torch.Size([64, 30]) torch.Size([64])
torch.Size([7, 30]) torch.Size([7])


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [41]:
# Width of the network's input layer = number of feature columns. Read it from the
# feature table itself instead of from a loop variable left over by an earlier cell,
# so this cell keeps working if that sanity-check loop is edited or removed.
input_dim = X.shape[1]
print(f"Input Dimension: {input_dim}")

Input Dimension: 30


In [63]:
class SimpleModel(nn.Module):
    """Small fully connected classifier: inputDim -> 128 -> 64 -> 2.

    A ReLU follows each of the two hidden layers. ``forward`` returns the raw
    outputs of the final linear layer; no softmax is applied here.
    """

    def __init__(self, inputDim):
        super().__init__()
        first_width, second_width, n_outputs = 128, 64, 2
        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys stay unchanged.
        self.l1 = nn.Linear(inputDim, first_width)
        self.l2 = nn.Linear(first_width, second_width)
        self.output = nn.Linear(second_width, n_outputs)

    def forward(self, x):
        hidden = torch.relu(self.l1(x))
        hidden = torch.relu(self.l2(hidden))
        return self.output(hidden)

In [64]:
# Build the classifier and place its parameters on the selected device.
model = SimpleModel(inputDim=input_dim)
model = model.to(device)

# Cross-entropy over the two output scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def train_model(model, train_loader, criterion, optimizer, epochs=20, device=None):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: ``nn.Module`` to optimise (updated in place).
        train_loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer built over ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to. When ``None`` (default) the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so batches always land where the model lives.

    Returns:
        list[float]: Mean batch loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    # An earlier evaluation may have left the model in eval mode; switch back so
    # layers such as dropout / batch norm behave correctly while training.
    model.train()

    loss_history = []
    # loop over the dataset multiple times
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for batch_X, batch_y in train_loader:
            # Move data to the device the model lives on
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Count batches ourselves: it works for loaders without __len__ and
        # turns the empty case into a clear error instead of a ZeroDivisionError.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on.")

        # Print average loss per epoch
        avg_loss = running_loss / num_batches
        loss_history.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

    return loss_history


In [65]:
# Fit the classifier on the training batches for 100 passes over the data.
train_model(
    model,
    train_loader,
    criterion,
    optimizer,
    epochs=100,
)

Epoch [1/100], Loss: 0.5735
Epoch [2/100], Loss: 0.3471
Epoch [3/100], Loss: 0.2325
Epoch [4/100], Loss: 0.1277
Epoch [5/100], Loss: 0.0903
Epoch [6/100], Loss: 0.0700
Epoch [7/100], Loss: 0.0667
Epoch [8/100], Loss: 0.0528
Epoch [9/100], Loss: 0.0503
Epoch [10/100], Loss: 0.0458
Epoch [11/100], Loss: 0.0765
Epoch [12/100], Loss: 0.0656
Epoch [13/100], Loss: 0.0375
Epoch [14/100], Loss: 0.0383
Epoch [15/100], Loss: 0.0523
Epoch [16/100], Loss: 0.0347
Epoch [17/100], Loss: 0.0276
Epoch [18/100], Loss: 0.0261
Epoch [19/100], Loss: 0.0240
Epoch [20/100], Loss: 0.0226
Epoch [21/100], Loss: 0.0220
Epoch [22/100], Loss: 0.0205
Epoch [23/100], Loss: 0.0336
Epoch [24/100], Loss: 0.0188
Epoch [25/100], Loss: 0.0202
Epoch [26/100], Loss: 0.0167
Epoch [27/100], Loss: 0.0148
Epoch [28/100], Loss: 0.0143
Epoch [29/100], Loss: 0.0132
Epoch [30/100], Loss: 0.0125
Epoch [31/100], Loss: 0.0110
Epoch [32/100], Loss: 0.0103
Epoch [33/100], Loss: 0.0103
Epoch [34/100], Loss: 0.0089
Epoch [35/100], Loss: 0

In [70]:
def evaluate_model(model, test_loader, device):
    """Compute and print the classification accuracy of ``model`` on ``test_loader``.

    The model is switched to evaluation mode and is left in that mode afterwards.

    Args:
        model: ``nn.Module`` whose output has one score per class along dim 1.
        test_loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Accuracy in percent (0-100).

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # No gradients needed for evaluation
        for batch_X, batch_y in test_loader:
            # Move inputs and labels to the correct device
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            # Predicted class = index of the largest score in each row
            predicted = model(batch_X).argmax(dim=1)

            # Update total and correct counts
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    # Accuracy is undefined without samples; fail clearly instead of dividing by zero.
    if total == 0:
        raise ValueError("test_loader yielded no samples; accuracy is undefined.")

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy


In [72]:
# Score the trained network on the held-out validation split.
evaluate_model(
    model,
    val_loader,
    device,
)

Test Accuracy: 98.25%
