In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

### Import mamtorch library
To install mamtorch, in mamtorch root folder, "pip install ." (to improve compiling time, do this after installing ninja through "pip install ninja")

In [2]:
import mamtorch as mam

### Select the GPU
Currently, MAM kernels are implemented only for usage on GPU

In [3]:
# Select GPU
gpu_id = 0
# Check if the GPU is available, and if so, use it
device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
# You need a gpu to use MAM kernel! (No cpu-based implementation available)
if(device == "cpu"):
    raise "No GPU device available! MAM kernels are not available."

### Define a simple feedforward DNN containing a MAM layer

In [20]:
# Define a simple feedforward neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, num_classes):
        super(SimpleNN, self).__init__()
        # Instantiate a MAC fc layer
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        
        # Instantiate a MAM fc layer
        self.fc2_mam = mam.nn.FullyConnected(hidden_size1, hidden_size2, bias=True, splits=2, vcon_steps=4, compute_exact=False)
        self.relu2 = nn.ReLU()
        
        # Instantiate the output layer
        self.fc3 = nn.Linear(hidden_size2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2_mam(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x
    
# Hyperparameters
input_size = 28 * 28  # MNIST image size
hidden_size1 = 512
hidden_size2 = 256
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 10

Load MNIST dataset and apply transformations

In [21]:
num_workers = 2 # increase this to use more threads to manage data

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valtest_dataset = datasets.FashionMNIST(root='./data', train=False, transform=transform, download=True)
val_dataset, test_dataset = torch.utils.data.random_split(valtest_dataset, [0.5, 0.5], generator=torch.Generator().manual_seed(42))
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

Initialize the model, loss function, and optimizer

In [22]:
model = SimpleNN(input_size, hidden_size1, hidden_size2, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

### Train the network

In [23]:
# Initialize the selection matrix list.
# Here, for each training epoch, we store the number of times each interconnection has been used
# I.E., the selection count
selection_matrix_list = []

# Training loop
for epoch in range(num_epochs):
    start_time = time.perf_counter()
    print(f"Epoch [{epoch + 1}/{num_epochs}]")

    model.train() # se training mode

    correct = 0
    total = 0
    running_loss = 0
    total_step = len(train_loader)
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, 28 * 28).to(device)  # Flatten the input images
        outputs = model(images)
        loss = criterion(outputs, labels.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # get the predicted class
        _, predicted = torch.max(outputs.data, 1)

        # evaluate the correct values agaist the total evaluated
        correct += (predicted == labels.to(device)).sum().item()
        total += labels.size(0)

        print(f'Training [{i + 1}/{total_step}], Loss: {running_loss/total:.3e}, Acc: {correct/total*100:.1f}%', end='\r')
    print(f'Training [{total_step}/{total_step}], Loss: {running_loss/total:.3e}, Acc: {correct/total*100:.1f}%')
    
    model.eval() # set evaluation mode
    
    correct = 0
    total = 0
    running_loss = 0
    total_step = len(val_loader)
    for images, labels in val_loader:
        images = images.view(-1, 28 * 28).to(device)  # Flatten the input images
        outputs = model(images)
        loss = criterion(outputs, labels.to(device))
        
        running_loss += loss
        
        # get the predicted class
        _, predicted = torch.max(outputs.data, 1)
        
        # evaluate the correct values agaist the total evaluated
        correct += (predicted == labels.to(device)).sum().item()
        total += labels.size(0)

        print(f'Validation [{i + 1}/{total_step}], Loss: {running_loss/total:.3e}, Acc: {correct/total*100:.1f}%', end='\r')
    print(f'Validation [{total_step}/{total_step}], Loss: {running_loss/total:.3e}, Acc: {correct/total*100:.1f}%')
    
    # update the value of beta for vanishing contributes
    model.fc2_mam.vcon_step()
    
    print(f"Elapsed time = {time.perf_counter()-start_time:.3f} s")
    
print("Training end.")

Epoch [1/10]
Training [938/938], Loss: 7.619e-03, Acc: 82.1%
Validation [79/79], Loss: 7.163e-03, Acc: 84.2%%
Elapsed time = 128.738 s
Epoch [2/10]


RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [13]:
# Test the model on the test dataset
model.eval() # set evaluation mode

correct = 0
total = 0
running_loss = 0
total_step = len(val_loader)
for images, labels in test_loader:
    images = images.view(-1, 28 * 28).to(device)  # Flatten the input images
    outputs = model(images)
    loss = criterion(outputs, labels.to(device))
    
    running_loss += loss
    
    # get the predicted class
    _, predicted = torch.max(outputs.data, 1)
    
    # evaluate the correct values agaist the total evaluated
    correct += (predicted == labels.to(device)).sum().item()
    total += labels.size(0)

    print(f'Test [{i + 1}/{total_step}], Loss: {running_loss/total:.3e}, Acc: {correct/total*100:.1f}%', end='\r')
print(f'Test [{total_step}/{total_step}], Loss: {running_loss/total:.3e}, Acc: {correct/total*100:.1f}%')

Test [79/79], Loss: 5.428e-03, Acc: 88.0%%
