In [1]:
from data_loader import *
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as func
import torch.nn.init as torch_init
import torch.optim as optim

class LSTM(nn.Module):
    """ A basic LSTM model. 
    """
    
    def __init__(self, in_dim, out_dim, hid_dim, batch_size, no_layers =1):
        super(LSTM, self).__init__()
        #specify the input dimensions
        self.in_dim = in_dim
        #specify the output dimensions
        self.out_dim = out_dim
        #specify hidden layer dimensions
        self.hid_dim = hid_dim
        #specify the number of layers
        self.no_layers = no_layers  
        #self.batch_size=batch_size
        
        #initialise the LSTM
        self.model = nn.LSTM(self.in_dim, self.hid_dim, self.no_layers)
        self.outputs = nn.Linear(self.hid_dim, out_dim)

    def forward(self, batch,hidden=None):
        """Pass the batch of images through each layer of the network, applying 
        """
        lstm_out, hidden = self.model(batch, hidden)
        y_pred = self.outputs(lstm_out)
        #The input is expected to contain raw, unnormalized scores for each class according to documentation
        #tag_scores = func.softmax(y_pred,dim=2)
        #return tag_scores,hidden
        return y_pred,hidden

In [2]:
# Setup: initialize the hyperparameters/variables
num_epochs = 5           # Number of full passes through the dataset
batch_size = 16          # Number of samples in each minibatch
learning_rate = 0.001  
#use_cuda=0
use_cuda=torch.cuda.is_available()

# Setup GPU optimization if CUDA is supported
if use_cuda:
    computing_device = torch.device("cuda")
    extras = {"num_workers": 1, "pin_memory": True}
    print("CUDA is supported")
else: # Otherwise, train on the CPU
    computing_device = torch.device("cpu")
    extras = False
    print("CUDA NOT supported")

# Setup the training, validation, and testing dataloaders
train_loader, val_loader, test_loader = create_split_loaders(batch_size,shuffle=False, show_sample=False,extras=extras)

# Instantiate a BasicCNN to run on the GPU or CPU based on CUDA support
model = LSTM(in_dim=94, out_dim=94,hid_dim=100,batch_size=16,no_layers=1)
model = model.to(computing_device)
print("Model on CUDA?", next(model.parameters()).is_cuda)

#TODO: Define the loss criterion and instantiate the gradient descent optimizer
criterion = torch.nn.CrossEntropyLoss() #TODO - loss criteria are defined in the torch.nn package

#TODO: Instantiate the gradient descent optimizer - use Adam optimizer with default parameters
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

CUDA is supported
[27 81 82 ... 67 76 66]
[81 82 63 ... 76 66 29]
[27 81 82 ... 67 76 66]
[81 82 63 ... 76 66 29]
[27 81 82 ... 67 76 66]
[81 82 63 ... 76 66 29]
Model on CUDA? True


In [None]:
# Track the loss across training
total_loss = []
avg_minibatch_loss = []
# In piazza @484 they said hidden state should be continues through batches.
#initialise hidden layers

hidden=None

# Begin training procedure
for epoch in range(num_epochs):

    N = 50
    N_minibatch_loss = 0.0   
    

    # Get the next minibatch of images, labels for training
    for minibatch_count, (images, labels) in enumerate(train_loader, 0):
        if images.size()!=torch.Size([16, 100, 94]):
            temp=16-images.size()[0]
            a=np.array([93])
            b=np.squeeze(np.eye(94)[a.reshape(-1)])
            c=np.tile(b,(temp,100,1))
            images=torch.cat((torch.tensor(c).float(),images.float()))
            labels=torch.cat((torch.tensor(np.full((temp,100), [93])).float(),labels.float()))
            labels=labels.long()
        images=images.permute(1,0,2)
        
        # Put the minibatch data in CUDA Tensors and run on the GPU if supported
        images, labels = images.to(computing_device), labels.to(computing_device)
        images=images.float()
        labels=labels
        
        # Zero out the stored gradient (buffer) from the previous iteration
        optimizer.zero_grad()
        

        # Perform the forward pass through the network and compute the loss
        outputs,hidden = model(images,hidden)
        outputs=outputs.permute(1,0,2)
        
#         outputs.shape => batchSize * sequenceSize *dictionarySize
#         labels.shape => batchSize * sequenceSize
        loss = criterion(outputs.contiguous().view(outputs.shape[0]*100, 94),labels.contiguous().view(-1))
        
        # Automagically compute the gradients and backpropagate the loss through the network
        loss.backward(retain_graph=True)

        # Update the weights
        optimizer.step()

        # Add this iteration's loss to the total_loss
        total_loss.append(loss.item())
        N_minibatch_loss += loss
        
        #TODO: Implement cross-validation
        
        if minibatch_count % N == 0:    
            
            # Print the loss averaged over the last N mini-batches    
            N_minibatch_loss /= N
            print('Epoch %d, average minibatch %d loss: %.3f' %
                (epoch + 1, minibatch_count, N_minibatch_loss))
            
            # Add the averaged loss over N minibatches and reset the counter
            avg_minibatch_loss.append(N_minibatch_loss)
            N_minibatch_loss = 0.0
        
    print("Finished", epoch + 1, "epochs of training")
print("Training complete after", epoch, "epochs")

Epoch 1, average minibatch 0 loss: 0.076
Epoch 1, average minibatch 50 loss: 3.447
Epoch 1, average minibatch 100 loss: 3.515
Epoch 1, average minibatch 150 loss: 3.517
Epoch 1, average minibatch 200 loss: 3.412
Finished 1 epochs of training
Epoch 2, average minibatch 0 loss: 0.074
