<a href="https://colab.research.google.com/github/TaykhoomDalal/ECE_C147_Project/blob/FinalCode/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer for EEG

---
## Imports

In [None]:
!pip install torch numpy matplotlib



In [None]:
import torch
import torch.nn as nn

import random
import math
import numpy as np
import matplotlib.pyplot as plt

# Transformer encoder from scratch (model opt2)

## utils

In [None]:
import torch, os, time, math, tqdm, random, sys, gzip

import torch.nn.functional as F
import torch.distributions as dist

from torch.utils.tensorboard import SummaryWriter

import numpy as np

def enwik8(path=None, n_train=int(90e6), n_valid=int(5e6), n_test=int(5e6)):
    """
    Load the enwik8 dataset from the Hutter challenge.

    Adapted from https://github.com/openai/blocksparse/blob/master/examples/transformer/enwik8.py

    :param path: Path of the raw or gzipped (``.gz``) data file. When None, ``here('data/enwik8.gz')`` is used.
    :param n_train: Number of leading bytes that form the training split.
    :param n_valid: Number of bytes, directly after the training split, that form the validation split.
    :param n_test: Number of bytes read after the validation split. Everything that remains after the
        first two splits ends up in the test tensor.
    :return: A triple (train, valid, test) of 1-D uint8 torch tensors.
    """
    if path is None:
        path = here('data/enwik8.gz')

    # Always read bytes: a text-mode handle returns `str`, which cannot be reinterpreted as uint8 values.
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rb') as file:
        raw = file.read(n_train + n_valid + n_test)

    # np.fromstring is deprecated for binary input; np.frombuffer is its replacement. frombuffer returns a
    # read-only view, so copy it to hand torch a writable array.
    X = np.frombuffer(raw, dtype=np.uint8).copy()
    trX, vaX, teX = np.split(X, [n_train, n_train + n_valid])
    return torch.from_numpy(trX), torch.from_numpy(vaX), torch.from_numpy(teX)

def sample(lnprobs, temperature=1.0):
    """
    Draw one index from a categorical distribution.

    :param lnprobs: Outcome log-probabilities (or unnormalised scores); the softmax is taken over dim 0.
    :param temperature: Sampling temperature. The scores are divided by it before the softmax, so 1.0
        follows the given distribution; exactly 0.0 skips sampling and returns the arg-max.
    :return: A tensor holding the index of the chosen element.
    """
    greedy = temperature == 0.0

    if not greedy:
        # Temper the scores, renormalise, and draw from the resulting categorical distribution.
        tempered = F.softmax(lnprobs / temperature, dim=0)
        return dist.Categorical(tempered).sample()

    return lnprobs.argmax()

def sample_sequence(model, seed, max_context, length=600, temperature=0.5, verbose=False):
    """
    Sequentially samples a sequence from the model, token by token.

    :param model: Callable that maps a (1, t) batch of tokens to a (1, t, vocab) tensor of scores; only
        the scores at the last position are used for sampling.
    :param seed: 1-D tensor of tokens to start with. It is cloned, not modified.
    :param max_context: Maximum number of trailing tokens that are fed to the model at each step.
    :param length: The total number of characters to sample.
    :param temperature: The sampling temperature.
    :param verbose: If true, the sampled sequence is also printed as it is sampled.

    :return: The sampled sequence, including the seed.
    """

    sequence = seed.detach().clone()

    if verbose: # Print the seed, surrounded by square brackets
        print('[', end='', flush=True)
        for c in seed:
            print(str(chr(c)), end='', flush=True)
        print(']', end='', flush=True)

    for _ in range(length):

        # Input is the tail end of the sampled sequence (as many tokens as the model can handle)
        context_window = sequence[-max_context:]

        # Run the current input through the model
        output = model(context_window[None, :])

        # Sample the next token from the probabilities at the last position of the output.
        c = sample(output[0, -1, :], temperature)

        if verbose:
            # Anything below the space character is printed as a space to keep the output on one line.
            print(str(chr(max(32, c))), end='', flush=True)

        sequence = torch.cat([sequence, c[None]], dim=0) # Append the sampled token to the sequence

    print()
    # Return the extended sequence; returning `seed` here would throw away everything that was sampled.
    return sequence

def sample_batch(data, length, batch_size):
    """
    Cut a random batch of (input, target) windows out of one long token sequence.

    Each target window is its input window moved one position to the right, so at every position the
    model is asked to predict the next token.

    :param data: The (training) data. A single vector of tokens represented by integers.
    :param length: The length of each window.
    :param batch_size: The number of windows in the batch.
    :return: A pair (input, target) of long tensors, each of shape (batch_size, length).
    """
    # Random window starts. The upper bound leaves room for the one-step-shifted target window.
    offsets = torch.randint(size=(batch_size,), low=0, high=data.size(0) - length - 1)

    input_rows = [data[o:o + length] for o in offsets]
    target_rows = [data[o + 1:o + length + 1] for o in offsets]

    # Stacking adds the batch dimension in front of the equally long rows.
    inputs = torch.stack(input_rows, dim=0).to(torch.long)
    target = torch.stack(target_rows, dim=0).to(torch.long)

    return inputs, target

def mask_(matrices, maskval=0.0, mask_diagonal=True):
    """
    Overwrite the upper triangle of every matrix in a batch, in place.

    Entries at (i, j) with i <= j are overwritten; when `mask_diagonal` is false the diagonal is kept
    and only entries with i < j are overwritten.

    :param matrices: Tensor whose last two dimensions are the matrix dimensions. Modified in place.
    :param maskval: Value written into the masked entries.
    :param mask_diagonal: Whether the diagonal itself is masked as well.
    :return: None
    """
    rows, cols = matrices.size(-2), matrices.size(-1)

    # Offset 0 starts the triangle on the diagonal, offset 1 starts it just above the diagonal.
    diag_offset = 1
    if mask_diagonal:
        diag_offset = 0

    upper = torch.triu_indices(rows, cols, offset=diag_offset)
    row_idx, col_idx = upper[0], upper[1]
    matrices[..., row_idx, col_idx] = maskval

def d(tensor=None):
    """
    Return a device string: 'cuda' or 'cpu'.

    :param tensor: Optional tensor. When given, the result describes where this tensor lives; when
        omitted, the result is 'cuda' if CUDA is available and 'cpu' otherwise.
    :return: The string 'cuda' or 'cpu'.
    """
    on_gpu = torch.cuda.is_available() if tensor is None else tensor.is_cuda
    return 'cuda' if on_gpu else 'cpu'

def here(subpath=None):
    """
    Resolve a path relative to the directory two levels above this file.

    :param subpath: Optional path that is appended to that directory.
    :return: The absolute path of the directory two levels above this file, or of `subpath` inside it.
    """
    base = os.path.join(os.path.dirname(__file__), '../..')
    target = base if subpath is None else os.path.join(base, subpath)
    return os.path.abspath(target)

def contains_nan(tensor):
    """Return True when at least one element of `tensor` is NaN."""
    # NaN is the only value that compares unequal to itself.
    nan_mask = tensor != tensor
    return bool(nan_mask.any())


# Stack of start times: tic() pushes one, toc() pops the most recent one.
tics = []


def tic():
    """Start a timer by recording the current time."""
    now = time.time()
    tics.append(now)

def toc():
    """
    Stop the most recently started timer.

    :return: Seconds elapsed since the matching tic(), or None when no timer is running.
    """
    if not tics:
        return None

    now = time.time()
    return now - tics.pop()

def slice_diag(matrix, l, dv=None):
    """
    Slice length-`l` windows out of a batch of matrices of width 2*l - 1.

    Each flattened matrix is padded with `l` zeros and re-read with a row length of 2*l, which makes
    every row start one column later than the row above it. The first `l` columns of that view are
    returned.

    :param matrix: Tensor whose last two dimensions are (h, 2*l - 1). The size assertions below only
        hold when h == l.
    :param l: Window length.
    :param dv: Device on which the zero padding is created. Defaults to the device of `matrix`.
    :return: Tensor with the leading dimensions of `matrix` followed by (h, l).
    """
    dv = d(matrix) if dv is None else dv

    h, w = matrix.size(-2), matrix.size(-1)

    assert w == 2 * l -1, f'(h, w)= {(h, w)}, l={l}'

    lead_dims = matrix.size()[:-2]

    # Collapse all leading dimensions into one batch dimension.
    flat = matrix.view(-1, h, w)
    b = flat.size(0)

    padding = torch.zeros(b, l, device=dv)
    result = torch.cat([flat.view(b, -1), padding], dim=1)
    assert result.size() == (b, 2 * l * l), f'result.size() {result.size()}'

    # Re-reading with one extra column per row shifts each row by one position.
    result = result.view(b, l, 2*l)[:, :, :l]

    return result.view(*lead_dims, h, l)

# Conversion factors between nats (natural logarithm) and bits (base-2 logarithm):
# multiply a value in nats by LOG2E, or divide it by LOGE2, to obtain the value in bits.
LOG2E = math.log2(math.e)
LOGE2 = math.log(2.0)

def compute_compression(model, data, context, batch_size, verbose=False,
                        tbw:SummaryWriter=None, tok=None, skip=0):


    """
    Compute the _compression_ of a dataset under a model. That is, given a model, in how many bits could we represent
    the dataset. This requires us to turn a given probability distribution into a code for the outcomes.

    See [this video](https://youtu.be/mSneVjDvzNQ) for an explanation.

    :param model: A sequence-to-sequence model that takes as input a (sub) sequence of integers and produces a probability
        distribution on the output. Its output must have shape (batch, context, ...); an output that is not a plain
        torch.Tensor is expected to expose `.logits`, which are log-softmaxed over dim 2.
    :param data: A single 1-D tensor of integers representing the data.
    :param context: Number of preceding tokens the model sees for each prediction.
    :param batch_size: Number of instances that are buffered before the model is run.
    :param verbose: If true, a tqdm progress bar is shown.
    :param tbw: Optional tensorboard writer. When given, the bits spent on every token are logged to it.
    :param tok: Optional tokenizer with a `decode` method. Together with `tbw` it is used to log bits per
        decoded character as well.
    :param skip: Index of the first token to predict; earlier tokens only serve as context.
    :return: The total number of bits used for the predicted tokens (a python float once at least one batch was
        processed). Note that this is a total, not a per-byte average.
    """

    # NOTE(review): `tot` is never used below.
    bits, tot = 0.0, 0
    batch = []
    # Buffer, every time it fills up, we run it through the model
    # --- For the sake of speed we want to process the data in batches. For each token in the data, we make a
    #     prediction based on all the `context` tokens before it. This means that for each subsequence in the batch, we
    #     need to shift the start/end indices ahead by one token.
    #
    #     After we pass the batch through the model, we look at only the probabilities predicted for the target token.

    target_indices = []
    # Running counters used as the step axis for tensorboard: tokens logged and decoded characters logged.
    i, ic = 0, 0

    for current in tqdm.trange(skip, data.size(0)) if verbose else range(skip, data.size(0)):

        # `current` is the character which we will ultimately predict

        fr = max(0, current - context)
        to = current + 1

        instance = data[fr:to].to(torch.long) # the subsequence of the data to add to the batch
        # -- slice out an instance of size context + 1 (or shorter at the start of the data)

        # if tok is not None:
        #     print(instance[:-1], tok.decode(instance[:-1]))
        #     print(instance[-1:], tok.decode(instance[-1:]))

        target_indices.append(instance.size(0) - 2) # index of the last element of the input to the model
        # NOTE(review): for current == 0 the instance has a single element, so this index is -1 and the
        #               prediction is read from the last (padded) output position -- confirm this is intended.

        if instance.size(0) < context + 1:
            assert skip < context # We shouldn't get here if we skip the first `context` characters

            # The instance is shorter than context + 1, so zeros are appended until it has that length.

            pad = torch.zeros(size=(context + 1 - instance.size(0),), dtype=torch.long)
            instance = torch.cat([instance, pad], dim=0)
            # -- the first tokens don't have enough tokens preceding them, so we pad them to the right size.

            assert instance.size(0) == context + 1 # all instances should be `context` + 1 long

        # NOTE(review): the instance is moved to the GPU whenever CUDA is available, independent of where
        #               the model lives.
        if torch.cuda.is_available():
            instance = instance.cuda()

        batch.append(instance[None, :])
        # -- We add a singleton dimension to concatenate along later.

        if len(batch) == batch_size or current == data.size(0) - 1:
            # batch is full or we are at the last instance, run it through the model

            b = len(batch)

            # Position of each target token inside its (padded) instance: one past the last input token.
            ti = torch.tensor(target_indices) + 1
            # NOTE(review): `all` shadows the builtin of the same name inside this function.
            all = torch.cat(batch, dim=0)
            inputs = all[:, :-1] # input
            target = all[torch.arange(b), ti]  # target values

            with torch.no_grad():
                if next(model.parameters()).is_cuda:
                    inputs = inputs.cuda()
                output = model(inputs)

            if type(output) != torch.Tensor:
                output = torch.log_softmax(output.logits, dim=2) # To make the method work for GPT2 models from Huggingface

            assert output.size()[:2] == (b, context), f'was: {output.size()}, should be {(b, context, -1)}'

            # For every instance, pick the log-probability that the output at the last real input position
            # assigns to the target token.
            lnprobs = output[torch.arange(b, device=d()), target_indices, target]
            log2probs = lnprobs / LOGE2
            # -- The model produces natural logarithms of probabilities, but we need base-2 logarithms of the
            #    probabilities, since these give us bits.

            if tbw is not None:
                for j, lp in enumerate(log2probs):
                    i += 1
                    tbw.add_scalar('compression/bits-per-token', -lp, i)

                    if tok is not None:
                        # Number of characters this token decodes to; used to express the cost per character.
                        nc = len(tok.decode(target[j]))
                        ic += nc
                        tbw.add_scalar('compression/bits-per-byte', -lp/nc, ic)

            bits += - log2probs.sum() # Add the bits for each character (the negative log_2 probabilities) to the running total
            batch, target_indices = [], []  # clear the buffer

    # `bits` is still the python float 0.0 when no batch was processed, and a tensor otherwise.
    if isinstance(bits, torch.Tensor):
        bits = bits.item()

    return bits # total nr of bits used

def estimate_compression(model, data, nsamples, context, batch_size, verbose=False):
    """
    Estimates the compression by sampling random subsequences instead of predicting all characters.

    NB: This doesn't work for GPT-2 style models with super-character tokenization, since the tokens and number of
    characters are mismatched.

    :param model: A sequence-to-sequence model that takes as input a (sub) sequence of integers and produces a probability
        distribution on the output. Its output must have shape (batch, context, ...); an output that is not a
        torch.Tensor is expected to expose `.logits`, which are log-softmaxed over dim 2.
    :param data: A single 1-D tensor of integers representing the data.
    :param nsamples: Number of distinct target positions to sample from `data`.
    :param context: Number of preceding tokens the model sees for each prediction.
    :param batch_size: Number of instances that are buffered before the model is run.
    :param verbose: If true, a tqdm progress bar is shown.
    :return: The total number of bits (a python float) spent on the `nsamples` sampled tokens. Divide by
        `nsamples` to obtain the average number of bits per token.
    """

    # indices of target characters in the data
    gtargets = random.sample(range(data.size(0)), k=nsamples)

    # Follow the model's device instead of moving data to the GPU whenever CUDA happens to be available.
    model_device = next(model.parameters()).device

    # Same value as the module-level LOG2E: multiplying nats by it gives bits.
    nats_to_bits = math.log2(math.e)

    bits = 0.0

    # Buffer, every time it fills up, we run it through the model
    # --- For the sake of speed we want to process the data in batches. For each sampled token, we make a
    #     prediction based on the (at most) `context` tokens before it.
    batch, target_indices = [], []

    # Iterate over the sampled positions themselves (range() over a list raises a TypeError).
    iterator = tqdm.tqdm(gtargets) if verbose else gtargets
    for step, current in enumerate(iterator, start=1):
        # current is the character to be predicted

        fr = max(0, current - context)
        instance = data[fr:current + 1].to(torch.long) # the subsequence of the data to add to the batch
        # -- slice out an instance of size context + 1 (or shorter at the start of the data)

        target_indices.append(instance.size(0) - 2) # index of the last element of the context
        # NOTE(review): for current == 0 this index is -1, so the prediction is read from the last (padded)
        #               output position; this mirrors compute_compression -- confirm it is intended.

        if instance.size(0) < context + 1:
            # -- the first tokens don't have enough tokens preceding them, so we pad them to the right size.
            pad = torch.zeros(size=(context + 1 - instance.size(0),), dtype=torch.long, device=instance.device)
            instance = torch.cat([instance, pad], dim=0)

            assert instance.size(0) == context + 1 # all instances should be `context` + 1 long

        batch.append(instance[None, :])
        # -- We add a singleton dimension to concatenate along later.

        # Flush when the buffer is full or after the last *sample*. The sampled positions are in random order,
        # so comparing `current` with the last data index would drop a trailing partial batch.
        if len(batch) == batch_size or step == len(gtargets):

            b = len(batch)

            stacked = torch.cat(batch, dim=0).to(model_device)
            ti = torch.tensor(target_indices, device=model_device)

            inputs = stacked[:, :-1] # input
            # The target sits one position after the last context token. For padded instances that is not
            # the last column, which only holds padding.
            target = stacked[torch.arange(b, device=model_device), ti + 1]

            with torch.no_grad():
                output = model(inputs)

            if not isinstance(output, torch.Tensor):
                output = torch.log_softmax(output.logits, dim=2) # To make the method work for GPT2 models from Huggingface

            assert output.size()[:2] == (b, context), f'was: {output.size()}, should be {(b, context, -1)}'

            rows = torch.arange(b, device=output.device)
            lnprobs = output[rows, ti.to(output.device), target.to(output.device)]
            # -- The model produces natural logarithms of probabilities, but we need base-2 logarithms of the
            #    probabilities, since these give us bits.

            bits += -lnprobs.sum().item() * nats_to_bits # Add the bits for each character to the running total
            batch, target_indices = [], []  # clear the buffer

    return bits # total nr of bits used

## Preprocessing / data augmentation

In [None]:
def normalize(x):
    """
    Min-max normalise every channel of a batch of trials to the range [0, 1].

    The minimum and maximum of each channel are taken over all trials and all time bins (axes -3 and -1),
    so axis -2 is treated as the channel axis. Any number of channels is supported.

    :param x: Array of shape (trials, channels, time bins).
    :return: Array of the same shape holding (x - min) / (max - min) per channel. The result is always
        floating point. A channel that is constant yields non-finite values, because its range is zero.
    """
    x = np.asarray(x)

    # keepdims leaves the statistics with shape (1, channels, 1), which broadcasts against every trial.
    channel_min = np.min(x, axis=(-1, -3), keepdims=True)
    channel_max = np.max(x, axis=(-1, -3), keepdims=True)

    return (x - channel_min) / (channel_max - channel_min)

def standardize(x):
    """
    Standardise every channel of a batch of trials to zero mean and unit standard deviation.

    The mean and standard deviation of each channel are taken over all trials and all time bins
    (axes -3 and -1), so axis -2 is treated as the channel axis. Any number of channels is supported.

    :param x: Array of shape (trials, channels, time bins).
    :return: Array of the same shape holding (x - mean) / std per channel. The result is always floating
        point. A channel that is constant yields non-finite values, because its standard deviation is zero.
    """
    x = np.asarray(x)

    # keepdims leaves the statistics with shape (1, channels, 1), which broadcasts against every trial.
    channel_mean = np.mean(x, axis=(-1, -3), keepdims=True)
    channel_std = np.std(x, axis=(-1, -3), keepdims=True)

    return (x - channel_mean) / channel_std

class FuncList:
  """An ordered pipeline of single-argument callables."""

  def __init__(self, funcs):
    """
    :param funcs: a list of funcs. The list is stored as is (not copied), so `append` also
        changes the list that was passed in.
    """
    self.funcs = funcs

  def apply(self, x):
    """Feed `x` through every function in order and return the final result."""
    result = x
    for step in self.funcs:
      result = step(result)
    return result

  def append(self, f):
    """Add `f` as the last step of the pipeline."""
    self.funcs.append(f)


def get_lr(optimizer):
    """Return the 'lr' entry of the optimizer's first parameter group, or None when it has no groups."""
    return next((group['lr'] for group in optimizer.param_groups), None)


def trim_time(x, start_time, end_time) -> np.ndarray:
    '''
    Keep only the entries from `start_time` up to (not including) `end_time` along axis 1
    of a 3-D array; the other two axes are left untouched.
    '''
    window = slice(start_time, end_time)
    return x[:, window, :]


def subsample(X, sub=5):
  '''
  Subsamples by averaging every adjacent *sub* samples along axis 1.
  Resulting length along axis 1 is the original length / sub.

  :param X: 3-D array; axis 1 is the axis that is subsampled.
  :param sub: Number of adjacent samples that are averaged into one. Must divide X.shape[1].
  :return: float64 array of shape (X.shape[0], X.shape[1] // sub, X.shape[2]).
  :raises ValueError: when *sub* does not divide the length of axis 1.
  '''
  X = np.asarray(X)

  time_length = X.shape[1]
  if time_length % sub != 0:
    raise ValueError('Pick a sub that cleanly divises')

  # Split axis 1 into (windows, sub) and average over the window axis in one vectorised step.
  windows = X.reshape(X.shape[0], time_length // sub, sub, X.shape[2])
  return (windows.sum(axis=2) / sub).astype(np.float64)


def duplicate_x(X, dup=2):
  '''
  Repeat every entry along axis 1 *dup* times, directly after it appears.
  Resulting length along axis 1 is the original length * dup.

  :param X: 3-D array; axis 1 is the axis that is stretched.
  :param dup: Number of copies of every entry in the output.
  :return: float64 array of shape (X.shape[0], X.shape[1] * dup, X.shape[2]).
  '''
  # np.repeat handles every datapoint. A hand-written write cursor that is not reset per datapoint
  # leaves all datapoints after the first one filled with zeros.
  return np.repeat(np.asarray(X), dup, axis=1).astype(np.float64)

## self attention

In [None]:
#Self Attention

import torch
from torch import nn
import torch.nn.functional as F

import random, math, sys

class SelfAttention(nn.Module):
  """
  Multi-head scaled dot-product self-attention.

  Keys, queries and values are computed on the whole embedding vector and then cut into `heads`
  chunks of size emb // heads, one chunk per attention head.
  """

  def __init__(self, emb, heads=8, mask=False):
    """
    :param emb: Embedding dimension of the input and the output. Must be divisible by `heads`.
    :param heads: Number of attention heads.
    :param mask: If true, every position can only attend to itself and to earlier positions.
    """
    super().__init__()

    assert emb % heads == 0, f'Embedding dimension ({emb}) should be divisible by nr. of heads ({heads})'

    self.emb, self.heads, self.mask = emb, heads, mask

    # Projections of the input to keys, queries and values (no bias), and the output projection
    # that recombines the heads.
    self.tokeys    = nn.Linear(emb, emb, bias=False)
    self.toqueries = nn.Linear(emb, emb, bias=False)
    self.tovalues  = nn.Linear(emb, emb, bias=False)

    self.unifyheads = nn.Linear(emb, emb)

  def forward(self, x):
    """
    :param x: Tensor of shape (batch, time, emb).
    :return: Tensor of shape (batch, time, emb).
    """
    b, t, e = x.size()
    h = self.heads
    assert e == self.emb, f'Input embedding dim ({e}) should match layer embedding dim ({self.emb})'

    s = e // h

    def fold_heads(projected):
      # (b, t, e) -> (b, t, h, s) -> (b, h, t, s) -> (b * h, t, s): heads become part of the batch.
      return projected.view(b, t, h, s).transpose(1, 2).contiguous().view(b * h, t, s)

    # Scaling queries and keys by e^(1/4) each is the same as dividing their dot product by sqrt(e).
    scale = e ** (1/4)
    keys    = fold_heads(self.tokeys(x)) / scale
    queries = fold_heads(self.toqueries(x)) / scale
    values  = fold_heads(self.tovalues(x))

    # Raw attention scores between all pairs of positions.
    dot = torch.bmm(queries, keys.transpose(1, 2))

    assert dot.size() == (b*h, t, t)

    if self.mask:
      # Block attention to later positions; the diagonal stays, so a position can attend to itself.
      mask_(dot, maskval=float('-inf'), mask_diagonal=False)

    # Row-wise attention probabilities.
    dot = F.softmax(dot, dim=2)

    # Weighted sum of the values, then unfold the heads: (b * h, t, s) -> (b, t, h * s).
    out = torch.bmm(dot, values).view(b, h, t, s)
    out = out.transpose(1, 2).contiguous().view(b, t, s * h)

    return self.unifyheads(out)


## Transformer block

In [None]:
class TransformerBlock(nn.Module):
  """
  One transformer encoder block: self-attention and a feed-forward network, each followed by a
  residual connection and layer normalisation.
  """

  def __init__(self, k, heads):
    """
    :param k: Embedding dimension of the input and the output.
    :param heads: Number of attention heads.
    """
    super().__init__()

    self.attention = SelfAttention(k, heads=heads)

    self.norm1 = nn.LayerNorm(k)
    self.norm2 = nn.LayerNorm(k)
    self.drop = nn.Dropout(p=0.1)

    # Feed-forward network with a hidden layer four times as wide as the embedding.
    hidden = k * 4
    self.ff = nn.Sequential(
      nn.Linear(k, hidden),
      nn.ReLU(),
      nn.Dropout(p=0.1),
      nn.Linear(hidden, k))

  def forward(self, x):
    """Apply the block to `x`; the output has the same shape as the input."""
    # Attention sub-layer with residual connection.
    x = self.norm1(self.attention(x) + x)

    # Feed-forward sub-layer with dropout and residual connection.
    return self.norm2(self.drop(self.ff(x)) + x)








class BasicBlockCNN(nn.Module):
  """
  Residual 1-D convolution block:
  conv -> batch norm -> relu -> dropout -> conv -> batch norm -> add input -> relu.
  The number of channels and the sequence length are unchanged (kernel size 3, padding 1).
  """

  def __init__(self, channels):
    """
    :param channels: the number of channels for this basic block
    """
    super().__init__()
    self.conv1 = nn.Conv1d(channels, channels, 3, padding=1)
    self.bn1 = nn.BatchNorm1d(channels)
    self.conv2 = nn.Conv1d(channels, channels, 3, padding=1)
    self.bn2 = nn.BatchNorm1d(channels)
    self.drop = nn.Dropout(p=0.1)

  def forward(self, x):
    """Apply the block to `x`; the output has the same shape as the input."""
    hidden = F.relu(self.bn1(self.conv1(x)))
    hidden = self.drop(hidden)

    # The skip connection is added before the final activation.
    residual = self.bn2(self.conv2(hidden)) + x
    return F.relu(residual)







class DeepCNNBase(nn.Module):
    """
    Four conv / pool blocks followed by a small transformer encoder and a linear classifier.

    Shapes for an input of 1000 time steps and 22 channels:
    (B, 1000, 22) -> conv blocks -> (B, 200, 12) -> transformer blocks -> flatten (B, 2400) -> (B, 4).
    Each max-pool divides the time axis by three (1000 -> 333 -> 111 -> 37 -> 12), so the 200 conv
    channels act as the transformer sequence and the 12 remaining time steps as its embedding.
    """

    def __init__(self):
        super(DeepCNNBase, self).__init__()

        # transformer hyper-parameters
        t_depth = 2
        t_heads = 1
        t_dim = 12            # embedding size = time steps left after the four pooling stages
        t_seq_length = 1000   # number of position embeddings; must be >= the transformer sequence length
        conv_out_channels = 200

        self.conv1 = nn.Conv2d(in_channels = 22, out_channels = 25,kernel_size=(10,1),padding = 'same')
        self.maxpool1 = nn.MaxPool2d(kernel_size=(3,1),stride = 3)
        self.conv2 = nn.Conv2d(in_channels = 25,out_channels = 50,kernel_size = (10,1),padding = 'same')
        self.conv3 = nn.Conv2d(in_channels = 50,out_channels = 100,kernel_size = (10,1),padding = 'same')
        self.conv4 = nn.Conv2d(in_channels = 100,out_channels = conv_out_channels,kernel_size = (10,1),padding = 'same')
        # Not used by forward(); kept so the attribute stays available to code that refers to `model.gru`.
        self.gru = nn.GRU(input_size = 12, hidden_size = 64, num_layers=1, batch_first = True)
        self.bn1 = nn.BatchNorm2d(25)
        self.bn2 = nn.BatchNorm2d(50)
        self.bn3 = nn.BatchNorm2d(100)
        self.bn4 = nn.BatchNorm2d(conv_out_channels)
        self.dropout = nn.Dropout(p=0.75)
        self.ELU = nn.ELU()
        self.flatten = nn.Flatten()
        # The transformer output is (B, 200, 12), so the classifier sees 200 * 12 = 2400 features.
        # (12800 = 200 * 64 would only match the output of the unused GRU.)
        self.linear1 = nn.Linear(conv_out_channels * t_dim, 4)

        # One learned embedding per sequence position.
        self.pos_emb = nn.Embedding(t_seq_length, t_dim)

        tblocks = [TransformerBlock(k=t_dim, heads=t_heads) for _ in range(t_depth)]
        self.tblocks = nn.Sequential(*tblocks)

    def forward(self,x):
        """
        :param x: Tensor of shape (batch, time, 22). The time axis must be reduced to 12 by the four
            pooling stages (e.g. 1000 time steps).
        :return: Tensor of shape (batch, 4) with one unnormalised score per class.
        """
        # (B, T, 22) -> (B, 22, T, 1): channels first, plus a dummy width axis for the 2-D convolutions.
        x = x.transpose(1, 2).unsqueeze(-1)

        # Conv Pool Blocks 1-4: conv -> ELU -> max-pool -> batch norm -> dropout
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in stages:
            x = self.dropout(bn(self.maxpool1(self.ELU(conv(x)))))

        # Drop only the dummy width axis, so that a batch of size one keeps its batch dimension.
        x = x.squeeze(-1)

        # transformer block
        b, t, k = x.size()

        # generate position embeddings on the same device as the input
        positions = torch.arange(t, device=x.device)
        positions = self.pos_emb(positions)[None, :, :].expand(b, t, k)

        x = self.tblocks(x + positions)

        #Flatten
        x = self.flatten(x)
        return self.linear1(x)



## Transformer

In [None]:
class Transformer(nn.Module):
  """
  Classifier that runs a CNN front end (`DeepCNN`, then a `BasicBlockCNN`) before a stack of
  transformer blocks, mean-pools over the sequence and maps the result to class log-probabilities.
  """

  def __init__(self, k, heads, depth, seq_length, num_tokens, num_classes):
    """
    :param k: Embedding dimension of the transformer blocks.
    :param heads: Number of attention heads per block.
    :param depth: Number of transformer blocks.
    :param seq_length: Number of position embeddings, i.e. the longest sequence the blocks can receive.
    :param num_tokens: Stored on the instance; not used otherwise.
    :param num_classes: Number of output classes.
    """
    super().__init__()

    self.num_tokens = num_tokens
    self.pos_emb = nn.Embedding(seq_length, k)

    self.drop = nn.Dropout(p=0.65)

    # CNN front end
    # conv1 is not used by forward(); it is kept so that existing attribute references keep working.
    self.conv1 = nn.Conv1d(22, 22, 5, stride=1)
    self.b1 = BasicBlockCNN(22)
    # NOTE(review): `DeepCNN` is not defined in the visible part of this file -- confirm where it comes
    #               from. forward() requires its output to be a (B, 22, L) tensor.
    self.dd = DeepCNN()

    # The sequence of transformer blocks that does all the
    # heavy lifting
    self.tblocks = nn.Sequential(*[TransformerBlock(k=k, heads=heads) for _ in range(depth)])

    # Maps the final output sequence to class logits
    self.toprobs = nn.Linear(k, num_classes)

  def forward(self, x):
    """
    :param x: A 3-D tensor whose axes 1 and 2 are swapped before it is passed to the CNN front end
              (presumably (b, time, channels) -- TODO confirm against the data loader).
    :return: A (b, c) tensor of log-probabilities over the
              classes (where c is the nr. of classes).
    """
    # CNN front end: swap axes 1 and 2, run the deep CNN, apply dropout, run the residual block.
    x = x.transpose(1, 2)
    x = self.dd(x)
    x = self.drop(x)
    x = self.b1(x)

    # Swap back, so that the last axis is the embedding axis expected by the transformer blocks.
    x = x.transpose(1, 2)

    b, t, k = x.size()

    # generate position embeddings on the same device as the input
    positions = torch.arange(t, device=x.device)
    positions = self.pos_emb(positions)[None, :, :].expand(b, t, k)

    x = self.tblocks(x + positions)

    # Average-pool over the t dimension and project to class
    # probabilities
    x = self.toprobs(x.mean(dim=1))
    return F.log_softmax(x, dim=1)

# Training loops (our way)

In [None]:
# Colab-only setup: mount Google Drive (the dataset path used later lives there), then clone the
# project repository and make it the working directory so that its modules can be imported.
# The `!` and `%` lines are IPython magics; this cell only runs inside a notebook.
from google.colab import drive
drive.mount('/content/drive')
!git clone https://github.com/TaykhoomDalal/ECE_C147_Project.git
%cd 'ECE_C147_Project'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path 'ECE_C147_Project' already exists and is not an empty directory.
/content/ECE_C147_Project


In [None]:
#
# TRAIN MODEL FUNCTION (train_model.py)
#
import numpy as np
import torch
import torch.nn as nn
from utils import parse_args_with_config, load_data
from datasets import NpDataset
import models
from torch.utils.tensorboard import SummaryWriter
import time
import os
from torchvision import transforms
from utils import load_data
from torch.utils.data import DataLoader
from train import train, validate


#ARGS
args = {'gpu': 0, 
        
        'dataset_root': '../drive/MyDrive/W2022/ECE C247/Final Proj stuff/data',

        'epochs': 1500,
        'batch_size': 64,
        'lr': 0.01,
        'lr_milestones': [350, 750, 1250],
        'lr_gamma':0.12,

        'num_workers': 2,
        'pin_memory': True,
        'channels_first': True,
        
        'run_name': 'template', 
        'log_root': 'logs/'}

#cuda
if torch.cuda.is_available():
  device = f'cuda:{args["gpu"]}'
else:
  device = None

t_heads = 2
t_depth = 1
t_seq_len = 100

#model = Transformer(k=22, heads=t_heads, depth=t_depth, seq_length=t_seq_len, num_tokens=100, num_classes=4)
model = DeepCNNBase()
model = model.to(device)
#High Val: 46.7
weights_save_file_name = 'transformer_h' + str(t_heads) + '_d' + str(t_depth) + '_l' + str(t_seq_len) + '.pt'



# logging
log_name = args['run_name'] + "_" + str(int(time.time()))
writer = SummaryWriter(os.path.join(args['log_root'], log_name))

# transforms and online data augmentation
transform_train = None
transform_test = None



# load dataset
#'''GUARD
data = load_data(args['dataset_root'])

# data is channels last by default (n, l, c)
# conv nets need channels first ie (n, c, l)
# optionally flip channels here
if args['channels_first']:
    data['X_train_valid'] = np.transpose(data['X_train_valid'], axes=(0, 2, 1))
    data['X_test'] = np.transpose(data['X_test'], axes=(0, 2, 1))

#GUARD'''



#PREPROCESSING
init_instances = len(data['X_train_valid'])

#Trimming (first 500)
'''
trimmed = trim_time(data['X_train_valid'], 500, 1000)
trimmed = duplicate_x(trimmed)
data['X_train_valid'] = np.append(data['X_train_valid'], trimmed, axis=0)
data['y_train_valid'] = np.append(data['y_train_valid'], data['y_train_valid'])
'''

#subsampling (sub 25)
'''
data['X_train_valid'] = subsample(data['X_train_valid'], sub=1000//t_seq_len)
data['X_test'] = subsample(data['X_test'], sub=1000//t_seq_len)
'''



#add noise
'''GUARD
num_instances = len(data['X_train_valid'])
for instance in range(num_instances):
  noise = np.random.normal(0, .1, (1, t_seq_len, 22))
  new_signal = data['X_train_valid'][instance] + noise
  data['X_train_valid'] = np.append(data['X_train_valid'], new_signal, axis=0)
  data['y_train_valid'] = np.append(data['y_train_valid'], data['y_train_valid'][instance])
'''


# create target to index mapping
unique_targets = np.unique(data['y_train_valid'])
offset = np.min(unique_targets)



train_dataset = NpDataset(data['X_train_valid'], data['y_train_valid'] - offset, transform=transform_train, store_as_tensor=True)
test_dataset = NpDataset(data['X_test'], data['y_test'] - offset, transform=transform_test, store_as_tensor=True)

# dataloaders
train_loader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True, num_workers=args['num_workers'], pin_memory=args['pin_memory'])
test_loader = DataLoader(test_dataset, batch_size=args['batch_size'], num_workers=args['num_workers'], pin_memory=args['pin_memory'])


# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE: the loaders are intentionally not iterated here. Looping over a
# DataLoader and calling `.to(device)` on each batch only rebinds the loop
# variables; the device copies are dropped as soon as the next batch arrives,
# so nothing stays on the GPU and the only effect is a full, wasted pass over
# both datasets. Batches have to be moved to `device` where they are consumed;
# `device` is passed to train()/validate() for that purpose.



#GUARD'''
# Loss: cross-entropy over the class scores.
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over all model parameters at the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])

# LR scheduler (disabled):
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args['lr_milestones'], args['lr_gamma'])

# Best-checkpoint bookkeeping; nothing has been validated yet.
best_val_acc, best_model = 0, None


In [None]:
# Quick sanity check: show the shape of the train/valid input array.
data['X_train_valid'].shape

(4230, 1000, 22)

In [None]:
# SGD with one parameter group per submodule. The transformer blocks get a
# learning rate of 0.4 and every other listed submodule gets 0.1. Each group
# sets its own "lr", so the `lr=` default below is not the effective rate
# for any of them.
slow_lr_modules = (
    "conv1", "maxpool1", "conv2", "conv3", "conv4", "gru",
    "bn1", "bn2", "bn3", "bn4", "ELU", "flatten", "linear1",
)
param_groups = [{"params": model.tblocks.parameters(), "lr": 0.4}]
param_groups.extend(
    {"params": getattr(model, module_name).parameters(), "lr": 0.1}
    for module_name in slow_lr_modules
)
optimizer = torch.optim.SGD(param_groups, lr=args['lr'])

In [None]:
#################################################################################################################################
#Edit Learning Rate
# Rebinds `optimizer` to a new Adam instance over all model parameters, with
# a hand-set learning rate of 0.01 rather than args['lr'].
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
#Assign best Model
# `best_model` is initialised to None and only filled in by the training
# loop. Guard against replacing `model` with None when this cell is run
# before any epoch has completed.
if best_model is None:
    print("best_model is not set yet; keeping the current model")
else:
    model = best_model

In [None]:
#Load .pt model from Drive
# Hyperparameters of the checkpoint to restore; they are passed to the model
# constructor and also encoded in the checkpoint file name.
t_heads = 2
t_depth = 1
t_seq_len = 25

model = Transformer(k=22, heads=t_heads, depth=t_depth, seq_length=t_seq_len, num_tokens=100, num_classes=4)
ref_path = '../drive/MyDrive/W2022/ECE C247/Final Proj stuff/transformers/checkpoints/'
file_path = ref_path + 'transformer_h' + str(t_heads) + '_d' + str(t_depth) + '_l' + str(t_seq_len) + '.pt'

# Resolve the target device in this cell so it also works on CPU-only
# runtimes instead of assuming 'cuda:0' exists.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU can still be loaded when no GPU is present.
model.load_state_dict(torch.load(file_path, map_location=device))
model = model.to(device)
#################################################################################################################################

In [None]:
# train loop stats
# Runs args['epochs'] epochs. Each epoch trains, validates, and, when the
# validation accuracy matches or beats the best seen so far, keeps an
# in-memory snapshot of the model and writes its weights to the checkpoint
# directory on Drive.
import copy  # cell-local import: needed to snapshot the best model below

for e in range(args['epochs']):
    print("Best Val:", best_val_acc)

    # train
    model.train()
    # Pass the shared `device` (as validate() does) instead of a hard-coded
    # "cuda:0", which breaks on CPU-only runtimes.
    train_loss, train_acc = train(model, criterion, optimizer, train_loader, e, device=device)

    # validate
    model.eval()
    val_loss, val_acc = validate(model, criterion, test_loader, e, device=device)

    print("")

    # `>=` means a tie with the previous best also refreshes the checkpoint.
    if val_acc >= best_val_acc:
        best_val_acc = val_acc
        # Deep copy on purpose: `best_model = model` would only alias the
        # live model, so later epochs would overwrite the "best" weights.
        best_model = copy.deepcopy(model)
        file_name = '../drive/MyDrive/W2022/ECE C247/Final Proj stuff/transformers/checkpoints/' + weights_save_file_name
        torch.save(model.state_dict(), file_name)

    # update learning rate (scheduler is currently disabled)
    #lr_scheduler.step()

    # Report the optimizer's learning rate every 100 epochs.
    if e % 100 == 0:
        print("\n\n\nLR: ")
        print(get_lr(optimizer))

    # Disabled: per-epoch TensorBoard logging.
    # best_val_acc = max(val_acc, best_val_acc)
    # writer.add_scalar("loss/train", train_loss, e)
    # writer.add_scalar("acc/train", train_acc, e)
    # writer.add_scalar("loss/val", val_loss, e)
    # writer.add_scalar("acc/val", val_acc, e)
    # writer.add_scalar("acc/val_best", best_val_acc, e)
    # writer.add_scalar("optim/lr", lr_scheduler.get_last_lr()[0], e)

# log hyperparams (disabled)
# writer.add_hparams({
#     "model": args.model,
#     "optimizer": args.optimizer,
#     "batch_size": args.batch_size,
#     "learning_rate": args.learning_rate,
#     "momentum": args.momentum,
#     "l2_reg": args.l2_reg,
#     "epochs": args.epochs
# }, {
#     "best_val_acc": best_val_acc
# })


Best Val: 0


  self.padding, self.dilation, self.groups)
Train 0:   0%|          | 0/34 [00:00<?, ?it/s]


RuntimeError: ignored