In [1]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

# from box import Box

import warnings

In [2]:
config = {
    'data_path' : '/content/test.csv',
    'max_len' : 64,
    'hidden_units' : 256, # Embedding size
    'num_heads' : 2, # Multi-head layer
    'num_layers': 2, # block (encoder layer)
    'dropout_rate' : 0.1, # dropout
    'lr' : 0.001,
    'batch_size' : 32,
    'num_epochs' : 4,
    'num_workers' : 2,
    'mask_prob' : 0.15, # for cloze task
    'time_seq' : 500, # time limit for one sequence
    'test_size' : 0.33

}

MAX_LEN = 100

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
#@title Dataset



In [4]:
import random
class MakeDataSet():
  '''
  make data for binary
  classification
  '''
  def __init__(self,config):
    self.dfinit = pd.read_csv(config['data_path'])
    self.df = self.dfinit.sample(frac=0.25, replace=True, random_state=1)
    self.d1 = {i:[] for i in self.df.userId.unique() }
    self.gen_seq = self.generate_seq(config['time_seq'])
    self.user_id_for_neg = []
    self.len_positive = len(self.create_positive())
    self.final_dataframe = self.final_data()
    self.items = len(self.df['movieId'].unique())

  def generate_seq(self,time):
    for user in self.df.userId.unique():
      temp = self.df.loc[self.df['userId']==user].sort_values('timestamp')
      start = temp.iloc[0,-1]
      new =[]
      for index,rows in temp.iterrows():
        new.append(int(rows['movieId']))

        if rows['timestamp']-start > time:#this can be an input parameter defining a session
          self.d1[rows['userId']].append(new)
          new = []
          start = rows['timestamp']
    return self.d1

  def create_positive(self):
    #create dataframe for positive instances
    #first = pd.DataFrame(columns = ['seq1','seq2','label'])
    df = []
    for user in self.gen_seq:
      if len(self.gen_seq[user]) > 1:
        dict_list = []
        for i in range(len(self.gen_seq[user])-1):
          row_dict = {'seq1': self.gen_seq[user][i], 'seq2': self.gen_seq[user][i+1], 'label': 1}
          dict_list.append(row_dict)
      elif len(self.gen_seq[user]) == 1:
        self.user_id_for_neg.append(user)

      first = pd.DataFrame.from_dict(dict_list)
      df.append(first)
    final_df = pd.concat(df, ignore_index=True)

    return final_df



  def create_negative(self):
    #create dataframe for negative instances
    df = []
    # check user ids with 1 seq and combine with any other user


    #randomly generate two user ids such that they are not equal
    #randomly pick any item sets for both
    dict_list = []

    for _ in tqdm(range(self.len_positive)):

      user1 = random.choice(list(self.d1.keys()))

      user2 = random.choice(list(self.d1.keys()))

      if len(self.gen_seq[user1]) > 0 :
        val1 =  self.gen_seq[user1][random.randrange(0,len(self.gen_seq[user1]))]

      else:
        val1 = []

      if len(self.gen_seq[user2]) > 0:
        val2 =  self.gen_seq[user2][random.randrange(0,len(self.gen_seq[user2]))]

      else:
        val2 = []

      row_dict = {'seq1': val1, 'seq2': val2, 'label': 0}
      dict_list.append(row_dict)
    third = pd.DataFrame.from_dict(dict_list)
    df.append(third)

    final_df = pd.concat(df, ignore_index=True)

    return final_df

  def final_data(self):
    t1 = self.create_positive()
    t2 = self.create_negative()
    return pd.concat([t1,t2])

  def train_test_data(self):
    X = self.final_dataframe.iloc[:,0:2]
    y = self.final_dataframe.iloc[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = config['test_size'], random_state=42)
    return X_train, X_test, y_train, y_test

  def max_len(self):
    t = self.final_dataframe
    max_len = 0
    for item in t.iloc[:,0]:
      max_len= max(max_len,len(item))
    for _item in t.iloc[:,1]:
      max_len= max(max_len,len(_item))
    return max_len




In [5]:
s1 = MakeDataSet(config)

100%|██████████| 64/64 [00:00<00:00, 114422.62it/s]


In [6]:
s1.final_dataframe

Unnamed: 0,seq1,seq2,label
0,"[804, 804, 2826, 3578, 1473, 1473, 1473, 356, ...",[2174],1
1,[2174],"[260, 2174, 260, 2093, 2161, 367, 3440, 1214, ...",1
2,"[260, 2174, 260, 2093, 2161, 367, 3440, 1214, ...","[2993, 1275, 480, 2058, 2414, 648, 362, 2617, ...",1
3,"[2993, 1275, 480, 2058, 2414, 648, 362, 2617, ...","[2139, 2268, 2985, 2389, 2389, 2387, 1208, 2654]",1
4,"[2139, 2268, 2985, 2389, 2389, 2387, 1208, 2654]","[1278, 3386, 1092, 3273, 163, 163, 1206, 3033,...",1
...,...,...,...
59,"[235, 920, 475]","[344, 344, 165, 595, 480, 434, 208, 253, 110, ...",0
60,"[168, 36, 6, 31, 497, 237, 45, 348, 145, 222, ...","[804, 804, 2826, 3578, 1473, 1473, 1473, 356, ...",0
61,[8907],[30816],0
62,"[310, 839, 212, 536, 880, 835, 848, 999, 359, ...",[],0


In [7]:
import itertools
class BERTDataset(Dataset):
  def __init__(self,data,seq_len = 64,mask_prob = 0.15):
    self.seq_len = seq_len
    self.data = data
    self.label = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
    self.mask_prob = mask_prob


  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    # We already have sentence pairs as positive negative
    # This model does not consider cloze task (like Bert4Rec does)
    # Add CLS and SEP tokens to start and end of sentences

    t1,t2,is_next_label = self.data.iloc[index,0],self.data.iloc[index,1],self.data.iloc[index,2]

    t1_random, t1_label = self.random_word(t1,self.mask_prob,self.seq_len)
    t2_random, t2_label = self.random_word(t2,self.mask_prob,self.seq_len)


    t1 = [self.label['[CLS]']] + t1 + [self.label['[SEP]']]
    t2 = t2 + [self.label['[SEP]']]
    t1_label = [self.label['[PAD]']] + t1_label + [self.label['[PAD]']]
    t2_label = t2_label + [self.label['[PAD]']]

    # Combine sequence 1 and 2 as one sequence
    # adding pad token to make the sentence same as seq_len
    segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
    bert_input = (t1 + t2)[:self.seq_len]
    bert_label = (t1_label + t2_label)[:self.seq_len]
    padding = [self.label['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
    bert_input.extend(padding)
    segment_label.extend(padding)
    output = {"bert_input": bert_input,
              "bert_label": bert_label,
              "segment_label": segment_label,
              "is_next": is_next_label}
    #print(output)
    return {key: torch.tensor(value) for key, value in output.items()}
  def random_word(self,user_seq, mask_prob, max_len):

        tokens = []
        labels = []
        for s in user_seq:
            prob = np.random.random()
            if prob < mask_prob:
                prob /= mask_prob
                if prob < 0.8:
                    # masking
                    tokens.append(3)  # mask_index: num_item + 1, 0: pad, 1~num_item: item index
                elif prob < 0.9:
                    # noise
                    tokens.append(random.randrange(s1.items))
                else:
                    tokens.append(s)
                labels.append(s)
            else:
                tokens.append(s)
                labels.append(0)

        mask_len = max_len - len(tokens)
        #tokens = [0] * mask_len + tokens
        #labels = [0] * mask_len + labels
        tokens = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in tokens]))
        labels = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in labels]))

        return tokens,labels#torch.LongTensor(tokens), torch.LongTensor(labels)


In [8]:
traindata = BERTDataset(s1.final_dataframe, config['max_len'])

In [9]:
train_loader = DataLoader(traindata, batch_size=32, shuffle=True, pin_memory=True)


In [10]:
#sample_data = next(iter(train_loader))
print(traindata[random.randrange(len(traindata))])

{'bert_input': tensor([    1,  2105,  2105,  2105,   849,   647,  7991,  2851, 26409,  2018,
            2,  2105,  2105,  2105,   849,   647,  7991,  2851, 26409,  2018,
            2,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'bert_label': tensor([   0,    0,    0,    0,  849,    0,    0,    0,    0, 2018,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]), 'segment_label': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'is_next': tensor(0)}


# Model Architecture

In [11]:
class PositionalEmbedding(torch.nn.Module):

    def __init__(self, d_model, max_len=128):
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False

        for pos in range(max_len):
            # for each dimension of the each position
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))

        # include the batch size
        self.pe = pe.unsqueeze(0)
        # self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe



In [12]:
p = PositionalEmbedding(128,768)

In [13]:
p.pe

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  6.4791e-01,  6.8156e-01,  ...,  1.0000e+00,
           1.3335e-08,  1.0000e+00],
         [ 9.0930e-01, -1.6044e-01,  9.9748e-01,  ...,  1.0000e+00,
           2.6670e-08,  1.0000e+00],
         ...,
         [-9.9975e-01, -9.1578e-01,  9.4656e-01,  ...,  1.0000e+00,
           1.0201e-05,  1.0000e+00],
         [-5.2150e-01, -8.9930e-01,  4.7282e-01,  ...,  1.0000e+00,
           1.0215e-05,  1.0000e+00],
         [ 4.3622e-01, -2.4954e-01, -2.5457e-01,  ...,  1.0000e+00,
           1.0228e-05,  1.0000e+00]]])

In [14]:
class BERTEmbedding(torch.nn.Module):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)

        sum of all these features are output of BERTEmbedding
    """

    def __init__(self, vocab_size, embed_size, seq_len=64, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """

        super().__init__()
        self.embed_size = embed_size
        # (m, seq_len) --> (m, seq_len, embed_size)
        # padding_idx is not updated during training, remains as fixed pad (0)
        self.token = torch.nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.segment = torch.nn.Embedding(3, embed_size, padding_idx=0)
        self.position = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)



In [15]:
test = traindata[random.randrange(len(traindata))]

In [16]:
s1.items

212

In [17]:
len(test['segment_label'])

64

In [18]:
b = BERTEmbedding(s1.items, 200, seq_len=64, dropout=0.1)

In [19]:
#print(b.position.pe)
#print(b.token)
#print(b.segment)

In [20]:
#b.forward(test['bert_input'],test['segment_label'])

In [21]:
### attention layers
class MultiHeadedAttention(torch.nn.Module):

    def __init__(self, heads, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()

        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.output_linear = torch.nn.Linear(d_model, d_model)

    def forward(self, query, key, value):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        """
        # (batch_size, max_len, d_model)
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)

        # (batch_size, max_len, d_model) --> (batch_size, max_len, h, d_k) --> (batch_size, h, max_len, d_k)
        query = query.view(query.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        key = key.view(key.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        value = value.view(value.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)

        # (batch_size, h, max_len, d_k) matmul (batch_size, h, d_k, max_len) --> (batch_size, h, max_len, max_len)
        scores = torch.matmul(query, key.permute(0, 1, 3, 2)) / math.sqrt(query.size(-1))
        # fill 0 mask with super small number so it wont affect the softmax weight
        # (batch_size, h, max_len, max_len)
        #scores = scores.masked_fill(mask == 0, -1e9)
        # (batch_size, h, max_len, max_len)
        # softmax to put attention weight for all non-pad tokens
        # max_len X max_len matrix of attention
        weights = F.softmax(scores, dim=-1)
        weights = self.dropout(weights)

        # (batch_size, h, max_len, max_len) matmul (batch_size, h, max_len, d_k) --> (batch_size, h, max_len, d_k)
        context = torch.matmul(weights, value)

        # (batch_size, h, max_len, d_k) --> (batch_size, max_len, h, d_k) --> (batch_size, max_len, d_model)
        context = context.permute(0, 2, 1, 3).contiguous().view(context.shape[0], -1, self.heads * self.d_k)

        # (batch_size, max_len, d_model)
        return self.output_linear(context)


In [22]:
class FeedForward(torch.nn.Module):
    "Implements FFN equation from the Transformer model."

    def __init__(self, d_model, middle_dim=2048, dropout=0.1):
        super(FeedForward, self).__init__()

        self.fc1 = torch.nn.Linear(d_model, middle_dim)
        self.fc2 = torch.nn.Linear(middle_dim, d_model)
        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.GELU()

    def forward(self, x):
        out = self.activation(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out

In [23]:
class EncoderLayer(torch.nn.Module):
    def __init__(
        self,
        d_model=768,
        heads=12,
        feed_forward_hidden=768 * 4,
        dropout=0.1
        ):
        super(EncoderLayer, self).__init__()
        self.layernorm = torch.nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadedAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model, middle_dim=feed_forward_hidden)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        # embeddings: (batch_size, max_len, d_model)
        # encoder mask: (batch_size, 1, 1, max_len)
        # result: (batch_size, max_len, d_model)
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        # residual layer
        interacted = self.layernorm(interacted + embeddings)
        # bottleneck
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

In [24]:
class BERT(torch.nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, d_model=768, n_layers=12, heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """

        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads

        # paper noted they used 4 * hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = d_model * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=d_model)

        # multi-layers transformer blocks, deep network
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(d_model, heads, d_model * 4, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        # attention masking for padded token
        # (batch_size, 1, seq_len, seq_len)
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x, segment_info)

        # running over multiple transformer blocks
        for encoder in self.encoder_blocks:
            x = encoder.forward(x, mask)
        return x

class NextSentencePrediction(torch.nn.Module):
    """
    2-class classification model : is_next, is_not_next
    """

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, 2)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # use only the first token which is the [CLS]
        return self.softmax(self.linear(x[:, 0]))

In [25]:
class BERT4NIP(torch.nn.Module):
    """
    BERT Language Model
    Next Sentence Prediction Model + Masked Language Model
    """

    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert: BERT model which should be trained
        :param vocab_size: total vocab size for masked_lm
        """

        super().__init__()
        self.bert = bert
        self.next_sentence = NextSentencePrediction(self.bert.d_model)


    def forward(self, x, segment_label):
        x = self.bert(x, segment_label)
        return self.next_sentence(x)

In [26]:
class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

# Training

In [27]:
class BERTTrainer:
    def __init__(
        self,
        model,
        train_dataloader,
        test_dataloader=None,
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cpu'
        ):

        self.device = device
        self.model = model
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.bert.d_model, n_warmup_steps=warmup_steps
            )

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        mode = "train" if train else "test"

        # progress bar
        data_iter = tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:

            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            # transpose to (m, vocab_size, seq_len) vs (m, seq_len)
            # criterion(mask_lm_output.view(-1, mask_lm_output.size(-1)), data["bert_label"].view(-1))
            #mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss# + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))
        print(
            f"EP{epoch}, {mode}: \
            avg_loss={avg_loss / len(data_iter)}, \
            total_acc={total_correct * 100.0 / total_element}"
        )

In [28]:
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
s1.items

212

In [29]:
'''test run'''

train_data = BERTDataset(s1.final_dataframe, config['max_len'])

train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)

bert_model = BERT(
  vocab_size=s1.items,
  d_model=768,
  n_layers=2,
  heads=12,
  dropout=0.1
)

bert_lm = BERT4NIP(bert_model, s1.items)
bert_trainer = BERTTrainer(bert_lm, train_loader, device='cpu')
#bert_trainer.model.to(device)
for epoch in tqdm(range(1, config['num_epochs'] + 1)):
  bert_trainer.train(epoch)



Total Parameters: 14339330


  0%|          | 0/4 [00:00<?, ?it/s]
EP_train:1:   0%|| 0/4 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [5] at entry 0 and [22] at entry 1