In [2]:
# IPython shell escape: print the directory the notebook kernel is running in.
!pwd

/home/booydar/Desktop/MIPT/memory_experiments/notebooks


In [3]:
import numpy as np
import torch
import torch.nn.functional as F
from typing import List, Optional, Tuple, Union
from transformers import PreTrainedModel, AutoModelForSequenceClassification

import math

from typing import List, Optional, Tuple, Union
from transformers import BertForSequenceClassification
import transformers
from transformers.modeling_outputs import SequenceClassifierOutput

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer for the small BERT checkpoint (4 layers, hidden size 256, 4 heads per its name).
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-4_H-256_A-4")

In [5]:
# Segment count and per-segment memory-token budget used to derive the tokenizer length below.
num_segments = 2
num_mem_tokens = 10

# NOTE(review): the right-hand side reads the attribute it overwrites, so re-running this
# cell compounds the value; reload the tokenizer before executing it a second time.
tokenizer.model_max_length  = (tokenizer.model_max_length - num_mem_tokens) * num_segments
# Put padding tokens in front of the real tokens.
tokenizer.padding_side = 'left'

In [6]:
# def tokenize_function(examples):
#     return tokenizer(examples["input"], padding="max_length", truncation=True)

# tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [7]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [8]:
# gen = iter(small_eval_dataset)
# src = next(gen)

### Finetune

In [9]:
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
import sys
sys.path.append('..')
# from modeling_rmt import RMTEncoderForSequenceClassification

In [10]:
import math

class RMTEncoderForSequenceClassification():
    """Recurrent-memory wrapper around a sequence-classification model.

    A long input is split into segments. Every segment is fed to the wrapped
    model with `num_mem_tokens` memory embeddings placed right after the leading
    [CLS] token, and the last hidden states at those same positions become the
    memory for the next segment. Attributes that are not found on the wrapper
    are looked up on the wrapped model (see `__getattr__`).
    """

    def __init__(self, config=None, base_model=None, **kwargs):
        """Wrap an existing model or build one from a config.

        Args:
            config: optional model config; when given, a fresh model is built from it.
            base_model: optional ready-made model; takes precedence over `config`.
            **kwargs: forwarded to the model constructor when `config` is used.
        """
        if config is not None:
            # Auto classes refuse direct instantiation; from_config is the supported entry point.
            self.model = AutoModelForSequenceClassification.from_config(config, **kwargs)

        if base_model is not None:
            self.model = base_model

    @classmethod
    def from_pretrained(cls, from_pretrained, **kwargs):
        """Load a pretrained classifier and wrap it.

        Args:
            from_pretrained: checkpoint name or path passed to
                `AutoModelForSequenceClassification.from_pretrained`.
            **kwargs: forwarded unchanged to the same call.

        Returns:
            A new wrapper instance around the loaded model.
        """
        base_model = AutoModelForSequenceClassification.from_pretrained(from_pretrained, **kwargs)
        return cls(base_model=base_model)

    def set_params(self,
                   drop_empty_segments=True,
                   sum_loss=False,
                   input_size=None,
                   input_seg_size=None,
                   backbone_cls=None,
                   num_mem_tokens=0,
                   bptt_depth=-1,
                   pad_token_id=0,
                   eos_token_id=1,
                   cls_token_id=101,
                   sep_token_id=102):
        """Configure segmentation and memory, then grow the embedding table.

        Args:
            drop_empty_segments: skip batch rows whose segment holds no real tokens.
            sum_loss: replace the returned loss with the sum of all segment losses.
            input_size: full length of one model input; defaults to the number of
                rows in the backbone's position-embedding table.
            input_seg_size: optional cap on the number of content tokens per segment.
            backbone_cls: accepted for signature compatibility; not used here.
            num_mem_tokens: number of memory tokens inserted into every segment.
            bptt_depth: when > -1, memory is detached for all but the last
                `bptt_depth` segments.
            pad_token_id: id used for padding and for building the attention mask.
            eos_token_id: accepted for signature compatibility; not used here.
            cls_token_id: id placed at the start of every segment.
            sep_token_id: id placed after the memory block and after the content.
        """
        if input_size is not None:
            self.input_size = input_size
        else:
            self.input_size = self.base_model.embeddings.position_embeddings.weight.shape[0]
        self.input_seg_size = input_seg_size

        self.bptt_depth = bptt_depth
        self.pad_token_id = pad_token_id
        self.cls_token = torch.tensor([cls_token_id])
        self.sep_token = torch.tensor([sep_token_id])
        self.num_mem_tokens = num_mem_tokens
        self.drop_empty_segments = drop_empty_segments
        self.sum_loss = sum_loss
        self.extend_word_embeddings()

    def set_memory(self, memory=None):
        """Return `memory`, or the embeddings of the memory token ids when it is None."""
        if memory is None:
            mem_token_ids = self.mem_token_ids.to(device=self.device)
            memory = self.base_model.embeddings.word_embeddings(mem_token_ids)
        return memory

    def extend_word_embeddings(self):
        """Append `num_mem_tokens` new ids to the vocabulary and resize the embeddings."""
        vocab_size = self.base_model.embeddings.word_embeddings.weight.shape[0]
        extended_vocab_size = vocab_size + self.num_mem_tokens
        self.mem_token_ids = torch.arange(vocab_size, vocab_size + self.num_mem_tokens)
        self.base_model.resize_token_embeddings(extended_vocab_size)

    def __call__(self, input_ids, **kwargs):
        """Run the wrapped model segment by segment, carrying memory forward.

        Args:
            input_ids: 2-D tensor of token ids, one row per example.
            **kwargs: extra arguments forwarded to the wrapped model's `forward`
                for every segment (e.g. `labels`). Any `attention_mask`,
                `token_type_ids` or `output_hidden_states` entries are overridden.

        Returns:
            The wrapped model's output for the last processed segment. When
            `sum_loss` is set, its `loss` entry is the sum over processed segments.

        Raises:
            ValueError: if every segment was skipped as empty, so there is no output.
        """
        memory = self.set_memory()
        segmented = self.pad_and_segment(input_ids)
        # pad_and_segment returns three parallel tuples; the segment count is the
        # length of any one of them, not of the outer tuple.
        n_segments = len(segmented[0])
        # Memory occupies the positions right after [CLS]; write and read use the same slice.
        mem_slice = slice(1, 1 + self.num_mem_tokens)

        outputs = []
        for seg_num, segment_data in enumerate(zip(*segmented)):
            input_ids, attention_mask, token_type_ids = segment_data
            if memory.ndim == 2:
                # First segment: broadcast the initial memory over the batch.
                memory = memory.repeat(input_ids.shape[0], 1, 1)
            if (self.bptt_depth > -1) and (n_segments - seg_num > self.bptt_depth):
                # Truncated BPTT: no gradient flows through memory of early segments.
                memory = memory.detach()

            seg_kwargs = dict(**kwargs)
            if self.drop_empty_segments:
                non_empty_mask = [not torch.equal(row, self.empty) for row in input_ids]
                if not any(non_empty_mask):
                    continue
                input_ids = input_ids[non_empty_mask]
                attention_mask = attention_mask[non_empty_mask]
                token_type_ids = token_type_ids[non_empty_mask]
                # Labels are optional (inference); only filter them when present.
                if seg_kwargs.get('labels') is not None:
                    seg_kwargs['labels'] = seg_kwargs['labels'][non_empty_mask]

                inputs_embeds = self.base_model.embeddings.word_embeddings(input_ids)
                inputs_embeds[:, mem_slice] = memory[non_empty_mask]
            else:
                inputs_embeds = self.base_model.embeddings.word_embeddings(input_ids)
                inputs_embeds[:, mem_slice] = memory

            seg_kwargs['inputs_embeds'] = inputs_embeds
            seg_kwargs['attention_mask'] = attention_mask
            seg_kwargs['token_type_ids'] = token_type_ids
            # Hidden states are needed to read the updated memory back.
            seg_kwargs['output_hidden_states'] = True

            out = self.model.forward(**seg_kwargs)
            outputs.append(out)

            new_memory = out.hidden_states[-1][:, mem_slice]
            if self.drop_empty_segments:
                # Rows that were skipped keep their previous memory.
                memory[non_empty_mask] = new_memory
            else:
                memory = new_memory

        if not outputs:
            raise ValueError("all segments are empty; nothing was passed to the model")

        if self.sum_loss:
            out['loss'] = torch.stack([o['loss'] for o in outputs]).sum(dim=-1)

        return out

    def _pad_add_special_tokens(self, tensor, seg_size):
        """Build `[CLS] mem... [SEP] tensor [SEP]` and right-pad it to `seg_size`."""
        device = self.device
        tensor = torch.cat([self.cls_token.to(device=device),
                            self.mem_token_ids.to(device=device),
                            self.sep_token.to(device=device),
                            tensor.to(device=device),
                            self.sep_token.to(device=device)])
        pad_size = seg_size - tensor.shape[0]
        if pad_size > 0:
            # Pad with the configured pad id so the attention mask below stays correct.
            tensor = F.pad(tensor, (0, pad_size), value=self.pad_token_id)
        return tensor

    def pad_and_segment(self, input_ids):
        """Split every row into fixed-size segments with special and memory tokens.

        Padding and the first/last token of each row are stripped, the remaining
        tokens are chunked from the end (so the first chunk may be shorter), and
        rows with fewer chunks are left-filled with "empty" segments. Also stores
        the empty-segment template in `self.empty`.

        Args:
            input_ids: 2-D tensor of token ids, one row per example.

        Returns:
            Three tuples of equal length (input id segments, attention masks,
            token type ids); each element has one row per example.

        Raises:
            ValueError: if no room is left for content tokens in a segment.
        """
        sequence_len = input_ids.shape[1]
        # Three slots per segment are taken by [CLS] and the two [SEP] tokens.
        input_seg_size = self.input_size - self.num_mem_tokens - 3
        if self.input_seg_size is not None and self.input_seg_size < input_seg_size:
            input_seg_size = self.input_seg_size
        if input_seg_size <= 0:
            raise ValueError("input_size is too small to hold memory and special tokens")

        n_segments = math.ceil(sequence_len / input_seg_size)

        # The empty-segment template does not depend on the row, so build it once.
        self.empty = self._pad_add_special_tokens(torch.empty(0, dtype=torch.long), self.input_size)

        augmented_inputs = []
        for row in input_ids:
            # Drop padding, then the original leading and trailing special tokens.
            row = row[row != self.pad_token_id][1:-1]

            # Chunk boundaries counted from the end, so only the first chunk varies in size.
            seg_sep_inds = [0] + list(range(len(row), 0, -input_seg_size))[::-1]
            input_segments = [row[s:e] for s, e in zip(seg_sep_inds, seg_sep_inds[1:])]
            input_segments = [self._pad_add_special_tokens(t, self.input_size) for t in input_segments]

            empty_segments = [self.empty for _ in range(n_segments - len(input_segments))]
            augmented_inputs.append(torch.cat(empty_segments + input_segments))

        augmented_inputs = torch.stack(augmented_inputs)
        attention_mask = torch.ones_like(augmented_inputs)
        attention_mask[augmented_inputs == self.pad_token_id] = 0

        token_type_ids = torch.zeros_like(attention_mask)

        input_segments = torch.chunk(augmented_inputs, n_segments, dim=1)
        attention_mask = torch.chunk(attention_mask, n_segments, dim=1)
        token_type_ids = torch.chunk(token_type_ids, n_segments, dim=1)

        return input_segments, attention_mask, token_type_ids

    def to(self, device):
        """Move the wrapped model to `device` and return the wrapper for chaining."""
        self.model = self.model.to(device)
        return self

    def cuda(self):
        """Move the wrapped model to CUDA and return the wrapper for chaining."""
        self.model.cuda()
        return self

    def __getattr__(self, attribute):
        """Delegate unknown attributes to the wrapped model."""
        if attribute == 'model':
            # Without this guard a missing `model` would recurse through __getattr__ forever.
            raise AttributeError(
                "RMTEncoderForSequenceClassification has no wrapped model; "
                "pass `config` or `base_model` to the constructor"
            )
        return getattr(self.model, attribute)

    def parameters(self, **kwargs):
        """Return the wrapped model's parameters."""
        return self.model.parameters(**kwargs)

    def named_parameters(self, **kwargs):
        """Return the wrapped model's named parameters."""
        return self.model.named_parameters(**kwargs)


In [11]:
# pretrained_model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5, output_hidden_states=True)

In [12]:
# pretrained_model = AutoModelForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=5, output_hidden_states=True)
# rmt = RMTEncoderForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=3)
# Load the small BERT checkpoint as a 3-label classifier and wrap it in the RMT wrapper.
rmt = RMTEncoderForSequenceClassification.from_pretrained('google/bert_uncased_L-4_H-256_A-4', num_labels=3)

Some weights of the model checkpoint at google/bert_uncased_L-4_H-256_A-4 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [13]:
# Configure the wrapper. input_size=None falls back to the backbone's position-embedding size.
# NOTE(review): num_mem_tokens is passed as the literal 0 here, while the earlier config cell
# sets the variable num_mem_tokens = 10 — confirm which value is intended.
rmt.set_params(
                drop_empty_segments=True,
                sum_loss=False,
                input_size=None, 
                input_seg_size=None, 
                backbone_cls=None,
                num_mem_tokens=0, 
                bptt_depth=-1, 
                pad_token_id=0, 
                eos_token_id=1,
                cls_token_id=101, 
                sep_token_id=102)

### load dataset 

In [14]:
# Tokenizer max_length for inputs; collate_fn also cuts raw input text to 10x this many characters.
input_seq_len = 512
# collate_fn cuts the raw label text to 10x this many characters before the label lookup.
target_seq_len = 2

In [15]:
# Tokenizer options used by collate_fn: truncate to input_seq_len tokens and pad each batch
# to its longest row, rounded up to a multiple of 64.
encode_plus_kwargs = {'max_length': input_seq_len,
                              'truncation': True,
                              'padding': 'longest',
                              'pad_to_multiple_of': 64}
# Not referenced by any other cell in this notebook.
generate_kwargs = {}
# Maps the dataset's label strings to class ids.
labels_map = {'Contradiction': 0, 'Entailment': 1, 'Not mentioned': 2}
num_labels = len(labels_map)

def collate_fn(batch):
    """Tokenize a list of examples and attach their integer class labels.

    Each example is a mapping with an 'input' text and an 'output' label string.
    Returns the tokenizer's batch encoding with an extra 'labels' tensor.
    """
    # Cap raw string lengths up front; very long strings may slow down tokenization.
    max_input_chars = input_seq_len * 10
    max_label_chars = target_seq_len * 10

    texts, label_names = [], []
    for example in batch:
        texts.append(example['input'][:max_input_chars])
        label_names.append(example['output'][:max_label_chars])

    features = tokenizer.batch_encode_plus(texts, return_tensors='pt', **encode_plus_kwargs)
    label_ids = np.array([labels_map[name] for name in label_names])
    features['labels'] = torch.from_numpy(label_ids)
    return features

In [16]:
import datasets
# Load the 'contract_nli' configuration of the 'tau/scrolls' dataset and keep its train split.
dataset = datasets.load_dataset('tau/scrolls', 'contract_nli')
train_dataset = dataset['train']

Reusing dataset scrolls (/home/booydar/.cache/huggingface/datasets/tau___scrolls/contract_nli/1.0.0/672021d5d8e1edff998a6ea7a5bff35fdfd0ae243e7cf6a8c88a57a04afb46ac)
100%|██████████| 3/3 [00:00<00:00, 67.71it/s]


In [17]:
# shuffle train data each epoch (one loop over train_dataset)
train_sampler = RandomSampler(train_dataset,)
# per_worker_batch_size = args.batch_size * args.gradient_accumulation_steps
# global_batch_size = per_worker_batch_size * hvd.size()
# Extra DataLoader options; num_workers=0 loads batches in the main process.
kwargs = {'pin_memory': True, 'num_workers': 0}
# Batches of 2 raw examples, tokenized and labelled by collate_fn.
train_dataloader = DataLoader(train_dataset, batch_size=2, sampler=train_sampler,
                                collate_fn=collate_fn, **kwargs)

In [18]:
# Pull a single collated batch to experiment with in the cells below.
gen = iter(train_dataloader)
sample = next(gen)

  return torch._C._cuda_getDeviceCount() > 0


In [19]:
# Display the batch: input_ids, token_type_ids, attention_mask and labels.
sample

{'input_ids': tensor([[  101, 18777,  2592,  ...,  2515,  2025,   102],
        [  101,  4909,  2283,  ..., 20141,  1025,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([1, 2])}

In [20]:
# Check the wrapped model's cache and decoder flags before experimenting with use_cache.
rmt.model.config.use_cache, rmt.model.config.is_decoder

(True, False)

In [21]:
# Run the whole batch through the RMT wrapper; every key except input_ids is forwarded as kwargs.
out = rmt(**sample)#, use_cache=True)

In [22]:
# Copy of the batch without input_ids, used as per-segment kwargs by the manual loop below.
# NOTE: this rebinds `kwargs`, which previously held the DataLoader options.
kwargs = sample.copy()
kwargs.pop('input_ids')

tensor([[  101, 18777,  2592,  ...,  2515,  2025,   102],
        [  101,  4909,  2283,  ..., 20141,  1025,   102]])

In [23]:
# from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# def forward(
#     self_,
#     input_ids: Optional[torch.LongTensor] = None,
#     attention_mask: Optional[torch.FloatTensor] = None,
#     token_type_ids: Optional[torch.LongTensor] = None,
#     position_ids: Optional[torch.LongTensor] = None,
#     head_mask: Optional[torch.FloatTensor] = None,
#     inputs_embeds: Optional[torch.FloatTensor] = None,
#     labels: Optional[torch.LongTensor] = None,
#     output_attentions: Optional[bool] = None,
#     output_hidden_states: Optional[bool] = None,
#     return_dict: Optional[bool] = None,
# ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
#     r"""
#     labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
#         Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
#         config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
#         `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
#     """
#     return_dict = return_dict if return_dict is not None else self_.config.use_return_dict

#     outputs = self_.base_model(
#         input_ids,
#         attention_mask=attention_mask,
#         token_type_ids=token_type_ids,
#         position_ids=position_ids,
#         head_mask=head_mask,
#         inputs_embeds=inputs_embeds,
#         output_attentions=output_attentions,
#         output_hidden_states=output_hidden_states,
#         return_dict=return_dict,
#     )
#     sequence_output = outputs[0]
#     logits = self_.classifier(sequence_output)

#     loss = None
#     if labels is not None:
#         if self_.config.problem_type is None:
#             if self_.num_labels == 1:
#                 self_.config.problem_type = "regression"
#             elif self_.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
#                 self_.config.problem_type = "single_label_classification"
#             else:
#                 self_.config.problem_type = "multi_label_classification"

#         if self_.config.problem_type == "regression":
#             loss_fct = MSELoss()
#             if self_.num_labels == 1:
#                 loss = loss_fct(logits.squeeze(), labels.squeeze())
#             else:
#                 loss = loss_fct(logits, labels)
#         elif self_.config.problem_type == "single_label_classification":
#             loss_fct = CrossEntropyLoss()
#             loss = loss_fct(logits.view(-1, self_.num_labels), labels.view(-1))
#         elif self_.config.problem_type == "multi_label_classification":
#             loss_fct = BCEWithLogitsLoss()
#             loss = loss_fct(logits, labels)

#     if not return_dict:
#         output = (logits,) + outputs[2:]
#         return ((loss,) + output) if loss is not None else output

#     return SequenceClassifierOutput(
#         loss=loss,
#         logits=logits,
#         hidden_states=outputs.hidden_states,
#         attentions=outputs.attentions,
#     )

In [24]:
# Manual, cell-level replay of RMTEncoderForSequenceClassification.__call__ for debugging.
# It leaves `seg_kwargs`, `out`, `outputs` and `memory` in the notebook namespace.
self = rmt
input_ids = sample['input_ids']

memory = self.set_memory()
segmented = self.pad_and_segment(input_ids)
# pad_and_segment returns (input_segments, attention_mask, token_type_ids), so the number of
# segments is the length of the first element, not of the tuple itself.
n_segments = len(segmented[0])
# Memory embeddings sit right after the leading [CLS] token; write and read use the same slice.
mem_slice = slice(1, 1 + self.num_mem_tokens)

outputs = []
for seg_num, segment_data in enumerate(zip(*segmented)):
    input_ids, attention_mask, token_type_ids = segment_data
    if memory.ndim == 2:
        # First segment: broadcast the initial memory over the batch.
        memory = memory.repeat(input_ids.shape[0], 1, 1)
    if (self.bptt_depth > -1) and (n_segments - seg_num > self.bptt_depth):
        memory = memory.detach()

    seg_kwargs = dict(**kwargs)
    if self.drop_empty_segments:

        non_empty_mask = [not torch.equal(input_ids[i], self.empty) for i in range(len(input_ids))]
        if sum(non_empty_mask) == 0:
            continue
        input_ids = input_ids[non_empty_mask]
        attention_mask = attention_mask[non_empty_mask]
        token_type_ids = token_type_ids[non_empty_mask]
        seg_kwargs['labels'] = seg_kwargs['labels'][non_empty_mask]

        inputs_embeds = self.base_model.embeddings.word_embeddings(input_ids)
        inputs_embeds[:, mem_slice] = memory[non_empty_mask]
    else:
        inputs_embeds = self.base_model.embeddings.word_embeddings(input_ids)
        inputs_embeds[:, mem_slice] = memory

    seg_kwargs['inputs_embeds'] = inputs_embeds
    seg_kwargs['attention_mask'] = attention_mask
    seg_kwargs['token_type_ids'] = token_type_ids
    
    out = self.model.forward(**seg_kwargs, output_hidden_states=True)
    outputs.append(out)

    # Read the updated memory from the positions it was written to.
    if self.drop_empty_segments:
        memory[non_empty_mask] = out.hidden_states[-1][:, mem_slice]
    else:
        memory = out.hidden_states[-1][:, mem_slice]

if self.sum_loss:
    out['loss'] = torch.stack([o['loss'] for o in outputs]).sum(dim=-1)

In [25]:
# out = self.model.forward(**seg_kwargs, output_hidden_states=True)
# out

In [26]:
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    **base_model_forward_kwargs
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
    r"""
    Sequence-classification forward pass that also exposes the raw backbone outputs.

    Any extra keyword arguments are printed and passed straight to `self.base_model`.

    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns a pair `(SequenceClassifierOutput, base_model_outputs)` when `return_dict` resolves to
    a true value; otherwise a plain tuple `(loss?, logits, *extra_outputs)`.
    """
    if return_dict is None:
        return_dict = self.config.use_return_dict

    print('base_model_forward_kwargs', base_model_forward_kwargs)

    backbone_kwargs = dict(
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    outputs = self.base_model(input_ids, **backbone_kwargs, **base_model_forward_kwargs)

    # Index 1 of the backbone output is the pooled representation.
    logits = self.classifier(self.dropout(outputs[1]))

    loss = None
    if labels is not None:
        config = self.config
        if config.problem_type is None:
            # Infer the problem type once from the label count and dtype, and cache it on the config.
            if self.num_labels == 1:
                config.problem_type = "regression"
            elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                config.problem_type = "single_label_classification"
            else:
                config.problem_type = "multi_label_classification"

        problem_type = config.problem_type
        if problem_type == "regression":
            criterion = MSELoss()
            if self.num_labels == 1:
                loss = criterion(logits.squeeze(), labels.squeeze())
            else:
                loss = criterion(logits, labels)
        elif problem_type == "single_label_classification":
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
        elif problem_type == "multi_label_classification":
            criterion = BCEWithLogitsLoss()
            loss = criterion(logits, labels)

    if return_dict:
        classifier_output = SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        return classifier_output, outputs

    output = (logits,) + outputs[2:]
    if loss is None:
        return output
    return (loss,) + output

In [37]:
# Mark the backbone config as a decoder.
# NOTE(review): presumably needed so that use_cache=True makes the backbone return
# past_key_values in the next cell — confirm; this mutates the shared model config.
self.model.base_model.config.is_decoder = True

In [38]:
# Call the patched forward on the last segment's kwargs; use_cache is passed through to the
# backbone via **base_model_forward_kwargs. Then list what both outputs contain.
out, base_model_outputs = forward(self.model, **seg_kwargs, use_cache=True, output_hidden_states=True)
out.keys(), base_model_outputs.keys()

base_model_forward_kwargs {'use_cache': True}


(odict_keys(['loss', 'logits', 'hidden_states']),
 odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'past_key_values']))

In [75]:
import types

class Dog:
    """Minimal class used to demonstrate replacing a method on one instance."""

    def bark(self):
        """Print the default bark."""
        print("WOOF")


def _bark(self):
    """Replacement bark that gets bound to a single instance below."""
    print("WoOoOoF!!")


boby = Dog()
boby.bark()  # WOOF

# Bind the replacement to this one instance only; the class itself is left untouched.
boby.bark = types.MethodType(_bark, boby)
boby.bark()  # WoOoOoF!!


WOOF
WoOoOoF!!


In [73]:
# Print every parameter tensor of the first encoder layer.
# (The saved cell ended in `print(p.)`, which is a syntax error; the recorded output is the
# parameter repr, so the intended statement is `print(p)`.)
for p in self.base_model.encoder.layer[0].parameters():
    print(p)

Parameter containing:
tensor([[-0.0021,  0.1765,  0.0537,  ..., -0.0698,  0.0719, -0.0218],
        [ 0.0611, -0.0031,  0.1003,  ...,  0.0282,  0.0014, -0.1518],
        [ 0.0477, -0.1658,  0.0351,  ..., -0.0924,  0.0100,  0.1146],
        ...,
        [ 0.0381,  0.0771, -0.0435,  ..., -0.0303,  0.0879, -0.0117],
        [-0.1050,  0.1055,  0.0615,  ...,  0.1040, -0.0787, -0.0220],
        [-0.0092,  0.0971, -0.0594,  ...,  0.0132,  0.0357, -0.0016]],
       requires_grad=True)
Parameter containing:
tensor([ 0.1300,  0.2067,  0.0079, -0.1088, -0.2239, -0.0473,  0.1249,  0.0819,
        -0.0583,  0.1661,  0.0429,  0.1532, -0.0517,  0.1935, -0.0403, -0.0074,
        -0.0473,  0.2197, -0.1637,  0.1842, -0.0364, -0.0465,  0.0167, -0.0109,
         0.0175, -0.1185,  0.2708, -0.1017, -0.0512, -0.0443,  0.0510,  0.1929,
        -0.0983, -0.1915, -0.0105,  0.1452, -0.2056,  0.0810, -0.0385,  0.0729,
        -0.3731, -0.0409,  0.0393,  0.1148,  0.1744,  0.2859,  0.0982, -0.0658,
         0.1262

In [47]:
# self.base_model.encoder.layer

In [48]:
# Inspect the cached key/values returned by the backbone: entry count and the first two shapes.
len(base_model_outputs.past_key_values), base_model_outputs.past_key_values[0].shape, base_model_outputs.past_key_values[1].shape

(4, torch.Size([2, 512, 256]), torch.Size([2, 512, 256]))

In [49]:
# Inspect the hidden states: how many are returned, plus the shapes of the first and last.
len(out['hidden_states']), out['hidden_states'][0].shape, out['hidden_states'][-1].shape

(5, torch.Size([2, 512, 256]), torch.Size([2, 512, 256]))

In [29]:
# NOTE(review): presumably a deliberate error to stop "Run All" at this point — confirm, and
# remove it before sharing the notebook, because every cell below is unreachable in a full run.
1/0

ZeroDivisionError: division by zero

In [None]:
# rmt.model.base_model

In [None]:
# List the fields present in the classifier output.
out.keys()

odict_keys(['loss', 'logits', 'hidden_states'])

In [None]:
# out['hidden_states']

In [None]:
# The patched forward returns a (classifier_output, base_model_outputs) pair when return_dict
# is enabled, so unpack it before inspecting the classifier output's keys.
out, base_model_outputs = forward(self.model, **seg_kwargs, output_hidden_states=True, use_cache=True)
out.keys()

odict_keys(['loss', 'logits', 'hidden_states'])

In [None]:
# 1/0

In [None]:
from transformers import TrainingArguments

# Default training arguments writing to ./test_trainer (replaced by a later cell).
training_args = TrainingArguments(output_dir="test_trainer")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
import numpy as np
from datasets import load_metric

# Accuracy metric object used by compute_metrics below.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the module-level `metric`.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    argmax over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
from transformers import TrainingArguments, Trainer

# Short CPU-only smoke-test configuration: 5 optimisation steps, train batch 2, eval batch 1,
# evaluation at the end of each epoch. Replaces the training_args created in the earlier cell.
training_args = TrainingArguments(per_device_train_batch_size=2, per_device_eval_batch_size=1, output_dir="test_trainer", evaluation_strategy="epoch", no_cuda=True, max_steps=5)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# augmented_inputs = np.load('augmented_inputs.npy', allow_pickle=True)
# attn_masks = np.load('attention_masks.npy', allow_pickle=True)
# tokenizer.decode(augmented_inputs[0][512:])

In [None]:
# Train the RMT wrapper with the Hugging Face Trainer.
# NOTE(review): small_train_dataset / small_eval_dataset are only assigned in commented-out
# code in the visible notebook, so this cell fails on a fresh kernel unless they are defined
# elsewhere — confirm and restore the dataset preparation cells.
trainer = Trainer(
    model=rmt,
    # model=classic_bert,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set  don't have a corresponding argument in `RMTEncoderForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RMTEncoderForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 5










[A[A[A[A[A[A[A[A[A[A

(tensor([[  101, 29206, 29207,  ...,     0,     0,     0],
        [  101, 29206, 29207,  ...,     0,     0,     0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]))


  0%|          | 0/5 [21:23<?, ?it/s]
  0%|          | 0/5 [20:56<?, ?it/s]
  0%|          | 0/5 [14:10<?, ?it/s]
  0%|          | 0/5 [11:58<?, ?it/s]
  0%|          | 0/5 [07:14<?, ?it/s]
  0%|          | 0/5 [06:41<?, ?it/s]
  0%|          | 0/5 [04:25<?, ?it/s]
  0%|          | 0/5 [03:41<?, ?it/s]
  0%|          | 0/5 [02:55<?, ?it/s]
  0%|          | 0/5 [02:18<?, ?it/s]
  0%|          | 0/5 [01:49<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Iterate the training subset in order, 4 examples per batch, and fetch the first batch.
# NOTE(review): small_train_dataset is only assigned in commented-out code in the visible
# notebook — confirm it is defined before running this cell. This also rebinds `gen`.
sampler = SequentialSampler(small_train_dataset)
dl = DataLoader(small_train_dataset, sampler=sampler, batch_size=4)
gen = dl.__iter__()
s = next(gen)