In [1]:
import json

In [2]:
import datasets
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [3]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForMaskedLM

In [4]:
model_name = 'cointegrated/rubert-tiny'

In [14]:
def start_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=312, padding='max_length')



alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMOPQRSTUVWXYZ<>^[-]()_%*""\\\n'
num = '0123456789'
def clear_words(sent):
    new_sent = sent
    for i in new_sent:
        if i in alphabet:
            new_sent = new_sent.replace(i, '')
        elif i in num:
            return ''
    if len(new_sent) == 0:
        return ''
    if new_sent[0] == ' ':
        return new_sent[1:] 
    else:
        return new_sent

In [8]:
with open('anekdot.json', encoding="utf-8") as f:
    anecdot = json.load(f)

In [9]:
print(anecdot)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
text_list = []
for i in range(len(anecdot)):
    text_list.append(anecdot[i]['text'])

In [11]:
print(text_list)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
word = [item for sentence in text_list for item in sentence.split('.') if item != '']

In [13]:
word = list(filter(lambda x: x != '\n', word))

In [15]:
new_words = []
for temp in word:
    if len(temp) > 8:
        new_temp = clear_words(temp)
        if new_temp != '':
            new_words.append(new_temp)
word = new_words

In [16]:
train_anecdot, test_anecdot = train_test_split(word, test_size=0.2)

In [17]:
train_dd = Dataset.from_dict({'text': train_anecdot})
test_dd = Dataset.from_dict({'text': test_anecdot})

In [18]:
anecdot_dd = datasets.DatasetDict({'train': train_dd, 'test': test_dd})

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [22]:
tokenizer_anecdot = anecdot_dd.map(start_function, batched=True)

  0%|          | 0/184 [00:00<?, ?ba/s]

  0%|          | 0/46 [00:00<?, ?ba/s]

In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
def labels(example):
    example['labels'] = example['input_ids'].copy()
    return example

In [27]:
tokenizer_anecdot = tokenizer_anecdot.map(labels, batched=True)

  0%|          | 0/184 [00:00<?, ?ba/s]

  0%|          | 0/46 [00:00<?, ?ba/s]

In [28]:
model = AutoModelForMaskedLM.from_pretrained(model_name)

Downloading:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
model.to('cuda')

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=Tr

In [31]:
training_args = TrainingArguments(
    output_dir='D:/Anecdote_BERT/',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.001
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_anecdot["train"],
    eval_dataset=tokenizer_anecdot["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [33]:
import torch
torch.cuda.empty_cache()

In [35]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [36]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text.
***** Running training *****
  Num examples = 183378
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 45846


RuntimeError: CUDA out of memory. Tried to allocate 36.00 MiB (GPU 0; 3.00 GiB total capacity; 1.45 GiB already allocated; 28.07 MiB free; 1.50 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF