In [1]:
! pip install datasets transformers
! pip install pynvml numba

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np

from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch

from sklearn.model_selection import train_test_split

from datasets import DatasetDict, Dataset

In [3]:
def on_gpu(f):
    def wrapper():
        if torch.cuda.is_available():
            return f()
        else:
            print('cuda unavailable')
    return wrapper

In [4]:
if torch.cuda.is_available():
    from pynvml import *
    from numba import cuda

@on_gpu
def print_gpu_utilization():
    try:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    except Exception as e:
        print(e)

@on_gpu
def free_gpu_cache():
    print("Initial GPU Usage")
    print_gpu_utilization()                          

    torch.cuda.empty_cache()

    print("GPU Usage after emptying the cache")
    print_gpu_utilization()

In [5]:
! mkdir output

mkdir: cannot create directory ‘output’: File exists


In [6]:
BASIC_PATH = './'

In [None]:
data = pd.read_csv('./full.csv', header=0)
data

In [8]:
X = data

In [9]:
X_train, X_test = train_test_split(X, test_size=0.15, random_state=42)

In [10]:
MODEL_NAME = 'BlackSamorez/rudialogpt3_medium_based_on_gpt2_2ch'

In [11]:
BOS = '<bos>'
EOS = '<eos>'
CTX = '<ctx>'
RPL = '<rpl>'

SPECIAL_TOKENS = {
    'bos_token': BOS,
    'eos_token': EOS,
    'additional_special_tokens': [CTX, RPL]
}

In [12]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
num_new_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(len(tokenizer))

Embedding(50262, 1024)

In [14]:
def tokenization(example):
    return tokenizer(
        BOS + CTX + example['prompt'] + RPL + example['answer'] + EOS, 
        max_length=1024,
        padding='max_length', 
        truncation=True, 
        add_special_tokens=False,
        # return_tensors='pt'
    )

In [15]:
class TelegramDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for idx, row in data.iterrows():
            encodings_dict = tokenization(row)
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [16]:
def collate(data):
    return {
        'input_ids': torch.stack([f[0] for f in data]),
        'attention_mask': torch.stack([f[1] for f in data]),
        'labels': torch.stack([f[0] for f in data])
    }

def load_data_collator(tokenizer, mlm = False):
    # data_collator = DataCollatorForLanguageModeling(
    #     tokenizer=tokenizer, 
    #     mlm=mlm,
    # )
    return collate

In [17]:
# X_train = X_train[:int(len(X_train) * 0.1)]
# X_test = X_test[:int(len(X_test) * 0.1)]

In [18]:
dataset = DatasetDict({
    'train': TelegramDataset(X_train),
    'test': TelegramDataset(X_test),
})

In [19]:
# dataset = dataset.map(tokenization, batched=True)
# dataset

In [20]:
training_args = TrainingArguments(
    output_dir=f'{BASIC_PATH}/training',
    do_train=True,
    do_eval=True,
    seed=42,
    learning_rate=2e-5,
    weight_decay=1e-2,

    # 'lr_scheduler_type': 'cosine',
    gradient_accumulation_steps=10,
    per_device_train_batch_size=1,
    num_train_epochs=5,
    warmup_steps=0,
    max_grad_norm=1000,
    adam_epsilon=1e-6,

    log_level='debug',
    save_strategy='steps',
    disable_tqdm=False,
    logging_steps=100,
    eval_steps=100,
    save_steps=500,
    resume_from_checkpoint=True,
    per_device_eval_batch_size=2,
    evaluation_strategy='steps',
    fp16=True,
    # metric_for_best_model='loss',
    load_best_model_at_end=True,
    # greater_is_better=False,
    
    save_total_limit=1,
    
    report_to='none',
)

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=load_data_collator(tokenizer),
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [None]:
results = trainer.train()

***** Running training *****
  Num examples = 4606
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 10
  Total optimization steps = 2300
  Number of trainable parameters = 355876864


Step,Training Loss,Validation Loss


In [None]:
assert model.transformer.wte.weight.shape[0] == len(tokenizer)

In [22]:
free_gpu_cache()

Initial GPU Usage
GPU memory occupied: 15060 MB.
GPU Usage after emptying the cache
GPU memory occupied: 15060 MB.


In [None]:
results

TrainOutput(global_step=462, training_loss=0.6178220856241333, metrics={'train_runtime': 120.6189, 'train_samples_per_second': 15.288, 'train_steps_per_second': 3.83, 'total_flos': 244269883121664.0, 'train_loss': 0.6178220856241333, 'epoch': 2.0})