In [1]:
from tqdm.notebook import tqdm
import transformers as tr
import os

model_name='EleutherAI/gpt-neo-125M'

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '9994'
os.environ['RANK'] = "0"
os.environ['LOCAL_RANK'] = "0"
os.environ['WORLD_SIZE'] = "1"

bos_token='<|endoftext|>'
eos_token='<|endoftext|>'
pad_token='<|pad|>'

In [2]:
import torch

print('cuda ',torch.version.cuda,
      '\ndevice ', torch.cuda.get_device_name(0))

cuda  11.1 
device  Tesla T4


In [3]:
tokenizer = tr.GPT2Tokenizer.from_pretrained(model_name,    
                            bos_token=bos_token,
                            eos_token=eos_token,
                            pad_token=pad_token,)
model = tr.GPTNeoForCausalLM.from_pretrained(model_name).cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50258, 768)

In [4]:
data=['bla blablabla bla bla']*30

In [5]:
max_length = max([len(tokenizer.encode(_)) for _ in tqdm(data)])
max_length

  0%|          | 0/30 [00:00<?, ?it/s]

10

In [6]:
from torch.utils.data import Dataset

class PythonDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in tqdm(txt_list):
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer( bos_token+txt +    
                                        eos_token,
                                        truncation=True,
                                        max_length=max_length, 
                                        padding='max_length',
                                      )
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx],self.attn_masks[idx]


data_collator = tr.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

dataset = PythonDataset(data, tokenizer, max_length)

  0%|          | 0/30 [00:00<?, ?it/s]

In [7]:
len_dataset=len(dataset)
print(len_dataset)
train_size = int(0.9 * len_dataset)

train_dataset, val_dataset = torch.utils.data.dataset.random_split(dataset, [train_size, len_dataset - train_size])
print(len(train_dataset),len(val_dataset))

30
27 3


In [8]:
save_dir='./results'
training_args = tr.TrainingArguments(output_dir=save_dir, num_train_epochs=5, logging_steps=300, save_steps=300,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,warmup_steps=50,
                                     learning_rate=0.001,adam_epsilon=1e-06,fp16=True,
                                  weight_decay=0.01, logging_dir=f'{save_dir}/logs', deepspeed='./ds_config.json')

[2021-06-29 21:53:00,608] [INFO] [distributed.py:47:init_distributed] Initializing torch distributed with backend: nccl


In [None]:
trainer = tr.Trainer(model=model, 
                     args=training_args,  
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset, 
                  data_collator=lambda data: 
              {'input_ids': torch.stack([f[0] for f in data]),       
               'attention_mask': torch.stack([f[1] for f in data]),
               'labels': torch.stack([f[0] for f in data])}
                    )
# Start training process!
trainer.train()

Using amp fp16 backend


[2021-06-29 21:53:00,650] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1, git-hash=unknown, git-branch=unknown
[2021-06-29 21:53:00,659] [INFO] [utils.py:13:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1


In [None]:
# at this point I cannot interrupt kenrel