In [1]:
import os
# Pin GPU enumeration to PCI bus order and expose only device "0" to this process.
# These are set before `torch` is imported in the next cell — presumably so the CUDA
# runtime sees them when it initializes.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [2]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, AutoConfig
from transformers import PhiConfig, PhiForCausalLM

In [4]:
# Maximum number of tokens per training sequence: used as `max_length` when tokenizing
# and passed as `n_ctx` when the model config is built further down.
context_length = 256
# Load the tokenizer for the "microsoft/phi-2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): the same assignment appears again in the tokenization cell; one of the
# two is redundant.
tokenizer.pad_token = tokenizer.eos_token

In [6]:
from phi2_dataset import dataset



In [7]:
from transformers import DataCollatorForLanguageModeling

# Pad with the EOS token.
# NOTE(review): this repeats an assignment already made in an earlier cell; re-assigning
# the same value is harmless but redundant.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(element):
    """Tokenize a batch of examples and return their token-id sequences.

    Args:
        element: Mapping with a ``"text"`` entry holding the raw text(s) to encode.

    Returns:
        A dict with the single key ``"input_ids"`` containing every sequence the
        tokenizer produced. Sequences are capped at ``context_length`` tokens
        (``truncation=True``), and overflowing tokens are requested as well, so one
        input text may yield more than one sequence.

    Note:
        Nothing is filtered by length here: sequences shorter than
        ``context_length`` are returned too. ``return_length=True`` is requested,
        but the resulting ``"length"`` field is not used.
    """
    encoded = tokenizer(
        element["text"],
        max_length=context_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_length=True,
    )
    return dict(input_ids=encoded["input_ids"])


# Apply `tokenize` to the dataset in batches. All original columns are removed, so only
# the "input_ids" field returned by `tokenize` is kept.
tokenized_datasets = dataset.map(
    tokenize,
    remove_columns=dataset.column_names,
    batched=True,
)

# Collator with masked-language-modeling turned off (mlm=False), i.e. configured for
# causal language modeling. Padding presumably relies on the pad token set above —
# confirm against the collator documentation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm=False,
)

In [8]:
# Start from the "microsoft/phi-2" configuration and override a handful of fields so
# they agree with the tokenizer loaded above.
config = AutoConfig.from_pretrained(
    "microsoft/phi-2",
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    vocab_size=len(tokenizer),
    # NOTE(review): `n_ctx` is the GPT-2 style name for the context size. The Phi config
    # may use `max_position_embeddings` instead, in which case this override would be
    # stored but have no effect — confirm against the PhiConfig documentation.
    n_ctx=context_length,
)
# The model is constructed from the config object only; this cell does not call
# `from_pretrained` for the model, so no checkpoint weights are loaded here.
model = PhiForCausalLM(config)

In [9]:
from transformers import Trainer, TrainingArguments

In [10]:
# Training hyperparameters.
# Sequences per optimizer step (per device):
#   per_device_train_batch_size * gradient_accumulation_steps = 4 * 4 = 16.
args = TrainingArguments(
    output_dir="phi2_ckpt",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Training length is fixed in optimizer steps rather than epochs.
    max_steps=500,
    learning_rate=3e-4,
    # NOTE(review): with the plain "constant" schedule, `warmup_ratio` likely has no
    # effect; "constant_with_warmup" is the variant that applies warmup — confirm which
    # one is intended.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.1,
    # NOTE(review): paged optimizers presumably need `bitsandbytes` installed — verify
    # in the target environment.
    optim="paged_adamw_32bit",
    fp16=True,
    logging_steps=10,
    save_steps=200,
)

In [11]:
# Wire the model, tokenized data, collator and hyperparameters into a Trainer.
# No evaluation dataset is passed, so this setup only trains.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [12]:
# Sanity check: display the device the Trainer resolved (the recorded output is cuda:0).
trainer.args.device

device(type='cuda', index=0)

In [13]:
# Run training. The recorded output shows 500 steps completing in about 6330 s.
# NOTE(review): the recorded 'epoch': 389.0 after 500 steps of 16 sequences suggests the
# training set is only a few dozen sequences, i.e. the data is repeated heavily —
# confirm that this is intended.
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,4.24
200,1.8608
300,1.7123
400,1.6466
500,1.5965


TrainOutput(global_step=500, training_loss=2.2112447204589842, metrics={'train_runtime': 6329.844, 'train_samples_per_second': 1.264, 'train_steps_per_second': 0.079, 'total_flos': 2.2638342410685092e+16, 'train_loss': 2.2112447204589842, 'epoch': 389.0})