## Configure Device

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

## Import Libs and Modules

In [2]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, AutoConfig
from transformers import PhiConfig, PhiForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [3]:
from phi2_dataset import dataset
from phi2_tokenizer import tokenize_mapper, tokenizer, context_length

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Tokenize dataset and create data collator

In [4]:
tokenized_datasets = dataset.map(tokenize_mapper, batched=True, remove_columns=dataset.column_names)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

## Get the Model configuration and Create a new Instance of Phi2 with random weights

In [5]:
config = PhiConfig.from_json_file("Phi2-Config.json")
config.vocab_size = len(tokenizer)
config.n_ctx=context_length
config.bos_token_id=tokenizer.bos_token_id
config.eos_token_id=tokenizer.eos_token_id

model = PhiForCausalLM(config)

## Create Training Args and Trainer

In [6]:
training_args = TrainingArguments(
    output_dir="phi2_ckpt",
    per_device_train_batch_size=4,
    logging_steps=10,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    max_steps=500,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    learning_rate=3e-4,
    save_steps=200,
    fp16=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets)

## Begin Training

In [7]:
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,11.5008
20,7.909
30,7.3901
40,6.9285
50,6.782
60,6.6805
70,6.3897
80,6.1442
90,6.0213
100,5.7038


TrainOutput(global_step=500, training_loss=5.113042510986328, metrics={'train_runtime': 5229.2064, 'train_samples_per_second': 1.53, 'train_steps_per_second': 0.096, 'total_flos': 2.716279088211864e+16, 'train_loss': 5.113042510986328, 'epoch': 1.0})