In [1]:
from datasets import load_dataset
import torch 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
# Checkpoint identifier that the tokenizer and model are loaded from.
model_name = "Qwen/Qwen2-0.5B"

# Load the 'imdb' dataset in its 'plain_text' configuration.
dataset = load_dataset(path="imdb", name="plain_text")

def preprocess(example, num_words=5):
    """Turn one example into a short prompt made of its leading words.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.
        num_words: Number of leading whitespace-separated words to keep.
            Defaults to 5.

    Returns:
        Dict with ``'prompt'`` (the kept words joined by single spaces) and
        ``'label'`` (copied through unchanged).

    Raises:
        ValueError: If ``num_words`` is negative.
    """
    if num_words < 0:
        raise ValueError(f"num_words must be non-negative, got {num_words}")
    # maxsplit stops scanning once enough words were found, so a long text is
    # not split in full just to keep its first few words.
    words = example['text'].split(maxsplit=num_words)[:num_words]
    return {'prompt': ' '.join(words), 'label': example['label']}

# Replace the raw 'text' column with the short prompt built by `preprocess`;
# the mapping is applied to the whole loaded dataset object.
dataset = dataset.map(function=preprocess, remove_columns=["text"])

In [2]:
# prepare the model for supervised fine-tuning

#Step 1: load the tokenizer for the model and make sure it defines a pad token
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_name, trust_remote_code = True)
# Fall back to EOS only when the checkpoint ships without a pad token, so a
# pad token the tokenizer already defines is never overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#Step 2: Load & config the model
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path = model_name, num_labels = 2,
                                           trust_remote_code = True)
# Keep the model's pad id in sync with the tokenizer used for batching.
model.config.pad_token_id = tokenizer.pad_token_id
# Only grow the embedding matrix. A checkpoint may ship more embedding rows
# than the tokenizer has entries; resizing down to len(tokenizer) would drop
# those rows for no benefit, since no tokens were added here.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  #dynamically pad to the longest sequence in each mini-batch

#Step 3: Transform the dataset into tokenized format
def tokenize_function(examples):
    """Tokenize a batch of prompts, truncated to 128 tokens and left unpadded.

    Padding is deliberately skipped here; the data collator created earlier
    in this cell pads each mini-batch.
    """
    encoded = tokenizer(examples['prompt'], max_length = 128, truncation = True, padding = False)
    return encoded

tokenized_dataset = dataset.map(tokenize_function, batched = True)
# Expose the label column under the name 'labels'.
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# Return only the listed columns, as torch tensors.
tokenized_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'labels'])

#Step 4: Set up training arguments
training_args = TrainingArguments(
    output_dir="./Qwen2-0.5B-finetuned-imdb",
    # Schedule: evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,  # restore the best checkpoint once training finishes
    num_train_epochs=3,
    # Batch sizes are per device.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,  # weight-decay coefficient handed to the optimizer
    bf16 = True  # bfloat16 mixed precision; needs hardware that supports it
)

#Step 5: Set up trainer object
# NOTE(review): the 'test' split is used for per-epoch evaluation, and
# training_args enables load_best_model_at_end, so checkpoint selection sees
# the test data. Consider holding out a validation split from 'train' instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
)

#Step 6: train, then save the model together with the tokenizer so the output directory is self-contained
trainer.train()
# Reuse the directory configured in training_args instead of repeating the
# path literal, so the save location cannot drift from output_dir.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2-0.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
import os

import torch

# Intra-op parallelism: threads used inside a single operator.
# os.cpu_count() may return None when the count cannot be determined, so fall
# back to one thread instead of passing None to PyTorch.
torch.set_num_threads(os.cpu_count() or 1)

# Inter-op parallelism: thread pool that launches independent ops.
# PyTorch allows this to be set only once, before inter-op work has started;
# report the failure instead of aborting the cell when it is re-run.
try:
    torch.set_num_interop_threads(4)
except RuntimeError as err:
    print(f"Inter-op thread count left unchanged: {err}")

print(torch.__config__.parallel_info())

ATen/Parallel:
	at::get_num_threads() : 8
	at::get_num_interop_threads() : 4
OpenMP 201811
	omp_get_max_threads() : 8
MKLDNN not found
std::thread::hardware_concurrency() : 8
Environment variables:
	OMP_NUM_THREADS : [not set]
ATen parallel backend: OpenMP

