In [1]:
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import torch
import os
from datasets import load_dataset
import numpy as np
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# model_path = "./pretrained_llms/Llama-3.3-70B-Instruct"
# model_path = "./pretrained_llms/Llama-3.1-8B"
model_name = "Llama-3.1-8B-Instruct"
model_path = os.path.join("./pretrained_llms", model_name)
data_path = "./data"
data_name = "mteb/tweet_sentiment_extraction"
cache_dir = "./cache"
output_dir="./results"

dataset = load_dataset(data_name, cache_dir=data_path)
# dataset = load_dataset(data_name, cache_dir=data_path, split='train[10:20]')
# dataset = load_dataset(data_name, cache_dir=data_path, split='test')

In [3]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          add_eos_token=True,
                                          cache_dir=cache_dir)
if tokenizer.pad_token_id is None:
    print("No pad token found, setting pad token to eos token")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "right"
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id   


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./pretrained_llms/Llama-3.1-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No pad token found, setting pad token to eos token


In [10]:
def tokenizer_function(examples):
    return tokenizer(examples['text'], truncation=True)   
# apply tokenizer function on your data
tokenized_data = dataset.map(tokenizer_function, batched=True)

train = tokenized_data['train']
test = tokenized_data['test']

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [11]:
# load the accuracy metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [12]:
training_args = TrainingArguments(output_dir="test_trainer",
                                  # gradient_accumulation_steps=4,  # Effectively reduces batch size in memory
                                  # per_device_train_batch_size=2,  # Smaller batch size
                                  num_train_epochs=5,                                  
                                  eval_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,     
    compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [6]:
# training arguments: https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/trainer#transformers.TrainingArguments
# set per_device_train_batch_size  per_device_eval_batch_size as 8,
# we will fine tune gpt model for 30 epochs , set the corresponding parameter
training_args = TrainingArguments(
    output_dir="test_trainer",
    #evaluation_strategy="epoch",
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=30, 
    save_total_limit=2    
    #gradient_accumulation_steps=4
    )

# set the data_collator and training arguments we defined above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator, 
    compute_metrics=compute_metrics,
    )

In [9]:
print(model.config.num_labels)  # Confirm matches actual label count
print(len(np.unique(train['label'])))  # Check actual unique label count

3
3


In [11]:
print(train[0].keys())  # Verify label column exists
print(train[1]['input_ids'])  # Confirm label format

dict_keys(['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'])
[128000, 2100, 2689, 328, 1846, 358, 690, 3194, 499, 1618, 304, 5960, 18842, 12340]


In [None]:
## When training a transformer model,
# it’s common to batch sequences together for more efficient processing.
# However, since sequences might have different lengths, they need to be padded to a common length within each batch.
#The DataCollatorWithPadding class automates this process. 

#define the collator, use DataCollatorWithPadding() with the defined tokenizer above
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
config = GPT2Config()

#define GPT classifier, use 'gpt2' pretrained LLM, we have 3 classes in our dataset 

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
model.config.pad_token_id = model.config.eos_token_id 

In [5]:
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer, device=device, padding=True, truncation=True, max_length=256)
for out in classifier(KeyDataset(dataset, "text"), batch_size=8):
    print(out)

KeyError: "Invalid key: 0. Please first select a split. For example: `my_dataset_dictionary['train'][0]`. Available splits: ['test', 'train']"

In [None]:
tokens = classifier.tokenizer(["Example text", "I am a boy"], padding=True, truncation=True)
print(tokens)
print(classifier.tokenizer.pad_token)