In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datetime import datetime
# import torch
import json
import os
from datasets import load_dataset
import numpy as np
import evaluate

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# --- Model selection: swap the active line to use a different checkpoint ---
model_name = "Llama-3.1-8B-Instruct"
# model_name = "Llama-3.3-70B-Instruct"

# --- Dataset identifier and local directories ---
data_name = "mteb/tweet_sentiment_extraction"
data_path = "./data"
cache_dir = "./cache"
output_dir = "./results"
model_path = os.path.join("./pretrained_llms", model_name)

# No `split` argument is given here; the commented variants request a slice
# of the train split or the test split only.
dataset = load_dataset(
    data_name,
    cache_dir=data_path,
)
# dataset = load_dataset(data_name, cache_dir=data_path, split='train[10:20]')
# dataset = load_dataset(data_name, cache_dir=data_path, split='test')

In [None]:
# Load the 3-label sequence-classification model and its tokenizer from the
# local checkpoint directory `model_path`.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          add_eos_token=True,
                                          cache_dir=cache_dir)

# If the tokenizer defines no pad token, reuse EOS for padding and pad on the right.
if tokenizer.pad_token_id is None:
    print("No pad token found in tokenizer, setting pad token to eos token")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "right"

# If the model config defines no pad id, copy the tokenizer's pad id into it.
# NOTE(review): `use_cache = False` and the embedding resize below only run
# inside this branch, i.e. only when the model had no pad id - confirm intended.
if model.config.pad_token_id is None:
    print("No pad token found in model, setting pad token to eos token of tokenizer")
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.padding_side = "right"
    model.config.use_cache = False  # This can help with training stability
    # Resize the embedding matrix to the tokenizer's current length.
    # NOTE(review): no new tokens are added above (pad reuses EOS), so this is
    # presumably a no-op in size - confirm it is needed.
    model.resize_token_embeddings(len(tokenizer))


In [None]:
def tokenizer_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no ``padding`` argument is passed here.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded
# Tokenize the dataset in batches.
tokenized_data = dataset.map(tokenizer_function, batched=True)

# train = tokenized_data['train'].select(range(10000))
train, test = tokenized_data['train'], tokenized_data['test']

# Collator built on the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Verify the tokenizer settings: show the pad id held by the tokenizer and by the model config.
print(
    f"Pad token ID: {tokenizer.pad_token_id}",
    f"Model pad token ID: {model.config.pad_token_id}",
    sep="\n",
)

In [None]:
# # load the accuracy metric
# metric = evaluate.load("accuracy")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [None]:
# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict: ``{"accuracy", "precision", "recall", "f1"}`` mapped to their scores.
    """
    logits, labels = eval_pred
    # When the predictions arrive as a tuple of outputs, the logits are taken
    # to be its first element (same convention as the Transformers examples).
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]

    # Use the same averaging and zero-division policy for all three scores.
    # Previously only precision passed zero_division=0, so recall and F1 emitted
    # UndefinedMetricWarning whenever a class had no samples.
    score_kwargs = {"average": "weighted", "zero_division": 0}
    precision = precision_score(labels, predictions, **score_kwargs)
    recall = recall_score(labels, predictions, **score_kwargs)
    f1 = f1_score(labels, predictions, **score_kwargs)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
# Training configuration: step-based logging, evaluation and checkpointing.
training_args = TrainingArguments(
    output_dir="test_trainer",
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=1e-5,  # Experiment with different rates
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # lr_scheduler_type="linear",  # Add learning rate scheduling
    # warmup_steps=100,  # Implement learning rate warmup
    # gradient_accumulation_steps=4,
    # --- logging / evaluation ---
    logging_steps=250,
    eval_strategy="steps",
    # eval_steps=50,
    # --- checkpointing ---
    save_strategy="steps",
    save_steps=500,
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Trainer wiring: model, data splits, padding collator and metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the current `trainer`.
trainer.train()

In [None]:
# Write the trainer's model and the tokenizer into the same directory,
# ./saved_model/<model_name>.
save_model_path = os.path.join("./saved_model", model_name)
trainer.save_model(save_model_path)
tokenizer.save_pretrained(save_model_path)
print(f"Fine-tuned model saved to: {save_model_path}")


In [None]:
# Evaluate on the test split and keep the returned metrics in `eval_metrics`.
# The commented line evaluates on the train split instead.
# trainer.evaluate(eval_dataset=train) #evaluate train dataset
eval_metrics = trainer.evaluate(eval_dataset=test) #evaluate test dataset

In [4]:
# Load the saved model and tokenizer back from disk.
# The directory is derived from `model_name`, the same way the save cell builds
# it. A hard-coded 8B path would silently reload the wrong checkpoint when
# `model_name` is switched in the config cell.
save_model_path = os.path.join("./saved_model", model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    save_model_path,
    num_labels=3,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    save_model_path, add_eos_token=True, cache_dir=cache_dir
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OSError: Can't load tokenizer for './saved_model/Llama-3.1-8B-Instruct'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './saved_model/Llama-3.1-8B-Instruct' is the correct path to a directory containing all relevant files for a LlamaTokenizerFast tokenizer.

In [None]:
def save_metrics(metrics, model_name, data_name, base_dir="./results"):
    """
    Save metrics to a JSON file with automatic filename generation and collision handling.

    The file is named ``fine-tuned_<model>_<dataset-prefix>_metrics.json``, where the
    dataset prefix is the first five characters of the last ``/``-separated part of
    ``data_name``. If that file already exists, a timestamp (and, if still taken, a
    running counter) is appended so that an existing file is never overwritten.

    Args:
        metrics: JSON-serialisable object to write.
        model_name: Model identifier used in the filename; path separators are
            replaced by ``_`` so the file always lands directly in ``base_dir``.
        data_name: Dataset identifier, e.g. ``"org/dataset_name"``.
        base_dir: Directory to write into; created if missing.

    Returns:
        str: Path of the file that was written.

    Raises:
        TypeError: If ``metrics`` is not JSON-serialisable (raised before any
            file or directory is created).
    """
    # Serialise up front: a failure here must not leave a truncated file behind.
    payload = json.dumps(metrics, indent=4)

    # Create results directory if it doesn't exist
    os.makedirs(base_dir, exist_ok=True)

    # Create base filename
    dataset_prefix = data_name.split('/')[-1][:5]
    # Hub-style ids such as "org/model" would otherwise point into a sub-directory
    # that does not exist.
    safe_model_name = model_name.replace('/', '_').replace(os.sep, '_')
    base_filename = f"fine-tuned_{safe_model_name}_{dataset_prefix}_metrics"

    # Generate filename with collision handling
    filepath = os.path.join(base_dir, f"{base_filename}.json")
    if os.path.exists(filepath):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        stem = f"{base_filename}_{timestamp}"
        filepath = os.path.join(base_dir, f"{stem}.json")
        # The timestamp only has one-second resolution, so two saves within the
        # same second would collide; disambiguate with a counter.
        counter = 1
        while os.path.exists(filepath):
            filepath = os.path.join(base_dir, f"{stem}_{counter}.json")
            counter += 1

    # Save to JSON
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(payload)
    print(f"Metrics saved to: {filepath}")

    return filepath


In [None]:
# Write the evaluation metrics to JSON under the default base_dir ("./results");
# save_metrics returns the path it wrote.
save_metrics(eval_metrics, model_name, data_name)

In [None]:
# Run prediction on the test split and print the shapes of the raw
# predictions array and of the label ids returned with it.
predictions = trainer.predict(test)
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
# Alternative training configuration: 5 epochs with evaluation once per epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    optim="adamw_torch",
    learning_rate=1e-5,  # Experiment with different rates
    num_train_epochs=5,
    eval_strategy="epoch",
    # Optional settings kept for experimentation:
    # weight_decay=0.01,
    # lr_scheduler_type="linear",  # Add learning rate scheduling
    # warmup_steps=100,  # Implement learning rate warmup
    # eval_steps=50,
    # gradient_accumulation_steps=4,
)

# Trainer wiring: model, data splits, padding collator and metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the current `trainer`.
trainer.train()

In [3]:
# Spot-check one example: the label id returned with the predictions next to
# the dataset's own label and text for the same row.
# `predictions` comes from trainer.predict(test), so the row must be looked up
# in `test`; indexing `train` showed an unrelated example.
sample_idx = 3  # named so the builtin `id` is not shadowed
print(predictions.label_ids[sample_idx])
print(test['label'][sample_idx])
print(test['text'][sample_idx])

NameError: name 'predictions' is not defined

In [None]:
# Build a sentiment-analysis pipeline around the current model/tokenizer and
# print the prediction for every training text, processed in batches of 8.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    padding=True,
    truncation=True,
    max_length=256,
)
train_texts = KeyDataset(train, "text")
for out in classifier(train_texts, batch_size=8):
    print(out)

In [None]:
# TrainingArguments reference:
# https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/trainer#transformers.TrainingArguments
# Batch size of 8 per device for both training and evaluation, 30 training
# epochs, and save_total_limit set to 2.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=2,
    # evaluation_strategy="epoch",
    # gradient_accumulation_steps=4,
)

# Trainer using the data collator and the training arguments defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)

In [None]:
# Build a sentiment-analysis pipeline and print the prediction for every test
# text, processed in batches of 8.
# Two fixes relative to the previous version of this cell:
#   * `device=device` referenced a name that is never defined (its definition
#     is commented out in the import cell); use device_map="auto" like the
#     other pipeline cell.
#   * KeyDataset needs a single split. `dataset` is loaded without a `split`
#     argument and is indexed by split name, so integer indexing fails.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    padding=True,
    truncation=True,
    max_length=256,
)
for out in classifier(KeyDataset(dataset["test"], "text"), batch_size=8):
    print(out)

In [None]:
# Sanity check: the number of labels configured on the model should equal the
# number of distinct label values in the training split.
print(model.config.num_labels)  # Label count configured on the model
print(len(np.unique(train['label'])))  # Number of distinct labels in the train split

In [None]:
# Inspect the tokenized training examples.
print(train[0].keys())  # Column names of the first example (check the label column is present)
print(train[1]['input_ids'])  # Token ids of the second example

In [None]:
# The GPT-2 classes used below are not imported by the notebook's import cell,
# so this cell raised NameError; import them here.
from transformers import GPT2Config, GPT2ForSequenceClassification

# When training a transformer model, sequences are batched together for more
# efficient processing. Sequences can have different lengths, so they must be
# padded to a common length within each batch; DataCollatorWithPadding
# automates this using the tokenizer defined above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
config = GPT2Config()

# GPT classifier built from the pretrained 'gpt2' checkpoint; the dataset has 3 classes.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
# Use the EOS id as the pad id on the model config.
model.config.pad_token_id = model.config.eos_token_id
# NOTE(review): `tokenizer` here is presumably still the Llama tokenizer loaded
# earlier, whose token ids would not match the GPT-2 vocabulary - confirm
# before training this model with the existing tokenized data.

In [None]:
# Tokenize two sample sentences with the pipeline's tokenizer (padding and
# truncation enabled), then show the encoding and the pad token in use.
sample_texts = ["Example text", "I am a boy"]
pipeline_tokenizer = classifier.tokenizer
tokens = pipeline_tokenizer(sample_texts, padding=True, truncation=True)
print(tokens)
print(pipeline_tokenizer.pad_token)