In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fix Random Seeds
seed = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [8]:
model_name = r"C:\LLMs\Meta-Llama-3-8B-Instruct"
# model_name = "./Meta-Llama-3-8B-Instruct"
data_path = "mteb/tweet_sentiment_extraction"
cache_dir = "./cache"


In [9]:
dataset = load_dataset(data_path)
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          add_eos_token=True,
                                          cache_dir=cache_dir)

Using the latest cached version of the dataset since mteb/tweet_sentiment_extraction couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Philippe\.cache\huggingface\datasets\mteb___tweet_sentiment_extraction\default\0.0.0\62146448f05be9e52a36b8ee9936447ea787eede (last modified on Tue Jun 25 12:02:54 2024).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
print(terminators)

[128009, 128009]


In [11]:
print(
    f"Pad Token id: {tokenizer.bos_token_id} and Pad Token: {tokenizer.bos_token}"
)
print(
    f"EOS Token id: {tokenizer.eos_token_id} and EOS Token: {tokenizer.eos_token}"
)

Pad Token id: 128000 and Pad Token: <|begin_of_text|>
EOS Token id: 128009 and EOS Token: <|eot_id|>


In [None]:
tokenizer.pad_token = tokenizer.eos_token
def tokenizer_function(examples):
    return tokenizer(examples['text'], truncation=True)   
# apply tokenizer function on your data
tokenized_data = dataset.map(tokenizer_function, batched=True)

In [2]:
# select train and test data, use 1000 entries for each set
train = tokenized_data['train'].select(range(1000))
test = tokenized_data['test'].select(range(1000))

In [3]:
from transformers import DataCollatorWithPadding, GPT2Config, GPT2ForSequenceClassification
## When training a transformer model,
# it’s common to batch sequences together for more efficient processing.
# However, since sequences might have different lengths, they need to be padded to a common length within each batch.
#The DataCollatorWithPadding class automates this process. 

#define the collator, use DataCollatorWithPadding() with the defined tokenizer above
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
config = GPT2Config()

#define GPT classifier, use 'gpt2' pretrained LLM, we have 3 classes in our dataset 

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
model.config.pad_token_id = model.config.eos_token_id 

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import evaluate
import numpy as np

# load the accuracy metric
metric = evaluate.load("accuracy")

def acc(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [5]:
from transformers import TrainingArguments, Trainer

# training arguments: https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/trainer#transformers.TrainingArguments
# set per_device_train_batch_size  per_device_eval_batch_size as 8,
# we will fine tune gpt model for 30 epochs , set the corresponding parameter
training_args = TrainingArguments(
    output_dir="test_trainer",
    #evaluation_strategy="epoch",
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=30, 
    save_total_limit=2    
    #gradient_accumulation_steps=4
    )

# set the data_collator and training arguments we defined above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator, 
    compute_metrics=acc,
    )

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
# if it outputs memory error: shut down all kernels or restart kernel
# train the model, use train() function
trainer.train()

Step,Training Loss
500,0.6269
1000,0.1044
1500,0.0081
2000,0.0027
2500,0.0015
3000,0.0016
3500,0.0012


TrainOutput(global_step=3750, training_loss=0.09952616239984831, metrics={'train_runtime': 419.4324, 'train_samples_per_second': 71.525, 'train_steps_per_second': 8.941, 'total_flos': 513256776597504.0, 'train_loss': 0.09952616239984831, 'epoch': 30.0})

In [7]:
import evaluate

#evaluate the fine tuned model on test data; use evaluate
trainer.evaluate()

{'eval_loss': 3.218705177307129,
 'eval_accuracy': 0.703,
 'eval_runtime': 3.8726,
 'eval_samples_per_second': 258.225,
 'eval_steps_per_second': 32.278,
 'epoch': 30.0}

In [8]:
from transformers import TextClassificationPipeline

#test with new data, use TextClassificationPipeline to prepare your input text

# This pipeline has a return_all_scores parameter on its __call__ method that allows you to get all scores for each label on a prediction.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
# pipe your text here
prediction = pipe("I find great pleasure in learning through the NLP courses offered by HLRS.", return_all_scores=True)
prediction



[[{'label': 'LABEL_0', 'score': 2.2688107392809798e-08},
  {'label': 'LABEL_1', 'score': 6.458856660174206e-05},
  {'label': 'LABEL_2', 'score': 0.9999353885650635}]]