In [1]:
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
import torch
from datasets import load_dataset

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
# --- Dataset -----------------------------------------------------------------
# Hub identifier of the dataset and the local directory it is cached in.
data_name = "mteb/tweet_sentiment_extraction"
data_path = "./data"

# --- Model -------------------------------------------------------------------
# Local checkpoint directory of the model to load.
# Alternative (larger) checkpoint: "./pretrained_llms/Llama-3.3-70B-Instruct"
model_path = "./pretrained_llms/Llama-3.1-8B-Instruct"

# Cache directory handed to the tokenizer loader.
cache_dir = "./cache"


In [3]:
# Load the checkpoint with a 3-way sequence-classification head.
# NOTE: the base checkpoint ships no classification head, so `score.weight` is
# freshly (randomly) initialised -- see the loader warning in this cell's
# output. Predictions are not meaningful until the head has been fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
    torch_dtype="auto",
)

# NOTE(review): `add_eos_token=True` does not seem to take effect for this
# tokenizer -- the tokenized sample printed further down in this notebook has
# no EOS id appended to the unpadded sequence. Kept as-is; confirm whether an
# explicit EOS is actually required.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    add_eos_token=True,
    cache_dir=cache_dir,
)

# The tokenizer may come without a pad token; batching needs one, so fall back
# to the EOS token and pad on the right.
if tokenizer.pad_token_id is None:
    print("No pad token found, setting pad token to eos token")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "right"

# Reusing EOS as the pad token adds no new vocabulary entry, so the embedding
# matrix only has to be resized when the tokenizer really is larger than it.
# (Never shrink: that would drop rows from the pretrained embedding matrix.)
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))

# Always mirror the pad id into the model config, not only when the pad token
# had to be created above: the sequence-classification head uses
# `config.pad_token_id` to find the last real token and cannot handle
# batch sizes > 1 while it is unset.
model.config.pad_token_id = tokenizer.pad_token_id

# The extra keyword arguments (padding / truncation / max_length) are passed
# to the pipeline constructor alongside the model, tokenizer and device.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    padding=True,
    truncation=True,
    max_length=512,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./pretrained_llms/Llama-3.1-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda


No pad token found, setting pad token to eos token


In [4]:
# Load only a small slice of the train split ("train[10:20]") for a quick
# smoke test; downloaded files are cached under `data_path`.
dataset = load_dataset(
    data_name,
    split="train[10:20]",
    cache_dir=data_path,
)

In [5]:
# Expose just the "text" column so the pipeline can iterate over the dataset
# in batches of 8, then print each prediction as it is yielded.
tweet_texts = KeyDataset(dataset, "text")
predictions = classifier(tweet_texts, batch_size=8)
for out in predictions:
    print(out)

{'label': 'LABEL_1', 'score': 0.5922744274139404}
{'label': 'LABEL_1', 'score': 0.7167643308639526}
{'label': 'LABEL_1', 'score': 0.9328629970550537}
{'label': 'LABEL_1', 'score': 0.628308117389679}
{'label': 'LABEL_2', 'score': 0.9498377442359924}
{'label': 'LABEL_0', 'score': 0.8750986456871033}
{'label': 'LABEL_1', 'score': 0.9951352477073669}
{'label': 'LABEL_1', 'score': 0.9967533946037292}
{'label': 'LABEL_1', 'score': 0.9617916345596313}
{'label': 'LABEL_2', 'score': 0.6502686738967896}


In [8]:
# Sanity check: tokenize two sentences of different length to inspect how the
# pipeline's tokenizer pads the shorter one, and show which token it pads with.
sample_texts = ["Example text", "I am a boy"]
tokens = classifier.tokenizer(sample_texts, truncation=True, padding=True)
print(tokens, classifier.tokenizer.pad_token, sep="\n")

{'input_ids': [[128000, 13617, 1495, 128009, 128009], [128000, 40, 1097, 264, 8334]], 'attention_mask': [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]}
<|eot_id|>
