In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm


# Load the labelled headlines that the model is evaluated against.
df = pd.read_excel("/content/checking.xlsx")


# Integer ids for the gold sentiment labels.
# NOTE(review): these ids must line up with the class order of the model
# loaded below (see model.config.id2label) — confirm if the model changes.
label_map = {'neutral': 0, 'positive': 1, 'negative': 2}

# Normalise surrounding whitespace and case before mapping, so that values
# such as "Positive " are still recognised.
df['label_num'] = df['sentiment'].str.strip().str.lower().map(label_map)

# Fail fast on missing or unrecognised sentiments: they map to NaN, which
# would otherwise flow into the label tensors and corrupt the evaluation.
unmapped = df['label_num'].isna()
if unmapped.any():
    bad_values = df.loc[unmapped, 'sentiment'].unique().tolist()
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a sentiment outside "
        f"{list(label_map)}: {bad_values}"
    )

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


# Run on the GPU when one is available, and switch the model to eval mode
# because this script only performs inference.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


class HeadlineDataset(Dataset):
    """Map-style dataset pairing each headline with its integer label.

    Headlines are tokenized lazily, one per ``__getitem__`` call; the
    tokenizer is asked to truncate and pad every item to ``max_len``.
    """

    def __init__(self, headlines, labels, tokenizer, max_len=128):
        """Keep references to the texts, labels and tokenization settings."""
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.headlines = headlines
        self.labels = labels

    def __len__(self):
        """Return the number of headlines in the dataset."""
        return len(self.headlines)

    def __getitem__(self, idx):
        """Tokenize the headline at ``idx`` and return its tensors.

        Returns a dict with ``input_ids`` and ``attention_mask`` (dimension 0
        squeezed away) and ``labels`` as a ``torch.long`` tensor.
        """
        text = self.headlines[idx]
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer is called on a single text, so drop dimension 0 and
        # let the DataLoader add the batch dimension when collating.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


# Wrap the headlines and their integer labels for batched inference.
dataset = HeadlineDataset(
    df["headlines"].tolist(),
    df["label_num"].tolist(),
    tokenizer
)
loader = DataLoader(dataset, batch_size=16)


# Predicted and gold class ids, accumulated across all batches.
preds = []
true_labels = []

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    for batch in tqdm(loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Gold labels are only compared on the host, so they are not moved
        # to the device and back.
        labels = batch["labels"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit per row.
        predictions = torch.argmax(logits, dim=1)

        preds.extend(predictions.cpu().tolist())
        true_labels.extend(labels.tolist())


accuracy = accuracy_score(true_labels, preds)
print(f"\nAccuracy: {accuracy:.4f}\n")
print("Classification Report:")
# Pass the label ids explicitly so the report keeps one row per class, in
# label_map order, even when a class is absent from both the gold labels and
# the predictions (otherwise target_names would not match the class count).
print(
    classification_report(
        true_labels,
        preds,
        labels=list(label_map.values()),
        target_names=list(label_map.keys()),
    )
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

 11%|█         | 2/19 [00:00<00:05,  3.09it/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

100%|██████████| 19/19 [00:02<00:00,  7.17it/s]


Accuracy: 0.7133

Classification Report:
              precision    recall  f1-score   support

     neutral       0.72      0.58      0.64       100
    positive       0.66      0.76      0.71       101
    negative       0.76      0.80      0.78        92

    accuracy                           0.71       293
   macro avg       0.72      0.72      0.71       293
weighted avg       0.72      0.71      0.71       293




