In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset
from dotenv import load_dotenv
import os
import numpy as np
from pathlib import Path

# Location of the project's .env file, one directory above the working directory.
dotenv_path = Path("../.env").resolve()

# override=True: values from the .env file replace variables already set in the process.
load_dotenv(dotenv_path=dotenv_path, override=True)

# The CSV is read with header=None, so column names are supplied here.
columns = ['tweet_id', 'entity', 'sentiment', 'tweet_content']

base_path = os.getenv('BASE_DATASET_PATH')
if not base_path:
    # Fail fast with an actionable message instead of trying to open "None\...".
    raise RuntimeError(
        f"BASE_DATASET_PATH is not set (looked for it in the environment and in {dotenv_path})"
    )

# Build the path with pathlib so the separator is correct on every OS
# (a hard-coded backslash only resolves on Windows).
val_csv_path = Path(base_path) / 'twitter_validation.csv'
val_df = pd.read_csv(val_csv_path, names=columns, header=None)

# Only the sentiment text and the tweet body are needed for evaluation.
val_df = val_df.drop(columns=['tweet_id', 'entity'])

# Sentiment string -> integer class id.
labrel_map = {'neutral': 0, 'positive': 1, 'negative': 2}

# Normalise whitespace/case before the lookup; sentiments that are not keys of
# the map come back as NaN here.
val_df['label'] = val_df['sentiment'].str.strip().str.lower().map(labrel_map)
val_df = val_df.replace([np.inf, -np.inf], np.nan)
# NOTE: every unmapped or missing sentiment is assigned 0, i.e. the same id as
# 'neutral'. NOTE(review): presumably this mirrors the training-time
# preprocessing; confirm before changing it.
val_df['label'] = val_df['label'].fillna(0).astype(int)

val_ds = Dataset.from_pandas(val_df)

# Guard against missing labels. After fillna/astype(int) above the column holds
# integers, so this is not expected to remove any rows.
val_ds = val_ds.filter(lambda x: x['label'] is not None)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
import pandas as pd

# One directory is used as the source for the tokenizer, the classification
# model and the PEFT adapter.
model_dir = "./bert-base-uncased-twitter-sentiment-finetuned-model"

# Sequence-classification model with a 3-way head (one output per sentiment id).
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_dir,
    num_labels=3,
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Wrap the model with the adapter stored in the same directory.
model = PeftModel.from_pretrained(model=base_model, model_id=model_dir)

# Each tweet has exactly one label, so declare the single-label problem type.
model.config.problem_type = "single_label_classification"


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_function(examples):
    """Tokenize a batch of tweets into fixed-length inputs.

    Args:
        examples: Batch mapping with a 'tweet_content' entry that can be
            iterated to yield the tweets.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on the batch,
        with every sequence padded to, and truncated at, 128 tokens.
    """
    texts = []
    for tweet in examples['tweet_content']:
        # Missing tweets become empty strings; everything else is coerced to str.
        texts.append("" if tweet is None else str(tweet))
    return tokenizer(texts, max_length=128, truncation=True, padding="max_length")
 
# Tokenize in batches and drop the raw text column once it has been encoded.
# NOTE(review): val_ds is rebound here, so re-running this cell without
# re-running the loading cell presumably fails ('tweet_content' is gone).
val_ds = val_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=['tweet_content'],
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted F1.

    Args:
        eval_pred: Something that unpacks into ``(logits, labels)``.

    Returns:
        dict with keys "accuracy" and "f1"; the predicted class for each row
        is the argmax of its logits over the last axis.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "f1": f1_score(labels, predicted_ids, average='weighted'),
    }

# Trainer used for evaluation only; no TrainingArguments are passed, so the
# library defaults apply.
triner = Trainer(model=model, tokenizer=tokenizer, compute_metrics=compute_metrics)

# Run evaluation on the tokenized validation split and report the metrics dict.
eval_result = triner.evaluate(val_ds)
print(f"Validation results: {eval_result}")

  triner = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return forward_call(*args, **kwargs)


Validation results: {'eval_loss': 0.37805089354515076, 'eval_model_preparation_time': 0.0069, 'eval_accuracy': 0.849, 'eval_f1': 0.8495222428524132, 'eval_runtime': 17.1945, 'eval_samples_per_second': 58.158, 'eval_steps_per_second': 7.27}
