In [2]:
import transformers
import datasets
from transformers.pipelines.pt_utils import KeyDataset
from pathlib import Path
from src.model.train import compute_metrics
import numpy as np
import json
import collections 

# This is a mock training process

A pre-trained model is downloaded from the Hugging Face Model Hub
and then saved, as-is (without any fine-tuning), to the `models/` directory.

This model is then used to generate predictions on the test data.
The predictions are used to evaluate the model's performance.
The resulting metrics are saved as `test_results.json` in the same directory as the saved model.

In [3]:
# "Mock training": fetch the pre-trained checkpoint from the hub and store it
# unchanged under <project root>/models/<model_name>.
# The project root is taken to be the parent of the current working directory.
work_dir = Path.cwd().parent
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model_dir = work_dir.joinpath("models", model_name)

# The same hub id is used for both the model weights and the tokenizer.
repo_id = model_name
model = transformers.pipeline(
    "text-classification",
    model=repo_id,
    tokenizer=repo_id,
)

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model.model, model.tokenizer):
    artifact.save_pretrained(model_dir)

print(f"Saved model and tokenizer to {model_dir}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Saved model and tokenizer to /Users/shestaka/tweet_sentiment_classification/models/cardiffnlp/twitter-roberta-base-sentiment-latest


In [4]:
# Evaluate the pipeline on the processed test split and persist the metrics
# next to the saved model as test_results.json.
dataset = datasets.load_from_disk(work_dir / "data/processed/test")
class_label = dataset.features['label']

# Batched inference over the "text" column; collect every yielded prediction.
test_results = list(model(KeyDataset(dataset, "text"), batch_size=8))

# Put predictions side by side with the original rows. The pipeline's "label"
# is renamed first so it does not clash with the dataset's own "label" column.
results = datasets.Dataset.from_list(test_results).rename_column("label", "pred_label")
results = datasets.concatenate_datasets(
    [dataset, results], axis=1
)

# Predicted label names are mapped to the dataset's integer class ids.
predictions = np.asarray(class_label.str2int(results["pred_label"]))
label_ids = results["label"]

eval_predictions = transformers.EvalPrediction(predictions=predictions, label_ids=label_ids)
metrics = compute_metrics(eval_predictions)

# Re-key the metrics: prefix each metric name with "test_" and replace
# per-class keys of the form "label_<id>" with the class name.
renamed_metrics = collections.defaultdict(dict)
prefix = "test_"
for metric, values in metrics.items():
    for key, val in values.items():
        if key.startswith("label_"):
            label_id = int(key.split("_")[1])
            label = class_label.int2str(label_id)
            renamed_metrics[f"{prefix}{metric}"][label] = val
        else:
            renamed_metrics[f"{prefix}{metric}"][key] = val
renamed_metrics = dict(renamed_metrics)
print(renamed_metrics)


def _json_safe(value):
    """Return None for non-finite floats (NaN/inf); any other value is returned unchanged."""
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


# Some entries can be NaN (e.g. the averaged "support" values). json.dump would
# emit them as bare NaN tokens, which are not valid JSON, so they are written
# as null instead. Only the serialised copy is altered; renamed_metrics is not.
json_safe_metrics = {
    name: {key: _json_safe(val) for key, val in group.items()}
    for name, group in renamed_metrics.items()
}

with open(model_dir / "test_results.json", "w") as f:
    # allow_nan=False makes any remaining non-finite value fail loudly
    # instead of silently producing an invalid JSON file.
    json.dump(json_safe_metrics, f, indent=4, allow_nan=False)

{'test_precision': {'negative': 0.6198347107438017, 'neutral': 0.775, 'positive': 0.6859756097560976, 'micro_avg': 0.6695437731196054, 'macro_avg': 0.693603440166633, 'weighted_avg': 0.6958899452722606}, 'test_recall': {'negative': 0.872093023255814, 'neutral': 0.32978723404255317, 'positive': 0.8302583025830258, 'micro_avg': 0.6695437731196054, 'macro_avg': 0.6773795199604643, 'weighted_avg': 0.6695437731196054}, 'test_f1_score': {'negative': 0.7246376811594203, 'neutral': 0.4626865671641791, 'positive': 0.7512520868113522, 'micro_avg': 0.6695437731196054, 'macro_avg': 0.6461921117116506, 'weighted_avg': 0.6424456833604258}, 'test_support': {'negative': 258.0, 'neutral': 282.0, 'positive': 271.0, 'micro_avg': nan, 'macro_avg': nan, 'weighted_avg': nan}}
