# Imports

In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install evaluate
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from pathlib import Path
import shutil
import urllib
import tarfile
import sys

import re

from datasets import Dataset

from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BitsAndBytesConfig, pipeline, AutoModelForCausalLM

from torch.utils.data import DataLoader
import torch as th

import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, accuracy_score


import evaluate


from tqdm import tqdm

# 1. Downloading the dataset

First, we need to **download** the ```A2/data``` and **encode** ```a2_test.csv``` into a ```pandas.DatFrame``` object

In [3]:
class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)

In [4]:
def download_url(download_path: Path, url: str):
    with DownloadProgressBar(unit = 'B', unit_scale = True,
                             miniters = 1, desc = url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename = download_path, reporthook = t.update_to)

def download_dataset(download_path: Path, url: str):
    print("Downloading dataset...")
    download_url(url = url, download_path = download_path)
    print("Download complete!")

def load_csv_file(file_path):
    return pd.read_csv(file_path)

In [5]:
urls  = {
    "a2_test": "https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%202/data/a2_test.csv",
    "demonstrations": "https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%202/data/a2_test.csv"
}

print(f"Current work directory: {Path.cwd()}")
dataset_folder = Path.cwd().joinpath("Datasets")

if not dataset_folder.exists():
    dataset_folder.mkdir(parents=True)

for name, url in urls.items():
    download_path = dataset_folder.joinpath(f"{name}.csv")
    download_dataset(download_path, url)


Current work directory: /content
Downloading dataset...


a2_test.csv: 49.2kB [00:00, 132kB/s]                             


Download complete!
Downloading dataset...


a2_test.csv: 49.2kB [00:00, 192kB/s]                             

Download complete!





In [6]:
df_a2_test = load_csv_file("./Datasets/a2_test.csv")
df_a2_test.head()

Unnamed: 0,rewire_id,text,label_sexist
0,sexism2022_english-17133,The boys must be gaming because there goes the...,not sexist
1,sexism2022_english-14197,Look at those eyes. Either someone unexpectedl...,sexist
2,sexism2022_english-3018,Old man mogs everyone in this sub,not sexist
3,sexism2022_english-5301,"Excellent, I was just looking at another post ...",not sexist
4,sexism2022_english-17796,So you run back to daddy whenever you need hel...,sexist


In [7]:
df_demonstration = load_csv_file("./Datasets/demonstrations.csv")
df_demonstration.head()

Unnamed: 0,rewire_id,text,label_sexist
0,sexism2022_english-17133,The boys must be gaming because there goes the...,not sexist
1,sexism2022_english-14197,Look at those eyes. Either someone unexpectedl...,sexist
2,sexism2022_english-3018,Old man mogs everyone in this sub,not sexist
3,sexism2022_english-5301,"Excellent, I was just looking at another post ...",not sexist
4,sexism2022_english-17796,So you run back to daddy whenever you need hel...,sexist


# TASK 1 Model setup

In [8]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `NLPAssignment2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `NLPAssignment2`


In [9]:
# We can add also more open-source models
model_card_dict = {
    "MistralV2": "mistralai/Mistral-7B-Instruct-v0.2",
    "MistralV3": "mistralai/Mistral-7B-Instruct-v0.3",
    "LlamaV3.1": "meta-llama/Llama-3.1-8B-Instruct",
    "Phi3-mini": "microsoft/Phi-3.5-mini-instruct"
}

## Phi3-mini Instruct-based model

We performe some preprocessing

In [10]:
tokenizer_first_model = AutoTokenizer.from_pretrained(model_card_dict['Phi3-mini'])

tokenizer_first_model.pad_token = tokenizer_first_model.eos_token

terminators = [
    tokenizer_first_model.eos_token_id,
    tokenizer_first_model.convert_tokens_to_ids("<|eot_id|>")
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype = th.bfloat16,
)

model_first = AutoModelForCausalLM.from_pretrained(
    model_card_dict['Phi3-mini'],
    return_dict=True,
    quantization_config=bnb_config,
    device_map='auto'
)

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [13]:
generation_config = model_first.generation_config
generation_config.max_new_tokens = 20 #100
generation_config.eos_token_id = tokenizer_first_model.eos_token_id
generation_config.pad_token_id = tokenizer_first_model.eos_token_id
generation_config.temperature = None
generation_config.num_return_sequences = 1

print(model_first)

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3SdpaAttention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): Phi

## Mistral-7B-Instruct-v0.3

In [14]:
tokenizer_second_model = AutoTokenizer.from_pretrained(model_card_dict['MistralV3'])
tokenizer_second_model.tokenizer_second_model = tokenizer_second_model.eos_token

terminators = [
    tokenizer_second_model.eos_token_id,
    tokenizer_second_model.convert_tokens_to_ids("<|eot_id|>")
]


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [15]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.bfloat16,
)

model_second = AutoModelForCausalLM.from_pretrained(
    model_card_dict['MistralV3'],
    return_dict=True,
    quantization_config=bnb_config,
    device_map='auto'
)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [16]:
generation_config = model_second.generation_config
generation_config.max_new_tokens = 20 #100
generation_config.eos_token_id = tokenizer_second_model.eos_token_id
generation_config.pad_token_id = tokenizer_second_model.eos_token_id
generation_config.temperature = None
generation_config.num_return_sequences = 1

print(model_second)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNo

# TASK 2 Prompt setup


## Prompt Template

Use the following prompt template to process input texts.

In [22]:
prompt = [
    {
        'role': 'system',
        'content': 'You are an annotator for sexism detection.'
    },
    {
        'role': 'user',
        'content': """Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        {text}

        ANSWER:
        """
    }
]

In [23]:
print(f"The original prompt is: {prompt}")
print("\n\n\n\n")
prompt = tokenizer_first_model.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
print(f"The formatted prompt is: {prompt}")

The original prompt is: [{'role': 'system', 'content': 'You are an annotator for sexism detection.'}, {'role': 'user', 'content': 'Your task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        {text}\n\n        ANSWER:\n        '}]





The formatted prompt is: <|system|>
You are an annotator for sexism detection.<|end|>
<|user|>
Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        {text}

        ANSWER:
        <|end|>
<|assistant|>



In [24]:
    ##codice da eliminare, utile momentanemente per vedere quanto sono lunghe
lengths = df_a2_test['text'].str.split().str.len()

max_length = lengths.max()

print(f"Max number of words among all tweets: {max_length}")

Max number of words among all tweets: 58


## Prompts Preparation

In [25]:
def prepare_prompts(texts, prompt_template, tokenizer):
  """
    This function format input text samples into instructions prompts.

    Inputs:
      texts: input texts to classify via prompting
      prompt_template: the prompt template provided in this assignment
      tokenizer: the transformers Tokenizer object instance associated with the chosen model card

    Outputs:
      input texts to classify in the form of instruction prompts
  """
  instruction_prompts = []

  # for each text
  for text in texts:
      # we create the formatted prompt
      formatted_prompt = prompt_template.format(text=text)

      # Tokenizza il prompt
      tokenized_prompt = tokenizer(
          formatted_prompt,
          return_tensors = "pt",  # Output come tensori PyTorch
          padding = True,         # Padding per batch processing
          truncation = True,      # Troncamento in base alla lunghezza massima
          max_length=500        # Imposta una lunghezza massima ragionevole
      ).to('cuda')

      instruction_prompts.append(tokenized_prompt)

  return instruction_prompts

In [27]:
              ###DA eliminare solo di test
tokenizer = tokenizer_first_model

# Esempio di testi di input
example_texts = [
    "Schedule a date with her, then don't show up.",
    "That’s completely ridiculous a woman flashing her boobs is not sexual assault in the slightest."
]

# Caricamento del tokenizer (esempio con Mistral)

# Generazione dei prompt
prepared_prompts = prepare_prompts(df_a2_test['text'], prompt, tokenizer)
#prepared_prompts = prepare_prompts(example_texts, prompt, tokenizer)

# Visualizzazione di un prompt tokenizzato
print(prepared_prompts[0])


{'input_ids': tensor([[32006,   887,   526,   385,  9732,  1061,   363,  7916,  1608, 15326,
         29889, 32007, 32010,  3575,  3414,   338,   304,   770,  1598,  1881,
          1426,   408,  6943,  7916,  1608,   470,   451, 29889,  2538,  2818,
           871, 22483,   470, 11698, 29889,    13,    13,  4706,   323, 12194,
         29901,    13,  4706,   450, 12544,  1818,   367,   330, 11500,  1363,
           727,  5771,   278,   281,  6832, 29889,    13,    13,  4706,   319,
          3059, 29956,  1001, 29901,    13,   308, 32007, 32001]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


# TASK 3 Inference


## Generating Responses
da fare: vorrei creare dei batch come nel tutorial 3

In [56]:
def generate_responses(model, prompt_examples):
    """
      This function implements the inference loop for a LLM model.
      Given a set of examples, the model is tasked to generate a response.

      Inputs:
        model: LLM model instance for prompting
        prompt_examples: pre-processed text samples

      Outputs:
        generated responses
    """
    responses = []
    for prompt in prompt_examples:

        generated = model.generate(
            input_ids = prompt['input_ids'].to(model.device),
            attention_mask = prompt['attention_mask'].to(model.device),
            generation_config = generation_config,
            do_sample = False
        )

        # Decodifica la risposta generata
        # generated = tokenizer.batch_decode(generated, skip_special_tokens=True)
        # print(generated[0])

        responses.append(generated[0])

    return responses


IL modello sembra faticare a mantenere come unica risposta YES e NO

In [57]:
res = generate_responses(model_first, prepared_prompts[:3])


for r in res:
    decoded_response = tokenizer_first_model.decode(r, skip_special_tokens=True)
    print(decoded_response)

You are an annotator for sexism detection. Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        The boys must be gaming because there goes the wifi.

        ANSWER:
         NO Your task is to analyze the given text for any indications of sexism, including
You are an annotator for sexism detection. Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        Look at those eyes. Either someone unexpectedly just fisted her up the dumper, or that bitch is certifiably a nutburger.

        ANSWER:
         YES 
 In a classroom, there are 4 groups of students working on
You are an annotator for sexism detection. Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        Old man mogs everyone in this sub

        ANSWER:
         YES Your task is to analyze the given text for subtle forms of sexism, including


## Processing Responses

In [58]:
def extract_response(response):
    match = [m for m in re.finditer('ANSWER:', response)][-1]
    parsed = response[match.end():].strip()
    return parsed

In [63]:
def convert_response(response):
    return 1 if 'YES' in response else 0

In [64]:
def proccess_response(response):
    """
    This function takes a textual response generated by the LLM
    and processes it to map the response to a binary label.

    Inputs:
      response: generated response from LLM

    Outputs:
      parsed binary response: return 1 if YES and 0 if NO
    """

    parsed_response = extract_response(response)
    binary_parsed_response = convert_response(parsed_response)
    return binary_parsed_response

In [65]:
for r in res:
    decoded_response = tokenizer_first_model.decode(r, skip_special_tokens=True)
    print(decoded_response)
    print(proccess_response(decoded_response))

You are an annotator for sexism detection. Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        The boys must be gaming because there goes the wifi.

        ANSWER:
         NO Your task is to analyze the given text for any indications of sexism, including
0
You are an annotator for sexism detection. Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        Look at those eyes. Either someone unexpectedly just fisted her up the dumper, or that bitch is certifiably a nutburger.

        ANSWER:
         YES 
 In a classroom, there are 4 groups of students working on
1
You are an annotator for sexism detection. Your task is to classify input text as containing sexism or not. Respond only YES or NO.

        TEXT:
        Old man mogs everyone in this sub

        ANSWER:
         YES Your task is to analyze the given text for subtle forms of sexism, including
1


## temponaneo
si può vedere nel dettaglio alcuni risultati, in cui si mostra che la predizione di YES è molto più frequente

In [None]:
def extract_response(response):
    match = [m for m in re.finditer('ANSWER:', response)][-1]
    parsed = response[match.end():].strip()
    return parsed

In [None]:
# Generazione delle risposte
res = generate_responses(model, prepared_prompts[:20])
responses=[]
binary_labels=[]
raw_response = [extract_response(item) for item in res]
responses.extend(raw_response)
batch_predictions = [process_response(item) for item in raw_response]
binary_labels.extend(batch_predictions)
# Elaborazione delle risposte

# Mostra i risultati
for prompt, response, label in zip(prepared_prompts[:10], responses[:10], binary_labels[:10]):
    print(f"Prompt:\n{tokenizer.decode(prompt['input_ids'][0])}")
   # print(f"Response: {response}")
    print(f"Label: {label}\n")


['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        The boys must be gaming because there goes the wifi.\n\n        ANSWER:\n         YES']
['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        Look at those eyes. Either someone unexpectedly just fisted her up the dumper, or that bitch is certifiably a nutburger.\n\n        ANSWER:\n         YES']
['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        Old man mogs everyone in this sub\n\n        ANSWER:\n         YES']
['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        Excellent, I was just looking at an

In [None]:
print(f"predicted lable 0:{binary_labels.count(0)}, predicted lable 1:{binary_labels.count(1)}")
print(f"lable 0:{sum(df_a2_test['label_sexist']=='not sexist')}, lable 1:{sum(df_a2_test['label_sexist']=='sexist')}")

predicted lable 0:2, predicted lable 1:18
lable 0:150, lable 1:150


#TASK 4: Metrics


In [None]:

def compute_metrics(responses, y_true):
  """
    This function takes predicted and ground-truth labels and compute metrics.
    In particular, this function compute accuracy and fail-ratio metrics.
    This function internally invokes `process_response` to compute metrics.

    Inputs:
      responses: generated LLM responses
      y_true: ground-truth binary labels

    Outputs:
      dictionary containing desired metrics
  """


  acc_metric = evaluate.load('accuracy')
  predictions=[]
  failed_responses=0
  for response in responses:
        try:
            # Processa la risposta per ottenere l'etichetta binaria
            pred=extract_response(response)
            pred = process_response(pred)
            predictions.append(pred)
        except ValueError:
            # Incrementa il contatore delle risposte fallite se il formato non è valido
            failed_responses += 1
            predictions.append(0)
            pass


  accuracy = acc_metric.compute(predictions=predictions, references=y_true)
  fail_ratio = failed_responses / len(responses)

  return {
        'accuracy': accuracy['accuracy'],
        'fail_ratio': fail_ratio
    }



In [None]:
res = generate_responses(model, prepared_prompts) #inpiega circa 10 minuti


['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        The boys must be gaming because there goes the wifi.\n\n        ANSWER:\n         YES']
['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        Look at those eyes. Either someone unexpectedly just fisted her up the dumper, or that bitch is certifiably a nutburger.\n\n        ANSWER:\n         YES']
['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        Old man mogs everyone in this sub\n\n        ANSWER:\n         YES']
['You are an annotator for sexism detection.\n\nYour task is to classify input text as containing sexism or not. Respond only YES or NO.\n\n        TEXT:\n        Excellent, I was just looking at an

In [None]:
y_true=df_a2_test['label_sexist'].map({'not sexist': 0, 'sexist': 1})
compute_metrics(res,y_true )

{'accuracy': 0.59, 'fail_ratio': 0.0}

### commento: nei risultati ottenuti si riscontra una maggiore predizione del YES. quindi secondo il modello molti messaggi sono razzisti anche se in realtà non lo sono.