## Test models

In [None]:
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hub repository that provides both the tokenizer and the fine-tuned weights
model_name = "tupi-bert-large-portuguese-cased-multiclass-multilabel"
model_tokenizer_path = "FpOliveira/" + model_name

# Input text column and the category columns used as multilabel targets
x_name = "text"
label_columns = [
    'ageism',
    'aporophobia',
    'body_shame',
    'capacitism',
    'lgbtphobia',
    'political',
    'racism',
    'religious_intolerance',
    'misogyny',
    'xenophobia',
    'other',
]


# Load the TuPi hierarchy dataset (one column per category; assumed to be
# numeric indicator columns -- TODO confirm against the CSV schema)
df = pd.read_csv("https://raw.githubusercontent.com/Silly-Machine/TuPi-Portuguese-Hate-Speech-Dataset/main/datasets/tupi_hierarchy.csv")

# A sample is flagged `not_hate` when the sum over all category columns is 0.
# Vectorised row sum instead of a per-row Python lambda.
df['not_hate'] = (df[label_columns].sum(axis=1) == 0).astype('int64')
label_columns.append('not_hate')
df = df[[x_name] + label_columns]

# NOTE: this exact partition (test_size=0.2, random_state=42, stratified on
# `not_hate`) is required to reproduce the split used by the original
# training and testing code -- do not change it.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df[x_name],
    df[label_columns],
    test_size=0.2,
    random_state=42,
    stratify=df['not_hate'],
)

# Held-out frame: the text column followed by the label columns, keeping the
# row index produced by the split
test_dataset = val_labels.copy()
test_dataset.insert(0, x_name, val_texts)

# Load the tokenizer published in the same Hub repository as the model
tokenizer = BertTokenizer.from_pretrained(model_tokenizer_path)

# Load the fine-tuned classifier, move it to the device and switch it to
# evaluation mode
loaded_model = BertForSequenceClassification.from_pretrained(model_tokenizer_path).to(device)
loaded_model.eval()

# Tokenize the test dataset.
# `truncation=True` alone is ignored when the tokenizer has no predefined
# maximum length (Transformers warns "Default to no truncation"), so cap the
# sequences explicitly at the model's positional-embedding limit.
max_seq_length = loaded_model.config.max_position_embeddings
test_encodings = tokenizer(
    list(test_dataset[x_name]),
    truncation=True,
    max_length=max_seq_length,
    padding=True,
    return_tensors='pt',
).to(device)
test_labels = torch.tensor(test_dataset[label_columns].values).float().to(device)

# Create PyTorch dataset
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name (for example ``input_ids``) to an
            indexable holding one entry per sample (tensor or nested list).
        labels: Indexable holding one label row per sample; its length is the
            dataset size.
        device: Optional device every returned tensor is moved to. With the
            default ``None`` items stay on the device they are stored on, so
            inputs that were already moved are served without extra work.
    """

    def __init__(self, encodings, labels, device=None):
        self.encodings = encodings
        self.labels = labels
        self.device = device

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, labels under ``'labels'``."""
        # torch.as_tensor reuses an existing tensor and converts plain lists;
        # torch.tensor(tensor) would copy each item and emit a UserWarning.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        if self.device is not None:
            item = {key: val.to(self.device) for key, val in item.items()}
        return item

    def __len__(self):
        """Return the number of samples (one per label row)."""
        return len(self.labels)

# Wrap the encoded test split so it can be served in mini-batches
test_dataset = CustomDataset(test_encodings, test_labels)

# Keep the original sample order (no shuffling) so predictions line up with
# the rows of `test_labels`
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Collect one prediction row per sample; gradients are not needed here
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        logits = loaded_model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # Multilabel decision: each label is thresholded independently at 0.5
        # on its sigmoid probability
        batch_preds = (torch.sigmoid(logits) > 0.5).float()
        test_preds.extend(batch_preds.cpu().numpy())

# Bring ground truth and predictions to NumPy for scikit-learn
# (2-D arrays: one row per sample, one column per label)
array_test_labels = test_labels.cpu().numpy()
array_test_preds = np.array(test_preds)

# Derive the number of classes from the label matrix and pick the matching
# label names for the reports
num_classes = array_test_labels.shape[1]
report_names = label_columns[:num_classes]

# Overall scores; precision/recall/F1 use weighted averaging, and undefined
# ratios are reported as 1
weighted_kwargs = {'average': 'weighted', 'zero_division': 1}
test_accuracy = accuracy_score(array_test_labels, array_test_preds)
test_precision = precision_score(array_test_labels, array_test_preds, **weighted_kwargs)
test_recall = recall_score(array_test_labels, array_test_preds, **weighted_kwargs)
test_f1 = f1_score(array_test_labels, array_test_preds, **weighted_kwargs)

# Per-label confusion matrices and the textual per-label report
multilabel_cm = multilabel_confusion_matrix(array_test_labels, array_test_preds)
classification_rep = classification_report(
    array_test_labels,
    array_test_preds,
    target_names=report_names,
    zero_division=1,
)

# Persist the headline metrics as a two-column table
metric_values = {
    'Accuracy': test_accuracy,
    'Precision': test_precision,
    'Recall': test_recall,
    'F1 Score': test_f1,
}
results_df = pd.DataFrame({
    'Metric': list(metric_values),
    'Value': list(metric_values.values()),
})
results_df.to_csv(f'{model_name}_multilabel_evaluation.csv', index=False)

# Echo the headline metrics
print('\nTest Metrics:')
print(f'Accuracy: {test_accuracy}')
print(f'Precision: {test_precision}')
print(f'Recall: {test_recall}')
print(f'F1 Score: {test_f1}')

# Echo the per-label report
print('\nClassification Report:')
print(classification_rep)

# Same report as a dict, turned into a table with one row per label/average
report_dict = classification_report(
    array_test_labels,
    array_test_preds,
    target_names=report_names,
    zero_division=1,
    output_dict=True,
)
classification_report_df = pd.DataFrame(report_dict).transpose()

# Save the per-label report next to the headline metrics
classification_report_df.to_csv(f'{model_name}_classification_report.csv')

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx]).to(device)



Test Metrics:
Accuracy: 0.8476070528967254
Precision: 0.8535718265186492
Recall: 0.8499724214009928
F1 Score: 0.8494069136225887

Classification Report:
                       precision    recall  f1-score   support

               ageism       0.40      0.13      0.20        15
          aporophobia       0.75      0.19      0.30        16
           body_shame       0.78      0.65      0.71        54
           capacitism       0.50      0.15      0.23        20
           lgbtphobia       0.78      0.75      0.76       171
            political       0.61      0.53      0.57       220
               racism       0.39      0.42      0.40        62
religious_intolerance       0.27      0.16      0.20        19
             misogyny       0.67      0.63      0.65       324
           xenophobia       0.39      0.22      0.28        78
                other       0.62      0.46      0.53       909
             not_hate       0.91      0.94      0.93      7177

            micro avg    