### Lnr Project Task 1.3 Bert

Niklas Dahlbom, ndahlbom@kth.se, ndahlbo@upv.edu.es

### Imports

In [57]:
import pandas as pd
from readerEXIST2025 import EXISTReader
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import os
import re
import random
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from transformers import  AutoTokenizer, AutoModelForSequenceClassification,  Trainer, TrainingArguments,  EarlyStoppingCallback
import tempfile
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.reports.reports import PyEvALLReport
from pyevall.utils.utils import PyEvALLUtils
from sklearn.preprocessing import MultiLabelBinarizer
import json


### Read datasets

In [58]:
reader_train = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/training/EXIST2025_training.json")
reader_dev = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/dev/EXIST2025_dev.json")
reader_test = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json")

EnTrainTask3, EnDevTask3 = reader_train.get(lang="EN", subtask="3"), reader_dev.get(lang="EN", subtask="3")
SpTrainTask3, SpDevTask3 = reader_train.get(lang="ES", subtask="3"), reader_dev.get(lang="ES", subtask="3")

SpTestTask3, EnTestTask3 = reader_test.get(lang="ES", subtask="3", include_ambiguous=True),  reader_test.get(lang="EN", subtask="3", include_ambiguous=True)

print(EnTrainTask3[1])
print("-------------------")

1       Writing a uni essay in my local pub with a cof...
2       @UniversalORL it is 2021 not 1921. I dont appr...
5       According to a customer I have plenty of time ...
6       So only 'blokes' drink beer? Sorry, but if you...
7       New to the shelves this week - looking forward...
                              ...                        
3255    idk why y’all bitches think having half your a...
3256    This has been a part of an experiment with @Wo...
3257    "Take me already" "Not yet. You gotta be ready...
3258    @clintneedcoffee why do you look like a whore?...
3259    ik when mandy says “you look like a whore” i l...
Name: text, Length: 2095, dtype: object
-------------------


### Preprocessing

In [59]:
def clean_text(text_list):
    cleaned_corpus = []
    for text in text_list:
        text = text.lower()
        text = re.sub(r"https?://\S+", "", text)  # Removes URLs
        text = re.sub(r"@\w+", "", text)          # Removes mentions
        text = text.replace("#", "")              # Removes Hashtags
        text = re.sub(r"\s+", " ", text).strip()   # Removes spaces
        cleaned_corpus.append(text)
    return cleaned_corpus

### Set Seed

In [60]:
def set_seed(seed=2025):
    # Random seed
    random.seed(seed)
    # Numpy seed
    np.random.seed(seed)
    # Torch seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    # os seed
    os.environ['PYTHONHASHSEED'] = str(seed)


### Dataset class

In [61]:
class SexismDatasetMulti(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True, rt='pt'):
        # if texts is list, just assign it; if numpy/pandas, convert to list
        if isinstance(texts, list):
            self.texts = texts
        else:
            self.texts = texts.tolist()
            
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt
        )
        
        item = {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'id': self.ids[idx]
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)

        return item


### ICM Wrapper

In [62]:
def ICMWrapper(pred, labels, multi=False,ids=None):
    test = PyEvALLEvaluation()
    metrics=[MetricFactory.ICM.value]
    params= dict()
    fillLabel=None
    if multi:
        params[PyEvALLUtils.PARAM_REPORT]="embedded"
        hierarchy={"True":['IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE'],
        "False":[]}
        params[PyEvALLUtils.PARAM_HIERARCHY]=hierarchy
        fillLabel = lambda x: ["False"] if len(x)== 0 else x
    else:
        params[PyEvALLUtils.PARAM_REPORT]="simple"
        fillLabel = lambda x: str(x)


    truth_name, predict_name=None, None
    if ids is None:
        ids=list(range(len(labels)))

    with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as truth:
        truth_name=truth.name
        truth_df=pd.DataFrame({'test_case': ['EXIST2025']*len(labels),
                        'id': [str(x) for x in ids],
                        'value': [fillLabel(x) for x in labels]})
        if multi==True:
            truth_df=truth_df.astype('object')
        truth.write(truth_df.to_json(orient="records"))

    with  tempfile.NamedTemporaryFile(mode='w', delete=False) as predict:
        predict_name=predict.name
        predict_df=pd.DataFrame({'test_case': ['EXIST2025']*len(pred),
                        'id': [str(x) for x in ids],
                        'value': [fillLabel(x) for x in pred]})
        if multi==True:
            predict_df=predict_df.astype('object')
        predict.write(predict_df.to_json(orient="records"))

    report = test.evaluate(predict_name, truth_name, metrics, **params)
    os.unlink(truth_name)
    os.unlink(predict_name)

    icm = None
    if 'metrics' in report.report:
        if 'ICM' in report.report["metrics"]: icm=float(report.report["metrics"]['ICM']["results"]["average_per_test_case"])
    return icm

### Metrics

In [67]:
def compute_metrics_3(pred, lencoder):
    labels = pred.label_ids
    #preds = pred.predictions.argmax(-1)
    preds = torch.sigmoid(torch.tensor(pred.predictions)).numpy()
    preds_binary = (preds >= 0.5).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds_binary, average=None, zero_division=0
    )
    acc = accuracy_score(labels, preds_binary)
    icm= ICMWrapper(lencoder.inverse_transform(preds_binary), lencoder.inverse_transform(labels), multi=True)
    # Macro averages
    precision_macro = np.mean(precision)
    recall_macro = np.mean(recall)
    f1_macro = np.mean(f1)
    metrics = {}
    metrics.update({
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'ICM': icm
    })
    return metrics

### Pipeline

In [64]:
def sexism_classification_pipeline_task3(trainInfo, devInfo, testInfo=None, model_name='bert-base-uncased', nlabels=5, ptype="multi_label_classification", **args):
    # Model and Tokenizer
    labelEnc= MultiLabelBinarizer()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype,
        ignore_mismatched_sizes=True)

    # Prepare datasets
    train_dataset = SexismDatasetMulti(trainInfo[1], labelEnc.fit_transform(trainInfo[2]),[int(x) for x in trainInfo[0]], tokenizer )
    val_dataset = SexismDatasetMulti(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir= args.get('output_dir', './results'),
        num_train_epochs= args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay',0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy','epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model',"ICM")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        #compute_metrics=compute_metrics_3,
        compute_metrics = partial(compute_metrics_3, lencoder=labelEnc),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience",10))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    if testInfo is not None:
      # Prepare test dataset for prediction
      test_dataset = SexismDatasetMulti(testInfo[1], [[0,0,0,0,0]] * len(testInfo[1]),  [int(x) for x in testInfo[0]],   tokenizer)

      # Predict test set labels
      predictions = trainer.predict(test_dataset)
      #predicted_labels = np.argmax(predictions.predictions, axis=1)
      predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
      predicted_labels = (predicted_probs >= 0.5).astype(int)

      # Create submission DataFrame
      submission_df = pd.DataFrame({
          'id': testInfo[0],
          'label': labelEnc.inverse_transform(predicted_labels),
          "test_case": ["EXIST2025"]*len(predicted_labels)

      })
      submission_df.to_csv('sexism_predictions_task3.csv', index=False)
      print("Prediction TASK3 completed. Results saved to sexism_predictions_task3.csv")
      return model, submission_df
    return model, eval_results

### Training and Evaluation

In [65]:
set_seed(23)

In [66]:
model = "bert-base-uncased"

params = {
    "num_train_epochs": 5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "learning_rate": 1e-5,
    "early_stopping_patience": 5,
    "output_dir": "./bert_results"
}
model_bert_en, results_bert_en = sexism_classification_pipeline_task3(
    EnTrainTask3,
    EnDevTask3,
    testInfo=None,
    model_name=model,
    **params
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6724,0.664214,0.479039,0.749451,0.584449,-0.129416
2,0.6451,0.64688,0.509034,0.596494,0.519975,-0.452479
3,0.6264,0.616054,0.729256,0.583306,0.595817,-0.785745
4,0.5753,0.580942,0.695196,0.7627,0.72218,0.136201
5,0.5178,0.581358,0.723532,0.667992,0.687315,-0.526377


2025-05-23 18:22:08,121 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:22:08,149 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:23:43,137 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:23:43,163 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:25:19,479 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:25:19,508 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:26:57,283 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:26:57,320 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:28:36,764 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:28:36,787 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:28:43,245 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:28:43,271 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.5809418559074402, 'eval_precision_macro': 0.6951963388164534, 'eval_recall_macro': 0.7626999216170279, 'eval_f1_macro': 0.7221801910470803, 'eval_ICM': 0.13620083876419975, 'eval_runtime': 4.4487, 'eval_samples_per_second': 75.978, 'eval_steps_per_second': 1.349, 'epoch': 5.0}


Validation Results: {'eval_loss': 0.5809418559074402, 'eval_precision_macro': 0.6951963388164534, 'eval_recall_macro': 0.7626999216170279, 'eval_f1_macro': 0.7221801910470803, 'eval_ICM': 0.13620083876419975, 'eval_runtime': 4.4487, 'eval_samples_per_second': 75.978, 'eval_steps_per_second': 1.349, 'epoch': 5.0}


In [68]:
print(results_bert_en)

{'eval_loss': 0.5809418559074402, 'eval_precision_macro': 0.6951963388164534, 'eval_recall_macro': 0.7626999216170279, 'eval_f1_macro': 0.7221801910470803, 'eval_ICM': 0.13620083876419975, 'eval_runtime': 4.4487, 'eval_samples_per_second': 75.978, 'eval_steps_per_second': 1.349, 'epoch': 5.0}


### Save Model

In [69]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"  # or the model you actually used
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [70]:
model_bert_en.save_pretrained("models/bert_sexism")
tokenizer.save_pretrained("models/bert_sexism")

('models/bert_sexism/tokenizer_config.json',
 'models/bert_sexism/special_tokens_map.json',
 'models/bert_sexism/vocab.txt',
 'models/bert_sexism/added_tokens.json',
 'models/bert_sexism/tokenizer.json')

### Get previous YES statement

In [85]:
def get_ids(yes_ids):
    with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json", "r") as f:
        test_json = json.load(f)

    # Filter to only English tweets
    english_tweets = [item for item in test_json.values() if item["lang"] == "en"]

    # YES tweets (IDs present in yes_ids)
    yes_tweets = [t["tweet"] for t in english_tweets if t["id_EXIST"] in yes_ids]
    yes_ids_filtered = [t["id_EXIST"] for t in english_tweets if t["id_EXIST"] in yes_ids]

    # NO tweets (IDs not in yes_ids)
    no_tweets = [t["tweet"] for t in english_tweets if t["id_EXIST"] not in yes_ids]
    no_ids_filtered = [t["id_EXIST"] for t in english_tweets if t["id_EXIST"] not in yes_ids]

    print(f"Total English tweets in test set: {len(english_tweets)}")
    print(f"English tweets predicted YES: {len(yes_tweets)}")
    print(f"English tweets predicted NO: {len(no_tweets)}")

    return yes_ids_filtered, yes_tweets, no_ids_filtered, no_tweets


In [86]:
# Load your BoW predictions from the JSON file
with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/Task 1.1/Bert/bert_task1_submission.json", "r") as f:
    bert_results = json.load(f)

# Extract the IDs that were predicted as "YES"
yes_ids_bert = [entry["id"] for entry in bert_results if entry["value"] == "YES"]
yes_ids_bert = [id_.replace("id_", "") for id_ in yes_ids_bert]


print(yes_ids_bert[:5])

['600002', '600004', '600005', '600006', '600013']


### Prediction

In [87]:
filtered_ids, filtered_tweets, no_ids, no_tweets = get_ids(yes_ids_bert)

cleaned_filtered_tweets = clean_text(filtered_tweets)

# --- 2. Load model + tokenizer + multi-label binarizer ---
model_path = "models/bert_sexism"  # update path to your multi-label model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

# MultiLabelBinarizer with all possible task 3 labels
mlb = MultiLabelBinarizer()
mlb.fit([
    ["OBJECTIFICATION"], 
    ["SEXUAL-VIOLENCE"],
    ["STEREOTYPING-DOMINANCE"],
    ["IDEOLOGICAL-INEQUALITY"],
    ["MISOGYNY-NON-SEXUAL-VIOLENCE"]
    # Add all labels as needed
])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- 4. Create dataset & dataloader ---
test_dataset = SexismDatasetMulti(
    texts=filtered_tweets,
    labels=None,               # No labels in inference
    ids=filtered_ids,
    tokenizer=tokenizer,
    max_len=128
)

test_loader = DataLoader(test_dataset, batch_size=32)

# --- 5. Predict multi-label outputs ---
all_preds = []

sigmoid = torch.nn.Sigmoid()

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = sigmoid(logits)  # Sigmoid to get probabilities per label

        # Apply threshold (e.g., 0.5) to decide which labels apply
        batch_preds = (probs > 0.5).cpu().numpy()
        all_preds.extend(batch_preds)

# --- 6. Decode multi-label predictions ---
# Convert list of predictions to numpy array
all_preds_array = np.array(all_preds)
pred_labels = mlb.inverse_transform(all_preds_array)

# Optional: print first 5 predictions
for tweet, labels in zip(filtered_tweets[:5], pred_labels[:5]):
    print(f"Tweet: {tweet}\nPredicted labels: {labels}\n")


Total English tweets in test set: 978
English tweets predicted YES: 430
English tweets predicted NO: 548
Tweet: @Cathy_TwoHalves @EverydaySexism That's why women have to stay at home, raise children and take care of the house.Nobody is going to assume that a woman is a leader among men since that's not how it is supposed to be in nature. Of course, you can try and fake it, but you will keep losing your feminine side.
Predicted labels: ('IDEOLOGICAL-INEQUALITY', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'STEREOTYPING-DOMINANCE')

Tweet: @JonPincince @kmulvane @EverydaySexism, even after Roe v. Wade was overturned. Fun! /s
Predicted labels: ('IDEOLOGICAL-INEQUALITY',)

Tweet: Seriously @JohnLewisRetail?  girls are pictured with ‘I’m a Stylist’ doll and the pizza oven - the boys get a robot and hot wheels garage. #EverydaySexism https://t.co/mKsbxaGgIV? # via @HPUKParents
Predicted labels: ('IDEOLOGICAL-INEQUALITY', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'OBJECTIFICATION', 'STEREOTYPING-DOMINANCE')

Tweet

### Save to json

In [90]:
# After decoding pred_labels for the YES tweets
output_json = []

# YES predictions
for tweet_id, label in zip(filtered_ids, pred_labels):
    output_json.append({
        "id": str(tweet_id),
        "value": label,
        "test_case": "EXIST2025"
    })

# NO predictions (no task 2 value assigned)
for tweet_id in no_ids:
    output_json.append({
        "id": str(tweet_id),
        "value": "NO",  # Placeholder
        "test_case": "EXIST2025"
    })

# Optional: Sort by ID
output_json = sorted(output_json, key=lambda x: int(x["id"]))

# Save
with open("bert_task3_submission.json", "w") as f:
    json.dump(output_json, f, indent=4)

print("Saved predictions including NO tweets to 'bert_task3_submission.json'")


Saved predictions including NO tweets to 'bert_task3_submission.json'


### Spanish

### Pipeline

In [75]:
def sexism_classification_pipeline_task3_spanish(trainInfo, devInfo, testInfo=None, model_name='dccuchile/bert-base-spanish-wwm-cased', nlabels=5, ptype="multi_label_classification", **args):
    # Model and Tokenizer
    labelEnc= MultiLabelBinarizer()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype,
        ignore_mismatched_sizes=True)

    # Prepare datasets
    train_dataset = SexismDatasetMulti(trainInfo[1], labelEnc.fit_transform(trainInfo[2]),[int(x) for x in trainInfo[0]], tokenizer )
    val_dataset = SexismDatasetMulti(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir= args.get('output_dir', './results'),
        num_train_epochs= args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay',0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy','epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model',"ICM")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        #compute_metrics=compute_metrics_3,
        compute_metrics = partial(compute_metrics_3, lencoder=labelEnc),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience",10))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    if testInfo is not None:
      # Prepare test dataset for prediction
      test_dataset = SexismDatasetMulti(testInfo[1], [[0,0,0,0,0]] * len(testInfo[1]),  [int(x) for x in testInfo[0]],   tokenizer)

      # Predict test set labels
      predictions = trainer.predict(test_dataset)
      #predicted_labels = np.argmax(predictions.predictions, axis=1)
      predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
      predicted_labels = (predicted_probs >= 0.5).astype(int)

      # Create submission DataFrame
      submission_df = pd.DataFrame({
          'id': testInfo[0],
          'label': labelEnc.inverse_transform(predicted_labels),
          "test_case": ["EXIST2025"]*len(predicted_labels)

      })
      submission_df.to_csv('sexism_predictions_task3.csv', index=False)
      print("Prediction TASK3 completed. Results saved to sexism_predictions_task3.csv")
      return model, submission_df
    return model, eval_results

### Model

In [77]:
model = "dccuchile/bert-base-spanish-wwm-cased"

params = {
    "num_train_epochs": 5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "learning_rate": 1e-5,
    "early_stopping_patience": 5,
    "output_dir": "./bert_results"
}
model_bert_en, results_bert_en = sexism_classification_pipeline_task3(
    EnTrainTask3,
    EnDevTask3,
    testInfo=None,
    model_name=model,
    **params
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6603,0.667164,0.397337,0.475897,0.410437,-0.830431
2,0.6432,0.650526,0.617245,0.483977,0.441208,-0.734095
3,0.6263,0.628984,0.673536,0.614387,0.636423,-0.464258
4,0.597,0.613683,0.673966,0.669319,0.665496,-0.194924
5,0.5401,0.613738,0.69111,0.651308,0.659788,-0.220338


2025-05-23 18:40:56,566 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:40:56,589 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:42:43,743 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:42:43,767 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:44:32,654 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:44:32,682 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:46:23,128 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:46:23,167 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:48:11,223 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:48:11,247 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method




2025-05-23 18:48:17,274 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-23 18:48:17,296 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.61368328332901, 'eval_precision_macro': 0.6739660395481791, 'eval_recall_macro': 0.6693191294085508, 'eval_f1_macro': 0.6654963344229851, 'eval_ICM': -0.1949238789607193, 'eval_runtime': 4.1437, 'eval_samples_per_second': 81.571, 'eval_steps_per_second': 1.448, 'epoch': 5.0}


Validation Results: {'eval_loss': 0.61368328332901, 'eval_precision_macro': 0.6739660395481791, 'eval_recall_macro': 0.6693191294085508, 'eval_f1_macro': 0.6654963344229851, 'eval_ICM': -0.1949238789607193, 'eval_runtime': 4.1437, 'eval_samples_per_second': 81.571, 'eval_steps_per_second': 1.448, 'epoch': 5.0}


### Save Model

In [78]:
from transformers import AutoTokenizer

model_name = "dccuchile/bert-base-spanish-wwm-cased"  # or the model you actually used
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [79]:
model_bert_en.save_pretrained("models_spanish/bert-base-spanish-wwm-cased")
tokenizer.save_pretrained("models_spanish/bert-base-spanish-wwm-cased")

('models_spanish/bert-base-spanish-wwm-cased/tokenizer_config.json',
 'models_spanish/bert-base-spanish-wwm-cased/special_tokens_map.json',
 'models_spanish/bert-base-spanish-wwm-cased/vocab.txt',
 'models_spanish/bert-base-spanish-wwm-cased/added_tokens.json',
 'models_spanish/bert-base-spanish-wwm-cased/tokenizer.json')

### Get previous YES statements

In [91]:
def get_ids_spanish(yes_ids):
    with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json", "r") as f:
        test_json = json.load(f)

    # Spanish tweets only
    spanish_tweets = [item for item in test_json.values() if item["lang"] == "es"]

    # YES tweets
    yes_tweets = [t["tweet"] for t in spanish_tweets if t["id_EXIST"] in yes_ids]
    yes_ids_filtered = [t["id_EXIST"] for t in spanish_tweets if t["id_EXIST"] in yes_ids]

    # NO tweets (not in yes_ids)
    no_tweets = [t["tweet"] for t in spanish_tweets if t["id_EXIST"] not in yes_ids]
    no_ids_filtered = [t["id_EXIST"] for t in spanish_tweets if t["id_EXIST"] not in yes_ids]

    print(f"Spanish tweets predicted YES: {len(yes_tweets)}")
    print(f"Spanish tweets predicted NO: {len(no_tweets)}")

    return yes_ids_filtered, yes_tweets, no_ids_filtered, no_tweets


In [92]:
# Load your BoW predictions from the JSON file
with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/Task 1.1/Bert/bert_task1_submission_Spanish.json", "r") as f:
    bert_results = json.load(f)

# Extract the IDs that were predicted as "YES"
yes_ids_bert = [entry["id"] for entry in bert_results if entry["value"] == "YES"]
yes_ids_bert = [id_.replace("id_", "") for id_ in yes_ids_bert]


print(yes_ids_bert[:5])

['500004', '500012', '500019', '500020', '500022']


### Prediction

In [93]:
filtered_ids, filtered_tweets, no_ids, no_tweets = get_ids_spanish(yes_ids_bert)

cleaned_filtered_tweets = clean_text(filtered_tweets)

# --- 2. Load model + tokenizer + multi-label binarizer ---
model_path = "models_spanish/bert-base-spanish-wwm-cased"  # update path to your multi-label model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

# MultiLabelBinarizer with all possible task 3 labels
mlb = MultiLabelBinarizer()
mlb.fit([
    ["OBJECTIFICATION"], 
    ["SEXUAL-VIOLENCE"],
    ["STEREOTYPING-DOMINANCE"],
    ["IDEOLOGICAL-INEQUALITY"],
    ["MISOGYNY-NON-SEXUAL-VIOLENCE"]
    # Add all labels as needed
])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- 4. Create dataset & dataloader ---
test_dataset = SexismDatasetMulti(
    texts=filtered_tweets,
    labels=None,               # No labels in inference
    ids=filtered_ids,
    tokenizer=tokenizer,
    max_len=128
)

test_loader = DataLoader(test_dataset, batch_size=32)

# --- 5. Predict multi-label outputs ---
all_preds = []

sigmoid = torch.nn.Sigmoid()

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = sigmoid(logits)  # Sigmoid to get probabilities per label

        # Apply threshold (e.g., 0.5) to decide which labels apply
        batch_preds = (probs > 0.5).cpu().numpy()
        all_preds.extend(batch_preds)

# --- 6. Decode multi-label predictions ---
all_preds_array = np.array(all_preds)
pred_labels = mlb.inverse_transform(all_preds_array)

# Optional: print first 5 predictions
for tweet, labels in zip(filtered_tweets[:5], pred_labels[:5]):
    print(f"Tweet: {tweet}\nPredicted labels: {labels}\n")


Spanish tweets predicted YES: 567
Spanish tweets predicted NO: 531
Tweet: @jordirico Primero fue internet, luego el gamergate, la manosfera y su misoginia extrema sin que las plataformas movieran un dedo, los incel-asesinatos, la extrema derecha capitalizando el odio, la mimetización de estas ideas con las de un debate respetable y finalmente aquí estamos
Predicted labels: ()

Tweet: @dimplerrylover lo se pero literalmente la chica estaba siendo harassed like como puede sonreir asi
Predicted labels: ('MISOGYNY-NON-SEXUAL-VIOLENCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE', 'STEREOTYPING-DOMINANCE')

Tweet: @Fistroman1 @ElioGatsby Sin pruebas??Qué fue en público.Qué lo vimos todos, qué me estás contando.Que no me mezcles churras con merinas.Qué de aquí a que me vengas con el metoo y el notallmen queda poquito y no me apetece acabar discutiendo de verdad, q lo veo venir.
Predicted labels: ('OBJECTIFICATION', 'SEXUAL-VIOLENCE', 'STEREOTYPING-DOMINANCE')

Tweet: @EstefaniaVeloz ...Con el proce

### Save to json

In [95]:
# After decoding pred_labels for the YES tweets
output_json = []

# YES predictions
for tweet_id, label in zip(filtered_ids, pred_labels):
    output_json.append({
        "id": str(tweet_id),
        "value": label,
        "test_case": "EXIST2025"
    })

# NO predictions (no task 2 value assigned)
for tweet_id in no_ids:
    output_json.append({
        "id": str(tweet_id),
        "value": "NO",  # Placeholder
        "test_case": "EXIST2025"
    })

# Optional: Sort by ID
output_json = sorted(output_json, key=lambda x: int(x["id"]))

# Save
with open("bert_task3_submission_Spanish.json", "w") as f:
    json.dump(output_json, f, indent=4)

print("Saved predictions including NO tweets to 'bert_task3_submission_Spanish.json'")


Saved predictions including NO tweets to 'bert_task3_submission_Spanish.json'


### File Merge

In [96]:
filename_english = "bert_task3_submission.json"
filename_spanish = "bert_task3_submission_Spanish.json"
filename_merged = "bert_task3_submission_merge.json"

def merge_predictions(filename_english, filename_spanish, filename_merged):
    # Load English predictions
    with open(filename_english, "r", encoding="utf-8") as f_en:
        preds_en = json.load(f_en)

    # Load Spanish predictions
    with open(filename_spanish, "r", encoding="utf-8") as f_es:
        preds_es = json.load(f_es)

    # Merge the two lists
    merged_preds = preds_es + preds_en

    # Save the combined predictions
    with open(filename_merged, "w", encoding="utf-8") as f_out:
        json.dump(merged_preds, f_out, ensure_ascii=False, indent=2)

    print(f"Merged {len(preds_en)} EN + {len(preds_es)} ES = {len(merged_preds)} total predictions.")
    
    
merge_predictions(
    filename_english,
    filename_spanish,
    filename_merged
)

Merged 978 EN + 1098 ES = 2076 total predictions.
