### Lnr Project Task 1.1 Bert

Niklas Dahlbom, ndahlbom@kth.se, ndahlbo@upv.edu.es

### Imports

In [1]:
import pandas as pd
from readerEXIST2025 import EXISTReader
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import os
import re
import random
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from transformers import  AutoTokenizer, AutoModelForSequenceClassification,  Trainer, TrainingArguments,  EarlyStoppingCallback
import random

### Read datasets

In [2]:
reader_train = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/training/EXIST2025_training.json")
reader_dev = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/dev/EXIST2025_dev.json")
reader_test = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json")

EnTrainTask1, EnDevTask1 = reader_train.get(lang="EN", subtask="1"), reader_dev.get(lang="EN", subtask="1")
SpTrainTask1, SpDevTask1 = reader_train.get(lang="ES", subtask="1"), reader_dev.get(lang="ES", subtask="1")

SpTestTask1, EnTestTask1 = reader_test.get(lang="ES", subtask="1", include_ambiguous=True),  reader_test.get(lang="EN", subtask="1", include_ambiguous=True)

print(EnTrainTask1[1])
print("-------------------")

1       Writing a uni essay in my local pub with a cof...
2       @UniversalORL it is 2021 not 1921. I dont appr...
5       According to a customer I have plenty of time ...
6       So only 'blokes' drink beer? Sorry, but if you...
7       New to the shelves this week - looking forward...
                              ...                        
3255    idk why y’all bitches think having half your a...
3256    This has been a part of an experiment with @Wo...
3257    "Take me already" "Not yet. You gotta be ready...
3258    @clintneedcoffee why do you look like a whore?...
3259    ik when mandy says “you look like a whore” i l...
Name: text, Length: 2870, dtype: object
-------------------


In [3]:
from collections import Counter

label_counts = Counter(EnTrainTask1[2])
print("NO:", label_counts["NO"])
print("YES:", label_counts["YES"])

NO: 1733
YES: 1137


### Preprocessing

In [4]:
def clean_text(text_list):
    cleaned_corpus = []
    for text in text_list:
        text = text.lower()
        text = re.sub(r"https?://\S+", "", text)  # Removes URLs
        text = re.sub(r"@\w+", "", text)          # Removes mentions
        text = text.replace("#", "")              # Removes Hashtags
        text = re.sub(r"\s+", " ", text).strip()   # Removes spaces
        cleaned_corpus.append(text)
    return cleaned_corpus

### Set Seed

In [5]:
def set_seed(seed=2025):
    # Random seed
    random.seed(seed)
    # Numpy seed
    np.random.seed(seed)
    # Torch seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    # os seed
    os.environ['PYTHONHASHSEED'] = str(seed)

### Dataset class

In [6]:
class SexismDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt', preprocess=False):
        if preprocess:
            self.texts = [clean_text(t) for t in texts]
        else:
            self.texts = texts.tolist()
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,padding=self.pad, truncation=self.trunc,
            return_tensors=self.rt
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            'id': torch.tensor(self.ids[idx], dtype=torch.long)
        }

### Metrics

In [7]:
def compute_metrics_1(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def compute_metrics_2(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Pipeline

In [8]:
def sexism_classification_pipeline_task1(trainInfo, devInfo, testInfo=None, model_name='bert-base-uncased', nlabels=2, ptype="single_label_classification", **args):
    # Model and Tokenizer
    labelEnc= LabelEncoder()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype
    )

    # Prepare datasets
    train_dataset = SexismDataset(trainInfo[1], labelEnc.fit_transform(trainInfo[2]),[int(x) for x in trainInfo[0]], tokenizer, preprocess=False)
    val_dataset = SexismDataset(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer, preprocess=False)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir= args.get('output_dir', './results'),
        num_train_epochs= args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay',0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy','epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model',"f1")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_1,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience",3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    # If there is a test dataset
    if testInfo is not None:
        # Prepare test dataset for prediction
        test_dataset = SexismDataset(testInfo[1], [0] * len(testInfo[1]),  [int(x) for x in testInfo[0]],   tokenizer)

        # Predict test set labels
        predictions = trainer.predict(test_dataset)
        predicted_labels = np.argmax(predictions.predictions, axis=1)

        # Create submission DataFrame
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            "test_case": ["EXIST2025"]*len(predicted_labels)
        })
        submission_df.to_csv('sexism_predictions_task1.csv', index=False)
        print("Prediction for TASK 1 completed. Results saved to sexism_predictions_task1.csv")
        return model, submission_df
    return model, eval_results


### Training and Evaluation

In [9]:
set_seed(23)

model = "bert-base-uncased"

params = {
    "num_train_epochs": 5,
    "learning_rate": 4e-05,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "weight_decay": 0.03,
    "dropout": 0.2,
    "early_stopping_patience": 5,
    "output_dir": "./bert_results_task2"
}

model, results = sexism_classification_pipeline_task1(
    EnTrainTask1,
    EnDevTask1,
    EnTestTask1,
    model_name=model,
    nlabels=3,
    ptype="single_label_classification",
    **params
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.515,0.441118,0.822072,0.796915,0.794872,0.798969
2,0.3598,0.478284,0.817568,0.769231,0.859873,0.695876
3,0.2565,0.388515,0.844595,0.830467,0.793427,0.871134
4,0.0481,0.667895,0.831081,0.794521,0.847953,0.747423
5,0.0047,0.692557,0.837838,0.807487,0.838889,0.778351




Validation Results: {'eval_loss': 0.38851499557495117, 'eval_accuracy': 0.8445945945945946, 'eval_f1': 0.8304668304668305, 'eval_precision': 0.7934272300469484, 'eval_recall': 0.8711340206185567, 'eval_runtime': 5.4287, 'eval_samples_per_second': 81.788, 'eval_steps_per_second': 1.289, 'epoch': 5.0}




Prediction for TASK 1 completed. Results saved to sexism_predictions_task1.csv


Validation Results: {'eval_loss': 0.6463983058929443, 'eval_accuracy': 0.8536036036036037, 'eval_f1': 0.8284960422163589, 'eval_precision': 0.8486486486486486, 'eval_recall': 0.8092783505154639, 'eval_runtime': 6.0618, 'eval_samples_per_second': 73.245, 'eval_steps_per_second': 1.155, 'epoch': 5.0}

In [11]:
results.head()

Unnamed: 0,id,label,test_case
0,600001,YES,EXIST2025
1,600002,YES,EXIST2025
2,600003,YES,EXIST2025
3,600004,YES,EXIST2025
4,600005,YES,EXIST2025


### Save Model

In [12]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"  # or the model you actually used
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
model.save_pretrained("models/bert_sexism")
tokenizer.save_pretrained("models/bert_sexism")

('models/bert_sexism/tokenizer_config.json',
 'models/bert_sexism/special_tokens_map.json',
 'models/bert_sexism/vocab.txt',
 'models/bert_sexism/added_tokens.json',
 'models/bert_sexism/tokenizer.json')

### Save to json

In [14]:
import json

# Load your CSV results
df = pd.read_csv("sexism_predictions_task1.csv")

# Create a list of dictionaries in the required format
results_json = []
for _, row in df.iterrows():
    result = {
        "id": f"{row['id']}",  # add prefix 'id_' as required
        "value": row["label"],    # if you only have hard outputs (YES/NO)
        "test_case": row["test_case"]
    }
    results_json.append(result)

# Save to JSON file
with open("bert_task1_submission.json", "w") as f:
    json.dump(results_json, f, indent=2)

print("Saved to bert_task1_submission.json ✅")


Saved to bert_task1_submission.json ✅


### Spanish

In [15]:
def sexism_classification_pipeline_task1_Spanish(trainInfo, devInfo, testInfo=None, model_name="dccuchile/bert-base-spanish-wwm-cased", nlabels=2, ptype="single_label_classification", **args):
    # Model and Tokenizer
    labelEnc= LabelEncoder()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype
    )

    # Prepare datasets
    train_dataset = SexismDataset(trainInfo[1], labelEnc.fit_transform(trainInfo[2]),[int(x) for x in trainInfo[0]], tokenizer, preprocess=False)
    val_dataset = SexismDataset(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer, preprocess=False)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir= args.get('output_dir', './results'),
        num_train_epochs= args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay',0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy','epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model',"f1")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_1,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience",3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    # If there is a test dataset
    if testInfo is not None:
        # Prepare test dataset for prediction
        test_dataset = SexismDataset(testInfo[1], [0] * len(testInfo[1]),  [int(x) for x in testInfo[0]],   tokenizer)

        # Predict test set labels
        predictions = trainer.predict(test_dataset)
        predicted_labels = np.argmax(predictions.predictions, axis=1)

        # Create submission DataFrame
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            "test_case": ["EXIST2025"]*len(predicted_labels)
        })
        submission_df.to_csv('sexism_predictions_task1_Spanish.csv', index=False)
        print("Prediction for TASK 1 completed. Results saved to sexism_predictions_task1_Spanish.csv")
        return model, submission_df
    return model, eval_results


### Model

In [16]:
set_seed(23)

model = "dccuchile/bert-base-spanish-wwm-cased"


params = {
    "num_train_epochs": 5,
    "learning_rate": 4e-05,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "weight_decay": 0.03,
    "dropout": 0.2,
    "early_stopping_patience": 5,
    "output_dir": "./bert_results"
}

model, results = sexism_classification_pipeline_task1_Spanish(
    SpTrainTask1,
    SpDevTask1,
    SpTestTask1,
    model_name=model,
    nlabels=2,
    ptype="single_label_classification",
    **params
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4217,0.453544,0.789796,0.820244,0.753205,0.900383
2,0.3504,0.388428,0.820408,0.829457,0.839216,0.819923
3,0.3653,0.474859,0.808163,0.819923,0.819923,0.819923
4,0.091,0.930702,0.808163,0.809717,0.858369,0.766284
5,0.0421,1.089498,0.812245,0.821012,0.833992,0.808429




Validation Results: {'eval_loss': 0.38842788338661194, 'eval_accuracy': 0.8204081632653061, 'eval_f1': 0.8294573643410853, 'eval_precision': 0.8392156862745098, 'eval_recall': 0.8199233716475096, 'eval_runtime': 6.5595, 'eval_samples_per_second': 74.701, 'eval_steps_per_second': 1.22, 'epoch': 5.0}




Prediction for TASK 1 completed. Results saved to sexism_predictions_task1_Spanish.csv


Validation Results: {'eval_loss': 0.38774237036705017, 'eval_accuracy': 0.826530612244898, 'eval_f1': 0.834307992202729, 'eval_precision': 0.8492063492063492, 'eval_recall': 0.8199233716475096, 'eval_runtime': 7.1635, 'eval_samples_per_second': 68.402, 'eval_steps_per_second': 1.117, 'epoch': 5.0}

### Save Model

In [17]:
from transformers import AutoTokenizer

model_name = "dccuchile/bert-base-spanish-wwm-cased"  # or the model you actually used
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Save to json

In [18]:
import json

# Load your CSV results
df = pd.read_csv("sexism_predictions_task1_Spanish.csv")

# Create a list of dictionaries in the required format
results_json = []
for _, row in df.iterrows():
    result = {
        "id": f"{row['id']}",  # add prefix 'id_' as required
        "value": row["label"],    # if you only have hard outputs (YES/NO)
        "test_case": row["test_case"]
    }
    results_json.append(result)

# Save to JSON file
with open("bert_task1_submission_Spanish.json", "w") as f:
    json.dump(results_json, f, indent=2)

print("Saved to bert_task1_submission_Spanish.json ✅")


Saved to bert_task1_submission_Spanish.json ✅


### File Merge

In [19]:
filename_english = "bert_task1_submission.json"
filename_spanish = "bert_task1_submission_Spanish.json"
filename_merged = "bert_task1_submission_merge.json"

def merge_predictions(filename_english, filename_spanish, filename_merged):
    # Load English predictions
    with open(filename_english, "r", encoding="utf-8") as f_en:
        preds_en = json.load(f_en)

    # Load Spanish predictions
    with open(filename_spanish, "r", encoding="utf-8") as f_es:
        preds_es = json.load(f_es)

    # Merge the two lists
    merged_preds = preds_es + preds_en

    # Save the combined predictions
    with open(filename_merged, "w", encoding="utf-8") as f_out:
        json.dump(merged_preds, f_out, ensure_ascii=False, indent=2)

    print(f"Merged {len(preds_en)} EN + {len(preds_es)} ES = {len(merged_preds)} total predictions.")
    
    
merge_predictions(
    filename_english,
    filename_spanish,
    filename_merged
)

Merged 978 EN + 1098 ES = 2076 total predictions.
