### Lnr Project Task 1.2 Bert

Niklas Dahlbom, ndahlbom@kth.se, ndahlbo@upv.edu.es

### Imports

In [138]:
import pandas as pd
from readerEXIST2025 import EXISTReader
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import re
import random
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import json

### Read datasets

In [139]:
# One reader per dataset split (training / dev / test).
reader_train = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/training/EXIST2025_training.json")
reader_dev = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/dev/EXIST2025_dev.json")
reader_test = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json")

# Subtask 2 data per language; each value is kept exactly as the reader returns it
# and is later indexed as [0]=ids, [1]=texts, [2]=labels by the pipeline.
EnTrainTask2 = reader_train.get(lang="EN", subtask="2")
EnDevTask2 = reader_dev.get(lang="EN", subtask="2")
SpTrainTask2 = reader_train.get(lang="ES", subtask="2")
SpDevTask2 = reader_dev.get(lang="ES", subtask="2")

# Test split, requested with include_ambiguous=True.
SpTestTask2 = reader_test.get(lang="ES", subtask="2", include_ambiguous=True)
EnTestTask2 = reader_test.get(lang="EN", subtask="2", include_ambiguous=True)

# Quick look at what the reader returned for the English test split.
print(EnTestTask2)
print("-------------------")

(Series([], Name: id, dtype: object), Series([], Name: text, dtype: object), Series([], Name: label2, dtype: object))
-------------------


### Preprocessing

In [140]:
def clean_text(text_list):
    """Normalize a collection of tweets.

    Every text is lower-cased, has URLs and @-mentions removed, has its ``#``
    characters dropped (the hashtag word itself is kept) and has each run of
    whitespace collapsed to a single space with both ends trimmed.

    Args:
        text_list: Iterable of strings. Passing a single string would iterate
            over its characters, so always pass a collection.

    Returns:
        A new list with one cleaned string per input text, in input order.
    """
    def _normalize(raw):
        lowered = raw.lower()
        without_urls = re.sub(r"https?://\S+", "", lowered)
        without_mentions = re.sub(r"@\w+", "", without_urls)
        without_hashes = without_mentions.replace("#", "")
        # Collapse whitespace runs left behind by the removals above.
        return re.sub(r"\s+", " ", without_hashes).strip()

    return [_normalize(entry) for entry in text_list]

### Set Seed

In [141]:
def set_seed(seed=2025):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python's built-in RNG
    random.seed(seed)
    # NumPy's global RNG
    np.random.seed(seed)
    # PyTorch: CPU generator plus every CUDA device (a no-op without CUDA)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels. The auto-tuner (benchmark mode) must
    # be OFF: it may select different algorithms from run to run, which would
    # defeat the determinism requested on the previous line.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Hash seed: Python reads this at interpreter start-up, so setting it here
    # only affects subprocesses launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)


### Dataset class

In [142]:
class SexismDataset(Dataset):
    """Torch dataset that tokenizes one tweet per item on access.

    Args:
        texts: Collection of tweet strings (list or an object with ``tolist``).
        labels: Indexable collection of integer class ids, or ``None`` when no
            labels are available (inference); then items carry no ``labels`` key.
        ids: Indexable collection of tweet ids; each must be convertible with
            ``int()``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Maximum sequence length passed to the tokenizer.
        pad: Padding strategy passed to the tokenizer.
        trunc: Truncation flag passed to the tokenizer.
        rt: ``return_tensors`` value passed to the tokenizer.
        preprocess: When true, texts are normalized with ``clean_text`` first.
    """

    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True, rt='pt', preprocess=False):
        if preprocess:
            # clean_text expects the whole collection. Calling it on one string
            # at a time iterates over the characters and turns every tweet
            # into a list of single-character strings.
            self.texts = clean_text(texts)
        else:
            # Only call tolist() when the container provides it (e.g. a Series).
            self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts

        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it as a dict of tensors.

        The dict holds ``input_ids`` and ``attention_mask`` (flattened to 1-D),
        the tweet ``id`` as a long tensor, and ``labels`` when labels were given.
        """
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt
        )

        item = {
            # flatten() drops the leading batch dimension added by the tokenizer
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            # IDs are stored as tensors, so they must be numeric strings/ints.
            'id': torch.tensor(int(self.ids[idx]), dtype=torch.long)
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item


### Metrics

In [143]:
def _classification_metrics(pred, average):
    """Compute accuracy, F1, precision and recall for a Trainer prediction.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is the prediction).
        average: Averaging mode forwarded to ``precision_recall_fscore_support``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 scores a class with no predicted samples as 0 instead of
    # emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average=average, zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


def compute_metrics_1(pred):
    """Metrics with ``average='binary'`` (two-class problems)."""
    return _classification_metrics(pred, average='binary')


def compute_metrics_2(pred):
    """Metrics with ``average='macro'`` (unweighted mean over classes)."""
    return _classification_metrics(pred, average='macro')

### Pipeline

In [144]:
def sexism_classification_pipeline_task2(
    trainInfo,
    devInfo,
    testInfo=None,
    model_name='bert-base-uncased',
    nlabels=3,
    ptype="single_label_classification",
    dropout=0.1,
    **args
):
    """Fine-tune a sequence classifier for Task 2 and optionally predict a test set.

    Args:
        trainInfo: Indexable ``(ids, texts, labels)`` for training; ids must be
            convertible with ``int()``.
        devInfo: Indexable ``(ids, texts, labels)`` for validation. Its labels
            must be among the labels seen in ``trainInfo``.
        testInfo: Optional indexable whose ``[0]`` is ids and ``[1]`` is texts.
            When given, predictions are written to a CSV file.
        model_name: Hugging Face checkpoint name or path.
        nlabels: Number of output classes.
        ptype: ``problem_type`` stored in the model configuration.
        dropout: Probability used for both hidden and attention dropout.
        **args: Optional overrides read with ``args.get``: ``output_dir``,
            ``num_train_epochs``, ``learning_rate``,
            ``per_device_train_batch_size``, ``per_device_eval_batch_size``,
            ``warmup_steps``, ``weight_decay``, ``logging_dir``,
            ``logging_steps``, ``eval_strategy``, ``save_strategy``,
            ``save_total_limit``, ``load_best_model_at_end``,
            ``metric_for_best_model``, ``early_stopping_patience`` and
            ``predictions_path`` (CSV written when ``testInfo`` is given;
            defaults to ``'sexism_predictions_task2.csv'``).

    Returns:
        ``(model, eval_results)`` when ``testInfo`` is ``None``, otherwise
        ``(model, submission_df)`` with the columns ``id``, ``label`` and
        ``test_case``.
    """
    # Label encoder and tokenizer
    labelEnc = LabelEncoder()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Model configuration with the requested dropout
    config = AutoConfig.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype,
        hidden_dropout_prob=dropout,  # dropout for the hidden layers
        attention_probs_dropout_prob=dropout,  # dropout for the attention weights
    )

    # ignore_mismatched_sizes is a model-loading option, not a configuration
    # field, so it belongs on this call: it lets a checkpoint whose classifier
    # head has a different size load with a freshly initialised head.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
        ignore_mismatched_sizes=True,
    )

    # Prepare datasets; the encoder is fitted on the training labels only.
    train_dataset = SexismDataset(
        trainInfo[1],
        labelEnc.fit_transform(trainInfo[2]),
        [int(x) for x in trainInfo[0]],
        tokenizer
    )
    val_dataset = SexismDataset(
        devInfo[1],
        labelEnc.transform(devInfo[2]),
        [int(x) for x in devInfo[0]],
        tokenizer
    )

    # Training arguments (each one can be overridden through **args)
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir=args.get('output_dir', './results'),
        num_train_epochs=args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay', 0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy', 'epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model', "f1")
    )

    # Trainer with macro-averaged metrics and early stopping
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_2,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience", 5))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on the validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    if testInfo is not None:
        # The dataset gets placeholder labels (all zeros); they are not used
        # to build the submission.
        test_dataset = SexismDataset(
            testInfo[1],
            [0] * len(testInfo[1]),
            [int(x) for x in testInfo[0]],
            tokenizer
        )

        # Predict test set labels
        predictions = trainer.predict(test_dataset)
        predicted_labels = np.argmax(predictions.predictions, axis=1)

        # Map class ids back to the original label strings
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            "test_case": ["EXIST2025"] * len(predicted_labels)
        })

        predictions_path = args.get('predictions_path', 'sexism_predictions_task2.csv')
        submission_df.to_csv(predictions_path, index=False)
        print(f"Prediction TASK2 completed. Results saved to {predictions_path}")
        return model, submission_df

    return model, eval_results


### Training and Evaluation

In [145]:
# Seed all RNGs before training; 23 overrides set_seed's default of 2025.
set_seed(23)

In [146]:
# Keep the checkpoint name in its own variable so that `model` only ever
# refers to the trained model object returned by the pipeline.
model_name = "bert-base-uncased"

# Hyper-parameters forwarded to the pipeline ("dropout" is a named pipeline
# parameter; the rest are read from **args).
params = {
    "num_train_epochs": 5,
    "learning_rate": 4e-05,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "weight_decay": 0.03,
    "dropout": 0.2,
    "early_stopping_patience": 5,
    "output_dir": "./bert_results"
}

# No test set is passed, so the second return value is the validation metrics.
model, results = sexism_classification_pipeline_task2(
    EnTrainTask2,
    EnDevTask2,
    model_name=model_name,
    **params
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0201,0.983787,0.582192,0.24531,0.194064,0.333333
2,0.9177,0.927492,0.582192,0.24531,0.194064,0.333333
3,0.8485,0.867599,0.609589,0.359003,0.366165,0.398455
4,0.7313,0.847163,0.589041,0.482475,0.578398,0.517494
5,0.5758,0.768077,0.678082,0.575305,0.602517,0.564366




Validation Results: {'eval_loss': 0.7680767774581909, 'eval_accuracy': 0.678082191780822, 'eval_f1': 0.5753049621902081, 'eval_precision': 0.6025165310879596, 'eval_recall': 0.5643663526016467, 'eval_runtime': 1.9041, 'eval_samples_per_second': 76.676, 'eval_steps_per_second': 1.576, 'epoch': 5.0}


Validation Results: {'eval_loss': 0.8539934754371643, 'eval_accuracy': 0.678082191780822, 'eval_f1': 0.548861323049933, 'eval_precision': 0.63264221158958, 'eval_recall': 0.5473644003055768, 'eval_runtime': 1.8623, 'eval_samples_per_second': 78.399, 'eval_steps_per_second': 1.611, 'epoch': 5.0}

In [89]:
# Display the validation metrics returned by the training call above.
results

{'eval_loss': 0.7757480144500732,
 'eval_accuracy': 0.6575342465753424,
 'eval_f1': 0.4984438984438985,
 'eval_precision': 0.6011382113821138,
 'eval_recall': 0.5178295560648501,
 'eval_runtime': 2.4288,
 'eval_samples_per_second': 60.112,
 'eval_steps_per_second': 1.235,
 'epoch': 5.0}

### Save Model

In [90]:
# NOTE(review): AutoTokenizer is already imported in the imports cell, so this
# re-import is redundant.
from transformers import AutoTokenizer

# The pipeline does not return its tokenizer, so the base tokenizer is loaded
# again by name here.
model_name = "bert-base-uncased"  # or the model you actually used
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [91]:
# Save the fine-tuned model and its tokenizer to the same directory so both can
# be loaded later from one path.
model.save_pretrained("models/bert_sexism")
tokenizer.save_pretrained("models/bert_sexism")

('models/bert_sexism/tokenizer_config.json',
 'models/bert_sexism/special_tokens_map.json',
 'models/bert_sexism/vocab.txt',
 'models/bert_sexism/added_tokens.json',
 'models/bert_sexism/tokenizer.json')

### Get previous YES statements

In [147]:
def get_ids(yes_ids, test_path="/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json"):
    """Split the English test tweets by whether their id is in ``yes_ids``.

    Args:
        yes_ids: Iterable of tweet ids (same type as the ``id_EXIST`` values in
            the JSON file) that were predicted as sexist.
        test_path: Path to the test JSON file: a mapping whose values are dicts
            with the keys ``lang``, ``id_EXIST`` and ``tweet``.

    Returns:
        ``(yes_ids_filtered, yes_tweets, no_ids_filtered, no_tweets)``: four
        lists in file order, where the ``yes_*`` lists hold the English tweets
        whose id is in ``yes_ids`` and the ``no_*`` lists hold the rest.
    """
    # A set makes each membership test O(1) instead of scanning a list.
    yes_lookup = set(yes_ids)

    # JSON is UTF-8; be explicit so the platform default encoding is never used.
    with open(test_path, "r", encoding="utf-8") as f:
        test_json = json.load(f)

    # Filter to only English tweets
    english_tweets = [item for item in test_json.values() if item["lang"] == "en"]

    # Single pass: route every tweet to the YES or the NO bucket.
    yes_ids_filtered, yes_tweets = [], []
    no_ids_filtered, no_tweets = [], []
    for t in english_tweets:
        if t["id_EXIST"] in yes_lookup:
            yes_ids_filtered.append(t["id_EXIST"])
            yes_tweets.append(t["tweet"])
        else:
            no_ids_filtered.append(t["id_EXIST"])
            no_tweets.append(t["tweet"])

    print(f"Total English tweets in test set: {len(english_tweets)}")
    print(f"English tweets predicted YES: {len(yes_tweets)}")
    print(f"English tweets predicted NO: {len(no_tweets)}")

    return yes_ids_filtered, yes_tweets, no_ids_filtered, no_tweets


In [148]:
# Load your BoW predictions from the JSON file
with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/Task 1.1/Bert/bert_task1_submission.json", "r") as f:
    bert_results = json.load(f)

# Extract the IDs that were predicted as "YES"
yes_ids_bert = [entry["id"] for entry in bert_results if entry["value"] == "YES"]
yes_ids_bert = [id_.replace("id_", "") for id_ in yes_ids_bert]


print(yes_ids_bert[:5])

['600002', '600004', '600005', '600006', '600013']


### Prediction

In [149]:
# --- 1. Select the English test tweets that Task 1 predicted as YES ---
filtered_ids, filtered_tweets, no_ids, no_tweets = get_ids(yes_ids_bert)

# NOTE(review): cleaned_filtered_tweets is not used below; the dataset is built
# from the raw filtered_tweets.
cleaned_filtered_tweets = clean_text(filtered_tweets)

# --- 2. Load model + tokenizer + label encoder ---
model_path = "models/bert_sexism"  # path to your trained model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

# Rebuild the label mapping by hand. LabelEncoder sorts its classes, so the
# order given to fit() does not matter.
# NOTE(review): assumes the training labels were exactly these three classes,
# so the ids match the encoder fitted inside the pipeline -- confirm.
labelEnc = LabelEncoder()
labelEnc.fit(["DIRECT", "REPORTED", "JUDGEMENTAL"])

# --- 3. Move the model to the GPU when one is available ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- 4. Create dataset & dataloader ---
test_dataset = SexismDataset(
    texts=filtered_tweets,
    labels=None,                     # No labels for inference
    ids=filtered_ids,
    tokenizer=tokenizer,
    max_len=128
)
test_loader = DataLoader(test_dataset, batch_size=32)

# --- 5. Predict ---
all_preds = []

# no_grad: inference only, so no gradients are tracked
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Highest-scoring class per tweet
        preds = torch.argmax(outputs.logits, axis=1)
        all_preds.extend(preds.cpu().numpy())

# --- 6. Decode labels ---
# Map class ids back to label strings; order matches filtered_ids because the
# DataLoader is not shuffled.
pred_labels = labelEnc.inverse_transform(all_preds)

Total English tweets in test set: 978
English tweets predicted YES: 430
English tweets predicted NO: 548


### Save to json

In [151]:
# After decoding pred_labels for the YES tweets
output_json = []

# YES predictions
for tweet_id, label in zip(filtered_ids, pred_labels):
    output_json.append({
        "id": str(tweet_id),
        "value": label,
        "test_case": "EXIST2025"
    })

# NO predictions (no task 2 value assigned)
for tweet_id in no_ids:
    output_json.append({
        "id": str(tweet_id),
        "value": "NO",  # Placeholder
        "test_case": "EXIST2025"
    })

# Optional: Sort by ID
output_json = sorted(output_json, key=lambda x: int(x["id"]))

# Save
with open("bert_task2_submission.json", "w") as f:
    json.dump(output_json, f, indent=4)

print("Saved predictions including NO tweets to 'bert_task2_submission.json'")


Saved predictions including NO tweets to 'bert_task2_submission.json'


### Spanish

### Pipeline

In [160]:
def sexism_classification_pipeline_task2_Spanish(
    trainInfo,
    devInfo,
    testInfo=None,
    model_name='dccuchile/bert-base-spanish-wwm-cased',
    nlabels=3,
    ptype="single_label_classification",
    dropout=0.1,
    **args
):
    """Spanish variant of the Task 2 fine-tuning pipeline.

    The training procedure is the same as ``sexism_classification_pipeline_task2``;
    only the default checkpoint differs, so this function forwards every
    argument to it instead of keeping a second copy of the implementation.

    Args:
        trainInfo: Indexable ``(ids, texts, labels)`` for training.
        devInfo: Indexable ``(ids, texts, labels)`` for validation.
        testInfo: Optional indexable whose ``[0]`` is ids and ``[1]`` is texts.
        model_name: Hugging Face checkpoint name or path (Spanish BERT by default).
        nlabels: Number of output classes.
        ptype: ``problem_type`` stored in the model configuration.
        dropout: Probability used for both hidden and attention dropout.
        **args: Training overrides, forwarded unchanged.

    Returns:
        ``(model, eval_results)`` when ``testInfo`` is ``None``, otherwise
        ``(model, submission_df)``.
    """
    return sexism_classification_pipeline_task2(
        trainInfo,
        devInfo,
        testInfo=testInfo,
        model_name=model_name,
        nlabels=nlabels,
        ptype=ptype,
        dropout=dropout,
        **args
    )


### Model

In [129]:
# Keep the checkpoint name in its own variable so that `model` only ever
# refers to the trained model object returned by the pipeline.
model_name = "dccuchile/bert-base-spanish-wwm-cased"

# Hyper-parameters forwarded to the pipeline ("dropout" is a named pipeline
# parameter; the rest are read from **args).
# NOTE(review): output_dir is the same directory the English run uses, so
# checkpoints from the two runs share one folder.
params = {
    "num_train_epochs": 5,
    "learning_rate": 4e-05,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "weight_decay": 0.03,
    "dropout": 0.2,
    "early_stopping_patience": 5,
    "output_dir": "./bert_results"
}

# No test set is passed, so the second return value is the validation metrics.
model, results = sexism_classification_pipeline_task2_Spanish(
    SpTrainTask2,
    SpDevTask2,
    model_name=model_name,
    **params
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9171,0.957847,0.560386,0.239422,0.186795,0.333333
2,0.7214,0.854414,0.584541,0.410331,0.457273,0.440585
3,0.66,0.856922,0.657005,0.533032,0.654508,0.534759
4,0.5395,0.822236,0.637681,0.496247,0.556389,0.523197
5,0.4346,0.993365,0.695652,0.609763,0.660824,0.592574




Validation Results: {'eval_loss': 0.9933649897575378, 'eval_accuracy': 0.6956521739130435, 'eval_f1': 0.6097630197247056, 'eval_precision': 0.660823754789272, 'eval_recall': 0.5925738111336488, 'eval_runtime': 2.3586, 'eval_samples_per_second': 87.762, 'eval_steps_per_second': 1.696, 'epoch': 5.0}


Validation Results: {'eval_loss': 0.9933649897575378, 'eval_accuracy': 0.6956521739130435, 'eval_f1': 0.6097630197247056, 'eval_precision': 0.660823754789272, 'eval_recall': 0.5925738111336488, 'eval_runtime': 2.3586, 'eval_samples_per_second': 87.762, 'eval_steps_per_second': 1.696, 'epoch': 5.0}

### Save Model

In [130]:
# NOTE(review): AutoTokenizer is already imported in the imports cell, so this
# re-import is redundant.
from transformers import AutoTokenizer

# The pipeline does not return its tokenizer, so the base tokenizer is loaded
# again by name here.
model_name = "dccuchile/bert-base-spanish-wwm-cased"  # or the model you actually used
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [131]:
# Save the fine-tuned Spanish model and its tokenizer to the same directory so
# both can be loaded later from one path.
model.save_pretrained("models_spanish/bert-base-spanish-wwm-cased")
tokenizer.save_pretrained("models_spanish/bert-base-spanish-wwm-cased")

('models_spanish/bert-base-spanish-wwm-cased/tokenizer_config.json',
 'models_spanish/bert-base-spanish-wwm-cased/special_tokens_map.json',
 'models_spanish/bert-base-spanish-wwm-cased/vocab.txt',
 'models_spanish/bert-base-spanish-wwm-cased/added_tokens.json',
 'models_spanish/bert-base-spanish-wwm-cased/tokenizer.json')

### Get previous YES statements

In [161]:
def get_ids_spanish(yes_ids, test_path="/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json"):
    """Split the Spanish test tweets by whether their id is in ``yes_ids``.

    Args:
        yes_ids: Iterable of tweet ids (same type as the ``id_EXIST`` values in
            the JSON file) that were predicted as sexist.
        test_path: Path to the test JSON file: a mapping whose values are dicts
            with the keys ``lang``, ``id_EXIST`` and ``tweet``.

    Returns:
        ``(yes_ids_filtered, yes_tweets, no_ids_filtered, no_tweets)``: four
        lists in file order, where the ``yes_*`` lists hold the Spanish tweets
        whose id is in ``yes_ids`` and the ``no_*`` lists hold the rest.
    """
    # A set makes each membership test O(1) instead of scanning a list.
    yes_lookup = set(yes_ids)

    # JSON is UTF-8; be explicit so the platform default encoding is never used
    # (Spanish tweets contain non-ASCII characters).
    with open(test_path, "r", encoding="utf-8") as f:
        test_json = json.load(f)

    # Spanish tweets only
    spanish_tweets = [item for item in test_json.values() if item["lang"] == "es"]

    # Single pass: route every tweet to the YES or the NO bucket.
    yes_ids_filtered, yes_tweets = [], []
    no_ids_filtered, no_tweets = [], []
    for t in spanish_tweets:
        if t["id_EXIST"] in yes_lookup:
            yes_ids_filtered.append(t["id_EXIST"])
            yes_tweets.append(t["tweet"])
        else:
            no_ids_filtered.append(t["id_EXIST"])
            no_tweets.append(t["tweet"])

    print(f"Spanish tweets predicted YES: {len(yes_tweets)}")
    print(f"Spanish tweets predicted NO: {len(no_tweets)}")

    return yes_ids_filtered, yes_tweets, no_ids_filtered, no_tweets


In [162]:
# Load your BoW predictions from the JSON file
with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/Task 1.1/Bert/bert_task1_submission_Spanish.json", "r") as f:
    bert_results = json.load(f)

# Extract the IDs that were predicted as "YES"
yes_ids_bert = [entry["id"] for entry in bert_results if entry["value"] == "YES"]
yes_ids_bert = [id_.replace("id_", "") for id_ in yes_ids_bert]


print(yes_ids_bert[:5])

['500004', '500012', '500019', '500020', '500022']


### Prediction

In [165]:
# --- 1. Select the Spanish test tweets that Task 1 predicted as YES ---
filtered_ids, filtered_tweets, no_ids, no_tweets = get_ids_spanish(yes_ids_bert)

# NOTE(review): cleaned_filtered_tweets is not used below; the dataset is built
# from the raw filtered_tweets.
cleaned_filtered_tweets = clean_text(filtered_tweets)

# --- 2. Load model + tokenizer + label encoder ---
model_path = "models_spanish/bert-base-spanish-wwm-cased"  # path to your trained model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

# Rebuild the label mapping by hand. LabelEncoder sorts its classes, so the
# order given to fit() does not matter.
# NOTE(review): assumes the training labels were exactly these three classes,
# so the ids match the encoder fitted inside the pipeline -- confirm.
labelEnc = LabelEncoder()
labelEnc.fit(["DIRECT", "REPORTED", "JUDGEMENTAL"])

# --- 3. Move the model to the GPU when one is available ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- 4. Create dataset & dataloader ---
test_dataset = SexismDataset(
    texts=filtered_tweets,
    labels=None,                     # No labels for inference
    ids=filtered_ids,
    tokenizer=tokenizer,
    max_len=128
)
test_loader = DataLoader(test_dataset, batch_size=32)

# --- 5. Predict ---
all_preds = []

# no_grad: inference only, so no gradients are tracked
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Highest-scoring class per tweet
        preds = torch.argmax(outputs.logits, axis=1)
        all_preds.extend(preds.cpu().numpy())

# --- 6. Decode labels ---
# Map class ids back to label strings; order matches filtered_ids because the
# DataLoader is not shuffled.
pred_labels = labelEnc.inverse_transform(all_preds)

Spanish tweets predicted YES: 567
Spanish tweets predicted NO: 531
['500001', '500002', '500003', '500005', '500006']


### Save to json

In [169]:
# After decoding pred_labels for the YES tweets
output_json = []

# YES predictions
for tweet_id, label in zip(filtered_ids, pred_labels):
    output_json.append({
        "id": str(tweet_id),
        "value": label,
        "test_case": "EXIST2025"
    })

# NO predictions (no task 2 value assigned)
for tweet_id in no_ids:
    output_json.append({
        "id": str(tweet_id),
        "value": "NO",  # Placeholder
        "test_case": "EXIST2025"
    })

# Optional: Sort by ID
output_json = sorted(output_json, key=lambda x: int(x["id"]))

# Save
with open("bert_task2_submission_Spanish.json", "w") as f:
    json.dump(output_json, f, indent=4)

print("Saved predictions including NO tweets to 'bert_task2_submission_Spanish.json'")


Saved predictions including NO tweets to 'bert_task2_submission_Spanish.json'


### File Merge

In [170]:
# Per-language submission files written by the "Save to json" cells, and the
# file that will hold the merged submission.
filename_english = "bert_task2_submission.json"
filename_spanish = "bert_task2_submission_Spanish.json"
filename_merged = "bert_task2_submission_merge.json"

def merge_predictions(filename_english, filename_spanish, filename_merged):
    """Concatenate the English and Spanish prediction files into one JSON file.

    Both inputs are read as UTF-8 JSON. The merged file contains the Spanish
    entries followed by the English ones and is written with non-ASCII
    characters kept as-is and a 2-space indent. A summary line is printed.

    Args:
        filename_english: Path to the English predictions (JSON list).
        filename_spanish: Path to the Spanish predictions (JSON list).
        filename_merged: Path of the merged output file.
    """
    def _read_json(path):
        # Small local reader so both inputs are opened the same way.
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    preds_en = _read_json(filename_english)
    preds_es = _read_json(filename_spanish)

    # Spanish entries first, then English.
    merged_preds = preds_es + preds_en

    with open(filename_merged, "w", encoding="utf-8") as f_out:
        json.dump(merged_preds, f_out, ensure_ascii=False, indent=2)

    print(f"Merged {len(preds_en)} EN + {len(preds_es)} ES = {len(merged_preds)} total predictions.")
    
    
# Build the combined EN + ES submission file.
merge_predictions(filename_english, filename_spanish, filename_merged)

Merged 978 EN + 1098 ES = 2076 total predictions.
