### Lnr Project Task 1.1 Gradient Boosted Trees (e.g., XGBoost) + TF-IDF

Niklas Dahlbom, ndahlbom@kth.se, ndahlbo@upv.edu.es

In [2]:
import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from xgboost import XGBClassifier

from readerEXIST2025 import EXISTReader

### Read datasets

In [3]:
# Root folder of the EXIST 2025 dataset. Defaults to the path used when the
# notebook was authored; set the EXIST_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "EXIST_DATA_DIR",
    "/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset",
)

# One reader per dataset split (training / dev / test).
reader_train = EXISTReader(os.path.join(DATA_DIR, "training", "EXIST2025_training.json"))
reader_dev = EXISTReader(os.path.join(DATA_DIR, "dev", "EXIST2025_dev.json"))
reader_test = EXISTReader(os.path.join(DATA_DIR, "test", "EXIST2025_test_clean.json"))

# Subtask 1 data per language for the train and dev splits.
EnTrainTask1, EnDevTask1 = reader_train.get(lang="EN", subtask="1"), reader_dev.get(lang="EN", subtask="1")
SpTrainTask1, SpDevTask1 = reader_train.get(lang="ES", subtask="1"), reader_dev.get(lang="ES", subtask="1")

# The test split is read with include_ambiguous=True, unlike train/dev above.
SpTestTask1, EnTestTask1 = reader_test.get(lang="ES", subtask="1", include_ambiguous=True),  reader_test.get(lang="EN", subtask="1", include_ambiguous=True)

# Quick look at element 1 of the English training data (the tweet texts).
print(EnTrainTask1[1])
print("-------------------")

1       Writing a uni essay in my local pub with a cof...
2       @UniversalORL it is 2021 not 1921. I dont appr...
5       According to a customer I have plenty of time ...
6       So only 'blokes' drink beer? Sorry, but if you...
7       New to the shelves this week - looking forward...
                              ...                        
3255    idk why y’all bitches think having half your a...
3256    This has been a part of an experiment with @Wo...
3257    "Take me already" "Not yet. You gotta be ready...
3258    @clintneedcoffee why do you look like a whore?...
3259    ik when mandy says “you look like a whore” i l...
Name: text, Length: 2870, dtype: object
-------------------


In [4]:
from collections import Counter

# Tally the English training labels (element 2 of the training data) to
# see how balanced the two classes are.
label_counts = Counter()
label_counts.update(EnTrainTask1[2])

n_no, n_yes = label_counts["NO"], label_counts["YES"]
print("NO:", n_no)
print("YES:", n_yes)

NO: 1733
YES: 1137


### Preprocessing

In [5]:
def clean_text(text_list):
    """Normalise an iterable of tweets and return them as a new list.

    Each text is lower-cased, stripped of http(s) URLs and @mentions, has
    every '#' character removed (the hashtag word itself is kept), and has
    runs of whitespace collapsed to a single space with the ends trimmed.

    Args:
        text_list: iterable of strings.

    Returns:
        list[str]: cleaned texts, in the same order as the input.
    """
    def _normalise(raw):
        lowered = raw.lower()
        without_urls = re.sub(r"https?://\S+", "", lowered)
        without_mentions = re.sub(r"@\w+", "", without_urls)
        without_hashes = without_mentions.replace("#", "")
        # Collapse whitespace last: the removals above can leave gaps behind.
        return re.sub(r"\s+", " ", without_hashes).strip()

    return [_normalise(text) for text in text_list]

In [None]:
# English split: element 1 holds the tweet texts, element 2 the "YES"/"NO" labels.
train_texts, train_labels = EnTrainTask1[1], EnTrainTask1[2]
dev_texts, dev_labels = EnDevTask1[1], EnDevTask1[2]

# Encode labels as integers: "YES" -> 1, anything else -> 0.
train_labels_bin = [int(label == "YES") for label in train_labels]
dev_labels_bin = [int(label == "YES") for label in dev_labels]

# Apply the same text normalisation to the train, dev and test tweets.
train_texts_clean, dev_texts_clean, test_texts_clean = map(
    clean_text, (train_texts, dev_texts, EnTestTask1[1])
)


### Training and Evaluation

In [7]:
def compute_metrics(y_true, y_pred):
    """Print accuracy, F1, precision and recall for binary predictions.

    Precision/recall/F1 are computed for the positive class (label 1) with
    ``average='binary'``; undefined ratios are reported as 0.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted binary labels.

    Returns:
        None. The four scores are printed with three decimals.
    """
    scores = precision_recall_fscore_support(
        y_true,
        y_pred,
        average='binary',
        zero_division=0,
        pos_label=1,
    )
    # The fourth element (support) is not reported.
    precision, recall, f1 = scores[0], scores[1], scores[2]
    acc = accuracy_score(y_true, y_pred)

    print(f"Accuracy: {acc:.3f}")
    print(f"Binary F1: {f1:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")


### Train, evaluate, and save English predictions to CSV

In [8]:
# === Vectorize the tweets ===
# The vectorizer is fitted on the training tweets only; dev and test tweets
# are transformed with that same fitted vocabulary.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(train_texts_clean)
X_dev_tfidf = tfidf.transform(dev_texts_clean)
X_test_tfidf = tfidf.transform(test_texts_clean)

# === Train the model ===
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, and the labels are already 0/1 integers.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_tfidf, train_labels_bin)

# === Evaluate on dev set ===
dev_preds_bin = xgb_clf.predict(X_dev_tfidf)
compute_metrics(dev_labels_bin, dev_preds_bin)

# === Predict on test set ===
test_preds_bin = xgb_clf.predict(X_test_tfidf)
# Map the binary predictions back to the "YES"/"NO" label strings.
test_preds_label = ["YES" if p == 1 else "NO" for p in test_preds_bin]

# === Save test predictions to CSV ===
submission_test_df = pd.DataFrame({
    "id": EnTestTask1[0],
    "label": test_preds_label,
    "test_case": ["EXIST2025"] * len(test_preds_label)
})
submission_test_df.to_csv("xgb_predictions_task1.csv", index=False)
print("Saved test predictions to xgb_predictions_task1.csv ✅")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.797
Binary F1: 0.747
Precision: 0.821
Recall: 0.686
Saved test predictions to xgb_predictions_task1.csv ✅


### Save to json

In [9]:
import json

# Load the English predictions written to CSV by the training cell.
df = pd.read_csv("xgb_predictions_task1.csv")

# Build one record per prediction.
# "id" is the CSV id rendered as a string; no prefix is added.
# NOTE(review): an earlier comment mentioned an 'id_' prefix - confirm the
# required id format against the submission guidelines.
# "value" is the hard YES/NO label.
results_json = [
    {
        "id": f"{row_id}",
        "value": label,
        "test_case": test_case,
    }
    for row_id, label, test_case in zip(df["id"], df["label"], df["test_case"])
]

# Save to JSON file. The encoding is explicit so it matches the UTF-8 read
# performed when the per-language files are merged.
with open("xgb_task1_submission.json", "w", encoding="utf-8") as f:
    json.dump(results_json, f, indent=2)

print("Saved to xgb_task1_submission.json ✅")


Saved to xgb_task1_submission.json ✅


### Spanish

In [10]:
# Extract tweets and labels from EnTrainTask1 and EnDevTask1
train_texts = SpTrainTask1[1]  # list of tweets
train_labels = SpTrainTask1[2] # list of labels ("YES" or "NO")

dev_texts = SpDevTask1[1]
dev_labels = SpDevTask1[2]

# Convert labels to binary (1 for YES, 0 for NO)
train_labels_bin = [1 if label == "YES" else 0 for label in train_labels]
dev_labels_bin = [1 if label == "YES" else 0 for label in dev_labels]

# Clean the tweets
train_texts_clean = clean_text(train_texts)
dev_texts_clean = clean_text(dev_texts)
test_texts_clean = clean_text(SpTestTask1[1])  # Clean test texts

### Train, evaluate, and save Spanish predictions to CSV

In [11]:
# Hand-maintained list of Spanish stop words (articles, prepositions,
# pronouns, possessives and forms of "estar"). It is passed as `stop_words`
# to the TfidfVectorizer used for the Spanish tweets.
# Entries are lower-case and keep their accents.
spanish_stopwords = [
    'de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las',
    'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como',
    'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta',
    'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta',
    'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos',
    'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos',
    'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro',
    'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes',
    'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas',
    'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus',
    'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía',
    'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya',
    'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras',
    'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas',
    'estoy', 'estás', 'está', 'estamos', 'estáis', 'están', 'esté',
    'estés', 'estemos', 'estéis', 'estén', 'estaré', 'estarás', 'estará',
    'estaremos', 'estaréis', 'estarán', 'estaría', 'estarías',
    'estaríamos', 'estaríais', 'estarían', 'estaba', 'estabas', 'estábamos',
    'estabais', 'estaban'
]


In [12]:
# === Vectorize the tweets ===
# The vectorizer is fitted on the Spanish training tweets only; dev and test
# tweets are transformed with that same fitted vocabulary.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words=spanish_stopwords)
X_train_tfidf = tfidf.fit_transform(train_texts_clean)
X_dev_tfidf = tfidf.transform(dev_texts_clean)
X_test_tfidf = tfidf.transform(test_texts_clean)

# === Train the model ===
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, and the labels are already 0/1 integers.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_tfidf, train_labels_bin)

# === Evaluate on dev set ===
dev_preds_bin = xgb_clf.predict(X_dev_tfidf)
compute_metrics(dev_labels_bin, dev_preds_bin)

# === Predict on test set ===
test_preds_bin = xgb_clf.predict(X_test_tfidf)
# Map the binary predictions back to the "YES"/"NO" label strings.
test_preds_label = ["YES" if p == 1 else "NO" for p in test_preds_bin]

# === Save test predictions to CSV ===
submission_test_df = pd.DataFrame({
    "id": SpTestTask1[0],
    "label": test_preds_label,
    "test_case": ["EXIST2025"] * len(test_preds_label)
})
submission_test_df.to_csv("xgb_predictions_task1_Spanish.csv", index=False)
print("Saved test predictions to xgb_predictions_task1_Spanish.csv ✅")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.755
Binary F1: 0.752
Precision: 0.816
Recall: 0.697
Saved test predictions to xgb_predictions_task1_Spanish.csv ✅


### Save to json

In [13]:
import json

# Load the Spanish predictions written to CSV by the training cell.
df = pd.read_csv("xgb_predictions_task1_Spanish.csv")

# Build one record per prediction.
# "id" is the CSV id rendered as a string; no prefix is added.
# NOTE(review): an earlier comment mentioned an 'id_' prefix - confirm the
# required id format against the submission guidelines.
# "value" is the hard YES/NO label.
results_json = [
    {
        "id": f"{row_id}",
        "value": label,
        "test_case": test_case,
    }
    for row_id, label, test_case in zip(df["id"], df["label"], df["test_case"])
]

# Save to JSON file. The encoding is explicit so it matches the UTF-8 read
# performed when the per-language files are merged.
with open("xgb_task1_submission_Spanish.json", "w", encoding="utf-8") as f:
    json.dump(results_json, f, indent=2)

print("Saved to xgb_task1_submission_Spanish.json ✅")


Saved to xgb_task1_submission_Spanish.json ✅


### Merge

In [14]:
# Per-language submission files to combine, and the file receiving the result.
filename_english, filename_spanish, filename_merged = (
    "xgb_task1_submission.json",
    "xgb_task1_submission_Spanish.json",
    "xgb_task1_submission_merged.json",
)

def merge_predictions(filename_english, filename_spanish, filename_merged):
    """Concatenate two JSON prediction lists and write them to one file.

    Args:
        filename_english: path of the JSON file with the English predictions.
        filename_spanish: path of the JSON file with the Spanish predictions.
        filename_merged: path of the JSON file to write.

    The merged list contains the Spanish entries first, followed by the
    English ones. It is written as UTF-8 with non-ASCII characters kept
    unescaped and an indent of 2. A one-line summary of the counts is printed.
    """
    def _read_json(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    preds_en = _read_json(filename_english)
    preds_es = _read_json(filename_spanish)

    # Spanish predictions go first, English ones after.
    merged_preds = preds_es + preds_en

    with open(filename_merged, "w", encoding="utf-8") as f_out:
        json.dump(merged_preds, f_out, ensure_ascii=False, indent=2)

    print(f"Merged {len(preds_en)} EN + {len(preds_es)} ES = {len(merged_preds)} total predictions.")
    
    
# Combine the English and Spanish submission files into the merged file.
merge_predictions(
    filename_english=filename_english,
    filename_spanish=filename_spanish,
    filename_merged=filename_merged,
)

Merged 978 EN + 1098 ES = 2076 total predictions.
