### Lnr Project Task 1.3 #  Logistic Regression + TF-IDF (Bag of words)

Niklas Dahlbom, ndahlbom@kth.se, ndahlbo@upv.edu.es

### Imports

In [71]:
from readerEXIST2025 import EXISTReader
import re
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.multiclass import OneVsRestClassifier
import numpy as np



### Load Dataset

In [72]:
reader_train = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/training/EXIST2025_training.json")
reader_dev = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/dev/EXIST2025_dev.json")
reader_test = EXISTReader("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json")

EnTrainTask3, EnDevTask3 = reader_train.get(lang="EN", subtask="3"), reader_dev.get(lang="EN", subtask="3")
SpTrainTask3, SpDevTask3 = reader_train.get(lang="ES", subtask="3"), reader_dev.get(lang="ES", subtask="3")

SpTestTask3, EnTestTask3 = reader_test.get(lang="ES", subtask="3", include_ambiguous=True),  reader_test.get(lang="EN", subtask="3", include_ambiguous=True)

print(EnTrainTask3[1])
print("-------------------")

1       Writing a uni essay in my local pub with a cof...
2       @UniversalORL it is 2021 not 1921. I dont appr...
5       According to a customer I have plenty of time ...
6       So only 'blokes' drink beer? Sorry, but if you...
7       New to the shelves this week - looking forward...
                              ...                        
3255    idk why y’all bitches think having half your a...
3256    This has been a part of an experiment with @Wo...
3257    "Take me already" "Not yet. You gotta be ready...
3258    @clintneedcoffee why do you look like a whore?...
3259    ik when mandy says “you look like a whore” i l...
Name: text, Length: 2095, dtype: object
-------------------


### Preprocessing

In [73]:
def clean_text(text_list):
    cleaned_corpus = []
    for text in text_list:
        text = text.lower()
        text = re.sub(r"https?://\S+", "", text)  # Removes URLs
        text = re.sub(r"@\w+", "", text)          # Removes mentions
        text = text.replace("#", "")              # Removes Hashtags
        text = re.sub(r"\s+", " ", text).strip()   # Removes spaces
        cleaned_corpus.append(text)
    return cleaned_corpus

In [74]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, multilabel_confusion_matrix

def compute_metrics(y_true, y_pred):
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    acc = accuracy_score(y_true, y_pred)

    print(f"Accuracy (subset accuracy): {acc:.3f}")
    print(f"Macro F1: {f1:.3f}")
    print(f"Macro Precision: {precision:.3f}")
    print(f"Macro Recall: {recall:.3f}")

    # Compute multilabel confusion matrix
    icm_matrices = multilabel_confusion_matrix(y_true, y_pred)

    # Sum TP, FP, FN across all classes
    TP = 0
    FP = 0
    FN = 0
    for matrix in icm_matrices:
        tn, fp, fn, tp = matrix.ravel()
        TP += tp
        FP += fp
        FN += fn

    icm_value = TP / (TP + FP + FN) if (TP + FP + FN) > 0 else 0
    print(f"Singular ICM value (overall label accuracy): {icm_value:.3f}")


In [75]:
def save_predictions_to_json(yes_ids, yes_labels, output_path, lang="en"):
    import json

    # Ensure yes_ids are strings
    yes_ids = [str(id_).replace("id_", "") for id_ in yes_ids]

    # Load test data
    with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json", "r") as f:
        test_json = json.load(f)

    # Get all IDs for the selected language
    all_ids = [
        str(tweet_data["id_EXIST"])
        for tweet_data in test_json.values()
        if tweet_data["lang"] == lang
    ]

    # Determine NO IDs
    yes_ids_set = set(yes_ids)
    all_ids_set = set(all_ids)
    no_ids = list(all_ids_set - yes_ids_set)

    # Build output JSON
    output_data = []

    # Add YES predictions
    for tweet_id, label in zip(yes_ids, yes_labels):
        output_data.append({
            "id": tweet_id,
            "value": label,
            "test_case": "EXIST2025"
        })

    # Add NO predictions
    for tweet_id in no_ids:
        output_data.append({
            "id": tweet_id,
            "value": "NO",
            "test_case": "EXIST2025"
        })

    # Sort by ID numerically
    output_data.sort(key=lambda x: int(x["id"]))

    # Save to file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(output_data)} predictions to {output_path} (YES: {len(yes_ids)}, NO: {len(no_ids)})")


In [76]:
# === 2. Extract training and validation data ===
training_tweets_raw = EnTrainTask3[1]      # Raw tweet texts from the training set
training_labels_raw = EnTrainTask3[2]      # Corresponding labels for training tweets

validation_tweets_raw = EnDevTask3[1]      # Raw tweet texts from the validation set
validation_labels_raw = EnDevTask3[2]      # Corresponding labels for validation tweets

# === 3. Clean the tweets (e.g., remove URLs, mentions, special characters) ===
cleaned_training_tweets = clean_text(training_tweets_raw)
cleaned_validation_tweets = clean_text(validation_tweets_raw)

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform on training labels
encoded_training_labels = mlb.fit_transform(training_labels_raw)

# Transform validation labels using the same mlb (no refitting)
encoded_validation_labels = mlb.transform(validation_labels_raw)

print("Classes:", mlb.classes_)
print("Encoded shape:", encoded_training_labels.shape)


Classes: ['IDEOLOGICAL-INEQUALITY' 'MISOGYNY-NON-SEXUAL-VIOLENCE' 'OBJECTIFICATION'
 'SEXUAL-VIOLENCE' 'STEREOTYPING-DOMINANCE']
Encoded shape: (2095, 5)


### Train model

In [None]:
# === 4. Encode string labels as multi-label binarized vectors ===
mlb = MultiLabelBinarizer()
encoded_training_labels = mlb.fit_transform(training_labels_raw)
encoded_validation_labels = mlb.transform(validation_labels_raw)

# === 5. Convert cleaned text data into TF-IDF features ===
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # unigrams and bigrams
X_train_tfidf = tfidf_vectorizer.fit_transform(cleaned_training_tweets)
X_val_tfidf = tfidf_vectorizer.transform(cleaned_validation_tweets)

# === 6. Train a One-vs-Rest Logistic Regression classifier for multi-label ===
logistic_regressor = R(LogisticRegression(max_iter=1000, class_weight='balanced'))
logistic_regressor.fit(X_train_tfidf, encoded_training_labels)

# === 7. Predict and evaluate on the validation set ===
validation_predictions = logistic_regressor.predict(X_val_tfidf)

# Calculate and display custom evaluation metrics
compute_metrics(encoded_validation_labels, validation_predictions)

print("Note: This evaluation is performed on the full validation set, not just tweets predicted as YES by previous models.")

Accuracy (subset accuracy): 0.136
Macro F1: 0.652
Macro Precision: 0.699
Macro Recall: 0.614
Singular ICM value (overall label accuracy): 0.492
Note: This evaluation is performed on the full validation set, not just tweets predicted as YES by previous models.


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_

In [78]:
class_counts = encoded_training_labels.sum(axis=0)  # Sum over samples for each class

for cls, count in zip(mlb.classes_, class_counts):
    print(f"Class {cls}: {count} samples")

Class IDEOLOGICAL-INEQUALITY: 1209 samples
Class MISOGYNY-NON-SEXUAL-VIOLENCE: 868 samples
Class OBJECTIFICATION: 1083 samples
Class SEXUAL-VIOLENCE: 651 samples
Class STEREOTYPING-DOMINANCE: 1337 samples


### Get previous YES statements

In [79]:
def get_ids(ids):
    with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json", "r") as f:
        test_json = json.load(f)
    # Filter to only English tweets in test set (optional sanity check)
    tweets_all = [item for item in test_json.values() if item["lang"] == "en"]

    print(f"Total English tweets in test set: {len(tweets_all)}")

    # Filter only English test tweets where id_EXIST is in yes_ids
    yes_tweets = [
        tweet_data["tweet"]
        for tweet_data in test_json.values()
        if tweet_data["lang"] == "en" and tweet_data["id_EXIST"] in ids
    ]

    yes_en_ids = [
        tweet_data["id_EXIST"]
        for tweet_data in test_json.values()
        if tweet_data["lang"] == "en" and tweet_data["id_EXIST"] in ids
    ]

    print(f"Number of English tweets predicted YES: {len(yes_tweets)}")
    print(yes_en_ids[:5])
    print(yes_tweets[:5])
    
    return yes_en_ids, yes_tweets

### Bert (Best perfomance in tast 1.1)

In [80]:
# Load your BoW predictions from the JSON file
with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/Task 1.1/Bert/bert_task1_submission.json", "r") as f:
    bert_results = json.load(f)

# Extract the IDs that were predicted as "YES"
yes_ids_bert = [entry["id"] for entry in bert_results if entry["value"] == "YES"]
yes_ids_bert = [id_.replace("id_", "") for id_ in yes_ids_bert]


print(yes_ids_bert[:5])

['600002', '600004', '600005', '600013', '600018']


In [81]:
# Assuming yes_ids_bert is a list of IDs to filter on
filtered_ids, filtered_tweets = get_ids(yes_ids_bert)

# Clean the filtered tweets
cleaned_filtered_tweets = clean_text(filtered_tweets)

# Transform tweets to TF-IDF features
X_filtered_tfidf = tfidf_vectorizer.transform(cleaned_filtered_tweets)

# Predict multi-label outputs (binary indicator arrays)
filtered_predictions = logistic_regressor.predict(X_filtered_tfidf)

# Convert binary predictions back to label sets (list of labels per tweet)
filtered_pred_labels = mlb.inverse_transform(filtered_predictions)

# Print first 5 tweets with their predicted labels
for tweet, labels in zip(filtered_tweets[:5], filtered_pred_labels[:5]):
    print(f"Tweet: {tweet}\nPredicted labels: {labels}\n")

# Save predictions to JSON (adjusted for multi-label format)
save_predictions_to_json(filtered_ids, filtered_pred_labels, "BoW_task3_submission.json", lang="en")


Total English tweets in test set: 978
Number of English tweets predicted YES: 460
['600002', '600004', '600005', '600013', '600018']
["@Cathy_TwoHalves @EverydaySexism That's why women have to stay at home, raise children and take care of the house.Nobody is going to assume that a woman is a leader among men since that's not how it is supposed to be in nature. Of course, you can try and fake it, but you will keep losing your feminine side.", '@JonPincince @kmulvane @EverydaySexism, even after Roe v. Wade was overturned. Fun! /s', 'Seriously @JohnLewisRetail?  girls are pictured with ‘I’m a Stylist’ doll and the pizza oven - the boys get a robot and hot wheels garage. #EverydaySexism https://t.co/mKsbxaGgIV? # via @HPUKParents', '@tired_of_debate @ScoutSleepe @ImWatson91 The fact that some men assault transwomen too is awful &amp; unacceptable.Having said that—being harassed by men does NOT render MtFs “women” (+men *do* harass other men sometimes, too…)+It is notable that—for ALL sexua

### Spanish

In [82]:
# === 2. Extract training and validation data ===
training_tweets_raw = SpTrainTask3[1]      # Raw tweet texts from the training set
training_labels_raw = SpTrainTask3[2]      # Corresponding labels for training tweets

validation_tweets_raw = SpDevTask3[1]      # Raw tweet texts from the validation set
validation_labels_raw = SpDevTask3[2]      # Corresponding labels for validation tweets

# === 3. Clean the tweets (e.g., remove URLs, mentions, special characters) ===
cleaned_training_tweets = clean_text(training_tweets_raw)
cleaned_validation_tweets = clean_text(validation_tweets_raw)

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform on training labels
encoded_training_labels = mlb.fit_transform(training_labels_raw)

# Transform validation labels using the same mlb (no refitting)
encoded_validation_labels = mlb.transform(validation_labels_raw)

print("Classes:", mlb.classes_)
print("Encoded shape:", encoded_training_labels.shape)


Classes: ['IDEOLOGICAL-INEQUALITY' 'MISOGYNY-NON-SEXUAL-VIOLENCE' 'OBJECTIFICATION'
 'SEXUAL-VIOLENCE' 'STEREOTYPING-DOMINANCE']
Encoded shape: (2513, 5)


In [83]:
# === 4. Encode string labels as multi-label binarized vectors ===
mlb = MultiLabelBinarizer()
encoded_training_labels = mlb.fit_transform(training_labels_raw)
encoded_validation_labels = mlb.transform(validation_labels_raw)

# === 5. Convert cleaned text data into TF-IDF features ===
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # unigrams and bigrams
X_train_tfidf = tfidf_vectorizer.fit_transform(cleaned_training_tweets)
X_val_tfidf = tfidf_vectorizer.transform(cleaned_validation_tweets)

# === 6. Train a One-vs-Rest Logistic Regression classifier for multi-label ===
logistic_regressor = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
logistic_regressor.fit(X_train_tfidf, encoded_training_labels)

# === 7. Predict and evaluate on the validation set ===
validation_predictions = logistic_regressor.predict(X_val_tfidf)

# Calculate and display custom evaluation metrics
compute_metrics(encoded_validation_labels, validation_predictions)

print("Note: This evaluation is performed on the full validation set, not just tweets predicted as YES by previous models.")

Accuracy (subset accuracy): 0.115
Macro F1: 0.665
Macro Precision: 0.681
Macro Recall: 0.651
Singular ICM value (overall label accuracy): 0.506
Note: This evaluation is performed on the full validation set, not just tweets predicted as YES by previous models.


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_

In [84]:
class_counts = encoded_training_labels.sum(axis=0)  # Sum over samples for each class

for cls, count in zip(mlb.classes_, class_counts):
    print(f"Class {cls}: {count} samples")

Class IDEOLOGICAL-INEQUALITY: 1460 samples
Class MISOGYNY-NON-SEXUAL-VIOLENCE: 1200 samples
Class OBJECTIFICATION: 1359 samples
Class SEXUAL-VIOLENCE: 820 samples
Class STEREOTYPING-DOMINANCE: 1656 samples


### Bert (Best performance in task 1.1)

In [85]:
def get_ids_spanish(ids):
    with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json", "r") as f:
        test_json = json.load(f)
    # Filter to only English tweets in test set (optional sanity check)
    tweets_all = [item for item in test_json.values() if item["lang"] == "en"]

    print(f"Total English tweets in test set: {len(tweets_all)}")

    # Filter only English test tweets where id_EXIST is in yes_ids
    yes_tweets = [
        tweet_data["tweet"]
        for tweet_data in test_json.values()
        if tweet_data["lang"] == "es" and tweet_data["id_EXIST"] in ids
    ]

    yes_en_ids = [
        tweet_data["id_EXIST"]
        for tweet_data in test_json.values()
        if tweet_data["lang"] == "es" and tweet_data["id_EXIST"] in ids
    ]

    print(f"Number of English tweets predicted YES: {len(yes_tweets)}")
    print(yes_en_ids[:5])
    print(yes_tweets[:5])
    
    return yes_en_ids, yes_tweets

In [86]:
# Load your BoW predictions from the JSON file
with open("/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/Task 1.1/Bert/bert_task1_submission_Spanish.json", "r") as f:
    bert_results = json.load(f)

# Extract the IDs that were predicted as "YES"
yes_ids_bert = [entry["id"] for entry in bert_results if entry["value"] == "YES"]
yes_ids_bert = [id_.replace("id_", "") for id_ in yes_ids_bert]


print(yes_ids_bert[:5])

['500004', '500012', '500019', '500020', '500022']


In [87]:
# Assuming yes_ids_bert is a list of IDs to filter on
filtered_ids, filtered_tweets = get_ids_spanish(yes_ids_bert)

# Clean the filtered tweets
cleaned_filtered_tweets = clean_text(filtered_tweets)

# Transform tweets to TF-IDF features
X_filtered_tfidf = tfidf_vectorizer.transform(cleaned_filtered_tweets)

# Predict multi-label outputs (binary indicator arrays)
filtered_predictions = logistic_regressor.predict(X_filtered_tfidf)

# Convert binary predictions back to label sets (list of labels per tweet)
filtered_pred_labels = mlb.inverse_transform(filtered_predictions)

# Print first 5 tweets with their predicted labels
for tweet, labels in zip(filtered_tweets[:5], filtered_pred_labels[:5]):
    print(f"Tweet: {tweet}\nPredicted labels: {labels}\n")

# Save predictions to JSON (adjusted for multi-label format)
save_predictions_to_json(filtered_ids, filtered_pred_labels, "BoW_task3_submission_Spanish.json", lang="es")

Total English tweets in test set: 978
Number of English tweets predicted YES: 567
['500004', '500012', '500019', '500020', '500022']
['@jordirico Primero fue internet, luego el gamergate, la manosfera y su misoginia extrema sin que las plataformas movieran un dedo, los incel-asesinatos, la extrema derecha capitalizando el odio, la mimetización de estas ideas con las de un debate respetable y finalmente aquí estamos', '@dimplerrylover lo se pero literalmente la chica estaba siendo harassed like como puede sonreir asi', '@Fistroman1 @ElioGatsby Sin pruebas??Qué fue en público.Qué lo vimos todos, qué me estás contando.Que no me mezcles churras con merinas.Qué de aquí a que me vengas con el metoo y el notallmen queda poquito y no me apetece acabar discutiendo de verdad, q lo veo venir.', '@EstefaniaVeloz ...Con el proceso legal, no para hacer arder a una persona (No sé ni quienes sean en este caso). Hay mujeres que son víctimas pero también hay corruptas, violentas y victimarias. El movimi

### Merge files

In [88]:
filename_english = "BoW_task3_submission.json"
filename_spanish = "BoW_task3_submission_Spanish.json"
filename_merged = "BoW_task3_submission_merge.json"

def merge_predictions(filename_english, filename_spanish, filename_merged):
    # Load English predictions
    with open(filename_english, "r", encoding="utf-8") as f_en:
        preds_en = json.load(f_en)

    # Load Spanish predictions
    with open(filename_spanish, "r", encoding="utf-8") as f_es:
        preds_es = json.load(f_es)

    # Merge the two lists
    merged_preds = preds_es + preds_en

    # Save the combined predictions
    with open(filename_merged, "w", encoding="utf-8") as f_out:
        json.dump(merged_preds, f_out, ensure_ascii=False, indent=2)

    print(f"Merged {len(preds_en)} EN + {len(preds_es)} ES = {len(merged_preds)} total predictions.")
    
    
merge_predictions(
    filename_english,
    filename_spanish,
    filename_merged
)

Merged 978 EN + 1098 ES = 2076 total predictions.
