### Lnr Project Task 1.1: Logistic Regression + TF-IDF (Bag of words)

Niklas Dahlbom, ndahlbom@kth.se, ndahlbo@upv.edu.es

### Installs


In [None]:
!pip install transformers --upgrade
!pip install datasets accelerate --upgrade
!pip install peft --upgrade
!pip install jupyter --upgrade
!pip install ipywidgets --upgrade

### Imports

In [1]:
import json
import re
from collections import Counter

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

from readerEXIST2025 import EXISTReader

### Read datasets

In [2]:
# Root folder of the EXIST 2025 tweets dataset. This is the only line that
# has to be edited to run the notebook on another machine.
DATA_DIR = "/Users/niklasdahlbom/Documents/Valencia/Lnr/Project/EXIST 2025 Tweets Dataset"

# One reader per official partition.
reader_train = EXISTReader(f"{DATA_DIR}/training/EXIST2025_training.json")
reader_dev = EXISTReader(f"{DATA_DIR}/dev/EXIST2025_dev.json")
reader_test = EXISTReader(f"{DATA_DIR}/test/EXIST2025_test_clean.json")

# Subtask 1 data per language. The cells below index each result as
# [0] ids, [1] tweet texts, [2] labels.
EnTrainTask1 = reader_train.get(lang="EN", subtask="1")
EnDevTask1 = reader_dev.get(lang="EN", subtask="1")
SpTrainTask1 = reader_train.get(lang="ES", subtask="1")
SpDevTask1 = reader_dev.get(lang="ES", subtask="1")

# The test partition is requested with include_ambiguous=True
# (presumably so that no test tweet is dropped — confirm against the reader).
SpTestTask1 = reader_test.get(lang="ES", subtask="1", include_ambiguous=True)
EnTestTask1 = reader_test.get(lang="EN", subtask="1", include_ambiguous=True)

# Quick look at the English training texts.
print(EnTrainTask1[1])
print("-------------------")

1       Writing a uni essay in my local pub with a cof...
2       @UniversalORL it is 2021 not 1921. I dont appr...
5       According to a customer I have plenty of time ...
6       So only 'blokes' drink beer? Sorry, but if you...
7       New to the shelves this week - looking forward...
                              ...                        
3255    idk why y’all bitches think having half your a...
3256    This has been a part of an experiment with @Wo...
3257    "Take me already" "Not yet. You gotta be ready...
3258    @clintneedcoffee why do you look like a whore?...
3259    ik when mandy says “you look like a whore” i l...
Name: text, Length: 2870, dtype: object
-------------------


In [3]:
# Class balance of the English training labels. `Counter` comes from the
# notebook's import cell, so this cell no longer imports anything itself.
label_counts = Counter(EnTrainTask1[2])
for label in ("NO", "YES"):
    print(f"{label}:", label_counts[label])

NO: 1733
YES: 1137


### Preprocessing

In [4]:
def clean_text(text_list):
    """Normalise raw tweets before bag-of-words feature extraction.

    Every text is lower-cased, has its http(s) URLs and @mentions removed,
    has each ``#`` character dropped (the hashtag word itself is kept), and
    finally has every run of whitespace collapsed to a single space with
    leading/trailing whitespace trimmed.

    Args:
        text_list: Iterable of strings.

    Returns:
        A new list with one cleaned string per input text, in input order.
    """
    def _normalise(raw):
        lowered = raw.lower()
        without_urls = re.sub(r"https?://\S+", "", lowered)
        without_mentions = re.sub(r"@\w+", "", without_urls)
        without_hashes = without_mentions.replace("#", "")
        # Removing tokens leaves gaps behind; squeeze them last.
        return re.sub(r"\s+", " ", without_hashes).strip()

    return [_normalise(raw) for raw in text_list]

### Training and Evaluation

### Baseline: Logistic Regression + TF-IDF (Bag of words)

In [5]:
!pip install scikit-learn



In [5]:
def compute_metrics(y_true, y_pred):
    """Print accuracy and binary precision/recall/F1 of a set of predictions.

    Precision, recall and F1 are computed for class ``1`` as the positive
    class (``average='binary'``, ``pos_label=1``); ``zero_division=0`` is
    passed so that an undefined score is reported as 0.

    Args:
        y_true: Gold labels encoded as 0/1.
        y_pred: Predicted labels encoded as 0/1.

    Returns:
        None. The four scores are written to stdout with three decimals.
    """
    scores = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0, pos_label=1
    )
    precision, recall, f1 = scores[0], scores[1], scores[2]
    acc = accuracy_score(y_true, y_pred)

    report = [
        f"Accuracy: {acc:.3f}",
        f"Binary F1: {f1:.3f}",
        f"Precision: {precision:.3f}",
        f"Recall: {recall:.3f}",
    ]
    print("\n".join(report))


In [7]:
# English baseline: TF-IDF features + logistic regression.
# TfidfVectorizer, LogisticRegression and train_test_split come from the
# notebook's import cell.

# English training tweets and their gold YES/NO labels.
tweets = EnTrainTask1[1]
labels = EnTrainTask1[2]

tweets_clean = clean_text(tweets)

# The encoding below maps everything that is not "YES" to 0, so make sure no
# unexpected label value gets silently counted as "NO".
unexpected_labels = set(labels) - {"YES", "NO"}
assert not unexpected_labels, f"Unexpected labels in English training data: {unexpected_labels}"

# Encode the labels as integers: YES -> 1 (positive class), NO -> 0.
labels_bin = [1 if label == "YES" else 0 for label in labels]

# Hold out 20% of the training tweets for validation; the fixed seed keeps
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    tweets_clean, labels_bin, test_size=0.2, random_state=42
)

# Unigram + bigram TF-IDF features, capped at 5000. The vocabulary is fitted
# on the training portion only and then applied to the validation portion.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Validation metrics for the held-out portion.
y_pred = clf.predict(X_val_tfidf)
compute_metrics(y_val, y_pred)

Accuracy: 0.768
Binary F1: 0.645
Precision: 0.786
Recall: 0.548


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)


### Save to csv

In [7]:
# Score the English test split with the currently fitted `vectorizer`/`clf`
# and store the predictions as CSV.
test_ids, test_texts = EnTestTask1[0], EnTestTask1[1]
test_clean = clean_text(test_texts)  # same normalisation as the training tweets

X_test_tfidf = vectorizer.transform(test_clean)
test_preds_bin = clf.predict(X_test_tfidf)

# Translate the 1/0 predictions back into YES/NO label strings.
test_preds_label = [
    "YES" if pred == 1 else "NO"
    for pred in test_preds_bin
]

# One row per test tweet; every row carries the same test-case name.
submission_df = pd.DataFrame(
    {"id": test_ids, "label": test_preds_label}
).assign(test_case="EXIST2025")

submission_df.to_csv("BoW_predictions_task1.csv", index=False)
print("Saved predictions to BoW_predictions_task1.csv ✅")


Saved predictions to BoW_predictions_task1.csv ✅


### Save to json

In [10]:
# Convert the English CSV predictions into the JSON submission layout.
# `json` comes from the notebook's import cell.
df = pd.read_csv("BoW_predictions_task1.csv")

# One record per tweet: the id as a string (written as-is, no prefix), the
# hard YES/NO label as "value", and the test-case name.
results_json = [
    {"id": str(tweet_id), "value": label, "test_case": test_case}
    for tweet_id, label, test_case in zip(df["id"], df["label"], df["test_case"])
]

# Single source for the output name so the log line below always reports the
# file that was actually written; explicit UTF-8 matches how the merge step
# reads the file back.
output_path = "BoW_task1_submission.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results_json, f, indent=2)

print(f"Saved to {output_path} ✅")


Saved to task1_submission.json ✅


### Spanish

In [11]:
# Spanish baseline: same TF-IDF + logistic regression recipe as for English.
# NOTE: this cell rebinds `vectorizer` and `clf`, so from here on they hold
# the Spanish model.

# Spanish training tweets and their gold YES/NO labels.
tweets = SpTrainTask1[1]
labels = SpTrainTask1[2]

tweets_clean = clean_text(tweets)

# The encoding below maps everything that is not "YES" to 0, so make sure no
# unexpected label value gets silently counted as "NO".
unexpected_labels = set(labels) - {"YES", "NO"}
assert not unexpected_labels, f"Unexpected labels in Spanish training data: {unexpected_labels}"

# Encode the labels as integers: YES -> 1 (positive class), NO -> 0.
labels_bin = [1 if label == "YES" else 0 for label in labels]

# Hold out 20% of the training tweets for validation; the fixed seed keeps
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    tweets_clean, labels_bin, test_size=0.2, random_state=42
)

# Unigram + bigram TF-IDF features, capped at 5000. The vocabulary is fitted
# on the training portion only and then applied to the validation portion.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Validation metrics for the held-out portion.
y_pred = clf.predict(X_val_tfidf)
compute_metrics(y_val, y_pred)

Accuracy: 0.740
Binary F1: 0.710
Precision: 0.738
Recall: 0.684


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)


### Save to csv

In [12]:
# Score the Spanish test split with the currently fitted `vectorizer`/`clf`
# and store the predictions as CSV.
test_ids, test_texts = SpTestTask1[0], SpTestTask1[1]
test_clean = clean_text(test_texts)  # same normalisation as the training tweets

X_test_tfidf = vectorizer.transform(test_clean)
test_preds_bin = clf.predict(X_test_tfidf)

# Translate the 1/0 predictions back into YES/NO label strings.
test_preds_label = [
    "YES" if pred == 1 else "NO"
    for pred in test_preds_bin
]

# One row per test tweet; every row carries the same test-case name.
submission_df = pd.DataFrame(
    {"id": test_ids, "label": test_preds_label}
).assign(test_case="EXIST2025")

submission_df.to_csv("BoW_predictions_task1_Spanish.csv", index=False)
print("Saved predictions to BoW_predictions_task1_Spanish.csv ✅")

Saved predictions to BoW_predictions_task1_Spanish.csv ✅


### Save to json

In [13]:
# Convert the Spanish CSV predictions into the JSON submission layout.
# `json` comes from the notebook's import cell.
df = pd.read_csv("BoW_predictions_task1_Spanish.csv")

# One record per tweet: the id as a string (written as-is, no prefix), the
# hard YES/NO label as "value", and the test-case name.
results_json = [
    {"id": str(tweet_id), "value": label, "test_case": test_case}
    for tweet_id, label, test_case in zip(df["id"], df["label"], df["test_case"])
]

# Single source for the output name so the log line below always reports the
# file that was actually written; explicit UTF-8 matches how the merge step
# reads the file back.
output_path = "BoW_task1_submission_Spanish.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results_json, f, indent=2)

print(f"Saved to {output_path} ✅")


Saved to task1_submission_Spanish.json ✅


### Merge results

In [14]:
# Per-language submission files produced by the JSON export cells, and the
# name of the combined file.
filename_english = "BoW_task1_submission.json"
filename_spanish = "BoW_task1_submission_Spanish.json"
filename_merged = "BoW_task1_submission_merged.json"

def merge_predictions(filename_english, filename_spanish, filename_merged):
    """Concatenate two JSON prediction files into a single JSON file.

    Both inputs are read as UTF-8 JSON. The merged content is the Spanish
    predictions followed by the English ones, written as UTF-8 with
    ``indent=2`` and ``ensure_ascii=False``. A one-line summary with the
    three counts is printed.

    Args:
        filename_english: Path of the English predictions file.
        filename_spanish: Path of the Spanish predictions file.
        filename_merged: Path the combined predictions are written to.

    Returns:
        None.
    """
    def _read_json(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    english = _read_json(filename_english)
    spanish = _read_json(filename_spanish)

    # Spanish first, then English.
    combined = spanish + english

    with open(filename_merged, "w", encoding="utf-8") as sink:
        json.dump(combined, sink, ensure_ascii=False, indent=2)

    print(f"Merged {len(english)} EN + {len(spanish)} ES = {len(combined)} total predictions.")
    
    
# Combine the two per-language submission files into the final one.
merge_predictions(
    filename_english=filename_english,
    filename_spanish=filename_spanish,
    filename_merged=filename_merged,
)

Merged 978 EN + 1098 ES = 2076 total predictions.
