## Bag of Words (Simple Baseline Model)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [None]:
# Load the train / validation / test splits from CSV.
train_dataset = pd.read_csv("../dataset/dpm_pcl_train.csv")
val_dataset = pd.read_csv("../dataset/dpm_pcl_val.csv")
test_dataset = pd.read_csv("../dataset/dpm_pcl_test.csv")


def label_mapping(x):
    """Collapse an original label to binary: 0 when it is 0 or 1, else 1."""
    return 0 if x in (0, 1) else 1


# Apply the same preprocessing to every split.
for dataset in (train_dataset, val_dataset, test_dataset):
    # Derive the binary target column from the original label column.
    dataset["label"] = dataset["orig_label"].apply(label_mapping)

    # Replace missing text with an empty string. The result is assigned back
    # to the column instead of calling fillna(inplace=True) on the column
    # selection: that chained in-place form raises a FutureWarning on
    # pandas 2.x and does not modify the DataFrame at all under
    # Copy-on-Write, which would leave NaN values in the text column.
    dataset["text"] = dataset["text"].fillna("")

In [None]:
# Bag-of-words features: the vocabulary (limited via max_features=5000) is
# learned from the training text only; the validation and test text are then
# encoded with that already-fitted vocabulary.
vectorizer = CountVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_dataset["text"])
X_val, X_test = [
    vectorizer.transform(frame["text"])
    for frame in (val_dataset, test_dataset)
]

# Binary targets for each split, in the same train / val / test order.
y_train, y_val, y_test = [
    frame["label"]
    for frame in (train_dataset, val_dataset, test_dataset)
]

In [None]:
# Fit the baseline classifier on the bag-of-words training features.
# max_iter=500 sets the solver's iteration limit; fit() returns the fitted
# estimator, so construction and training are chained into one expression.
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)

In [None]:
# Predict once per held-out split.
val_preds = clf.predict(X_val)
test_preds = clf.predict(X_test)

# Print the per-class report for validation first, then test.
for header, y_true, y_pred in (
    ("Validation Performance:", y_val, val_preds),
    ("Test Performance:", y_test, test_preds),
):
    print(header)
    print(classification_report(y_true, y_pred, digits=4))

# Report the F1 score of each split on its own line (f1_score defaults are
# used, exactly as called here).
val_f1 = f1_score(y_val, val_preds)
test_f1 = f1_score(y_test, test_preds)
print(f"Validation F1 Score: {val_f1:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")