In [2]:
import datasets
# SCOTUS configuration of the LexGLUE benchmark repository.
dataset = datasets.load_dataset(
    path="coastalcph/lex_glue",
    name="scotus",
)

# Second corpus loaded from a separate repository; presumably the preprocessed
# variant of the same data (judging by the repository name) -- verify.
processed_data = datasets.load_dataset(
    path="victorambrose11/final_preprocessed_scotus",
)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from gensim.models import Word2Vec
import fasttext
import tempfile
import os
import pandas as pd
import numpy as np


In [4]:
# Materialize each split of both corpora through ``to_pandas()``, keyed by split name.
original_data = {
    split: dataset[split].to_pandas()
    for split in ("train", "validation", "test")
}

cleaned_data = {
    split: processed_data[split].to_pandas()
    for split in ("train", "validation", "test")
}

In [5]:
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score its predictions on the test data.

    Args:
        name: Label stored under the "model" key of the returned report.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Features and targets passed to ``model.fit``.
        X_test, y_test: Features passed to ``model.predict`` and the targets
            the predictions are compared against.

    Returns:
        dict with the keys "model", "accuracy", "micro_f1" and "macro_f1".
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    micro = f1_score(y_test, predictions, average="micro")
    macro = f1_score(y_test, predictions, average="macro")

    return dict(model=name, accuracy=accuracy, micro_f1=micro, macro_f1=macro)


In [6]:
def run_tfidf_svm(data):
    """Evaluate a TF-IDF + linear SVM baseline.

    Args:
        data: Mapping with "train" and "test" entries, each providing "text"
            and "label" columns.

    Returns:
        The report produced by ``evaluate_model`` for the name "TF-IDF + SVM",
        trained on the train split and scored on the test split.
    """
    train_split, test_split = data["train"], data["test"]

    # Vectorizer vocabulary is capped through max_features; the SVM uses its defaults.
    steps = [
        ("vec", TfidfVectorizer(max_features=10000)),
        ("clf", LinearSVC()),
    ]
    classifier = Pipeline(steps)

    return evaluate_model(
        "TF-IDF + SVM",
        classifier,
        train_split["text"],
        train_split["label"],
        test_split["text"],
        test_split["label"],
    )

def run_tfidf_logreg(data):
    """Evaluate a TF-IDF + logistic regression baseline.

    Args:
        data: Mapping with "train" and "test" entries, each providing "text"
            and "label" columns.

    Returns:
        The report produced by ``evaluate_model`` for the name
        "TF-IDF + LogReg", trained on the train split and scored on the test split.
    """
    train_split, test_split = data["train"], data["test"]

    # Vectorizer vocabulary is capped through max_features; the solver is
    # allowed up to 1000 iterations.
    steps = [
        ("vec", TfidfVectorizer(max_features=10000)),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
    classifier = Pipeline(steps)

    return evaluate_model(
        "TF-IDF + LogReg",
        classifier,
        train_split["text"],
        train_split["label"],
        test_split["text"],
        test_split["label"],
    )


In [7]:
def run_fasttext(data):
    """Train a supervised fastText classifier on the train split and score it on the test split.

    Args:
        data: Mapping with "train" and "test" entries, each providing parallel
            "text" and "label" sequences. Labels are written as
            ``__label__{label}`` and parsed back with ``int``.

    Returns:
        dict with the keys "model" (always "fastText"), "accuracy",
        "micro_f1" and "macro_f1".
    """
    def _single_line(text):
        # fastText's training format is one example per line and its Python
        # ``predict`` raises ValueError on "\n", so line breaks inside a
        # document are replaced by spaces.
        return str(text).replace("\r", " ").replace("\n", " ")

    train_path = None
    try:
        # Only a training file is needed: predictions are made directly on the
        # in-memory test texts. UTF-8 is set explicitly instead of relying on
        # the locale's default encoding.
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as train_file:
            train_path = train_file.name
            for text, label in zip(data["train"]["text"], data["train"]["label"]):
                train_file.write(f"__label__{label} {_single_line(text)}\n")

        # Train after the ``with`` block so the file is flushed and closed first.
        model = fasttext.train_supervised(input=train_path, epoch=25, lr=1.0, wordNgrams=2, verbose=0)
    finally:
        # Remove the temporary file even when writing or training fails.
        if train_path is not None:
            os.unlink(train_path)

    preds = [
        int(model.predict(_single_line(text))[0][0].replace("__label__", ""))
        for text in data["test"]["text"]
    ]

    y_true = data["test"]["label"]
    return {
        "model": "fastText",
        "accuracy": accuracy_score(y_true, preds),
        "micro_f1": f1_score(y_true, preds, average='micro'),
        "macro_f1": f1_score(y_true, preds, average='macro')
    }


In [8]:
def run_word2vec_logreg(data, vector_size=100):
    """Evaluate logistic regression on mean-pooled Word2Vec document embeddings.

    Word2Vec is trained on the whitespace-tokenized train split only; each
    document is represented by the average of the vectors of its words that
    are present in the trained vocabulary.

    Args:
        data: Mapping with "train" and "test" entries, each providing "text"
            and "label" columns.
        vector_size: Dimensionality of the word vectors and therefore of the
            document embeddings. Defaults to 100.

    Returns:
        The report produced by ``evaluate_model`` for the name
        "Word2Vec + LogReg" (keys "model", "accuracy", "micro_f1", "macro_f1").
    """
    tokenized_train = [text.split() for text in data["train"]["text"]]
    tokenized_test = [text.split() for text in data["test"]["text"]]

    model = Word2Vec(sentences=tokenized_train, vector_size=vector_size, window=5, min_count=1, workers=4)

    def embed(docs):
        """Return one mean-pooled embedding row per document."""
        rows = []
        for doc in docs:
            vecs = [model.wv[word] for word in doc if word in model.wv]
            if vecs:
                rows.append(np.mean(vecs, axis=0))
            else:
                # Empty or fully out-of-vocabulary document: fall back to a
                # zero vector whose width follows ``vector_size`` instead of a
                # separately hard-coded constant. float32 matches the dtype
                # gensim uses for its word vectors.
                rows.append(np.zeros(vector_size, dtype=np.float32))
        if not rows:
            # np.vstack rejects an empty list; keep the 2-D shape instead.
            return np.empty((0, vector_size), dtype=np.float32)
        return np.vstack(rows)

    X_train = embed(tokenized_train)
    X_test = embed(tokenized_test)

    # Delegate fitting and scoring to the shared helper so this baseline
    # reports exactly the same metrics as the TF-IDF baselines.
    return evaluate_model(
        "Word2Vec + LogReg",
        LogisticRegression(max_iter=1000),
        X_train, data["train"]["label"],
        X_test, data["test"]["label"],
    )


In [10]:
# Baselines included in the comparison, in display order.
# NOTE(review): run_fasttext is defined in this notebook but is not called here --
# confirm whether leaving it out is intentional.
baseline_runners = [run_tfidf_svm, run_tfidf_logreg, run_word2vec_logreg]

results_original = [runner(original_data) for runner in baseline_runners]
results_cleaned = [runner(cleaned_data) for runner in baseline_runners]

# Format results for display. Each record is tagged with the dataset it came
# from as it is built, so the labels cannot drift out of sync with the number
# of baselines (previously hard-coded as 3 + 3).
df_results = pd.DataFrame(
    [
        {"dataset": dataset_name, **report}
        for dataset_name, reports in (
            ("original", results_original),
            ("preprocessed", results_cleaned),
        )
        for report in reports
    ]
)
df_results = df_results[["dataset", "model", "accuracy", "micro_f1", "macro_f1"]]
df_results


Unnamed: 0,dataset,model,accuracy,micro_f1,macro_f1
0,original,TF-IDF + SVM,0.734286,0.734286,0.62241
1,original,TF-IDF + LogReg,0.682857,0.682857,0.452079
2,original,Word2Vec + LogReg,0.603571,0.603571,0.411642
3,preprocessed,TF-IDF + SVM,0.627143,0.627143,0.480467
4,preprocessed,TF-IDF + LogReg,0.598571,0.598571,0.344033
5,preprocessed,Word2Vec + LogReg,0.419286,0.419286,0.215896
