In [1]:
%load_ext autoreload
%autoreload 2

# TODO: Think about plotting subset sizes against predicted probability? (confidence)

# Notebook-wide configuration: the cells below pass these constants to the
# embedding, classifier and dataset loaders.
DATASET_NAME = "toxigen"  # passed as `dataset=` to every loader below
# Human-readable class names. Presumably index 0/1 lines up with the integer
# labels in the classification reports — TODO confirm against the dataset.
LABEL_SPACE = ["non-toxic", "toxic"]
MODEL_NAME = "deberta-large"  # passed as `model=` to the embedding/classifier loaders
SEED = 42  # passed as `seed=` to the embedding/classifier loaders
POOLER = "mean_with_attention"  # passed as `pooler=` to the embedding/classifier loaders
# Passed as `layer=` to load_saved_embeddings only; presumably the index of the
# hidden layer whose outputs were pooled — TODO confirm.
LAYER = 24

In [2]:
# Load the saved embeddings for the train, eval and test splits.
from data.embeddings import load_saved_embeddings
import numpy as np

# The three loads differ only in `split`, so they are issued from one generator
# (evaluated in train -> eval -> test order) and unpacked into the usual names.
train_embeddings, eval_embeddings, test_embeddings = (
    load_saved_embeddings(
        dataset=DATASET_NAME,
        model=MODEL_NAME,
        seed=SEED,
        split=split_name,
        pooler=POOLER,
        layer=LAYER,
    )
    for split_name in ("train", "eval", "test")
)

# Train rows first, eval rows appended underneath; the labels are combined in
# the same order further down so rows and labels stay aligned.
train_eval_embeddings = np.vstack((train_embeddings, eval_embeddings))


# Load the previously saved wrapper-box classifiers.
from data.models import load_saved_wrapperbox_model

# Every load shares dataset/model/seed/pooler and differs only in the
# `wrapperbox` name, so the calls are generated in the original order
# (KNN, SVM, DecisionTree, LMeans) and unpacked into one variable each.
knn_clf, svm_clf, dt_clf, lmeans_clf = (
    load_saved_wrapperbox_model(
        dataset=DATASET_NAME,
        model=MODEL_NAME,
        seed=SEED,
        pooler=POOLER,
        wrapperbox=wrapperbox_name,
    )
    for wrapperbox_name in ("KNN", "SVM", "DecisionTree", "LMeans")
)

# Load the dataset, its per-split labels, and a merged train+eval view.
from data.datasets import load_dataset_from_hf, load_labels_at_split
from datasets import DatasetDict, concatenate_datasets

dataset = load_dataset_from_hf(dataset=DATASET_NAME)

# One label collection per split, fetched in train -> eval -> test order.
train_labels, eval_labels, test_labels = (
    load_labels_at_split(dataset, split_name)
    for split_name in ("train", "eval", "test")
)
# Same train-then-eval order as the stacked embeddings.
train_eval_labels = np.concatenate((train_labels, eval_labels))

# Two-split dataset whose "train" entry is the original train and eval splits
# concatenated; "test" is passed through untouched.
train_eval_dataset = concatenate_datasets([dataset["train"], dataset["eval"]])
dataset_dict = DatasetDict(
    {
        "train": train_eval_dataset,
        "test": dataset["test"],
    }
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/samsoup/.cache/huggingface/token
Login successful


In [3]:
from pprint import pprint
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from utils.inference import compute_metrics

# Logistic-regression baseline trained on the combined train+eval embeddings.
# scikit-learn's `C` is the inverse of the regularization strength, so the
# L2 strength `l2` is converted with C = 1 / l2.
l2 = 500
logit_clf = LogisticRegression(penalty="l2", C=1 / l2).fit(
    train_eval_embeddings, train_eval_labels
)
predictions = logit_clf.predict(test_embeddings)

# Summarize test-set performance: the project's metric dict first, then
# scikit-learn's per-class report.
testset_perfm = compute_metrics(
    y_true=test_labels,
    y_pred=predictions,
    is_multiclass=False,
    prefix="test",
)
pprint(testset_perfm)
print(classification_report(y_true=test_labels, y_pred=predictions))

{'test_accuracy': 0.8202127659574469,
 'test_f1': 0.6817325800376648,
 'test_precision': 0.7327935222672065,
 'test_recall': 0.6373239436619719}
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       656
           1       0.73      0.64      0.68       284

    accuracy                           0.82       940
   macro avg       0.79      0.77      0.78       940
weighted avg       0.82      0.82      0.82       940



In [6]:
import pickle
from pprint import pprint

from sklearn.metrics import classification_report

from classifiers.KMeansClassifier import KMeansClassifier
from utils.inference import compute_metrics

# Fit the project's KMeansClassifier on the combined train+eval embeddings and
# evaluate it on the test split.
# `random_state` uses the SEED constant defined at the top of the notebook
# rather than repeating the literal 42, so changing SEED also reseeds this
# model (SEED is currently 42, so the result is unchanged).
clf = KMeansClassifier(
    n_clusters=2, init='k-means++', random_state=SEED, algorithm='elkan'
)
clf.fit(train_eval_embeddings, train_eval_labels)
predictions = clf.predict(test_embeddings)

# Print some metrics
testset_perfm = compute_metrics(
    y_true=test_labels, y_pred=predictions, is_multiclass=False, prefix="test"
)
pprint(testset_perfm)
print(classification_report(y_true=test_labels, y_pred=predictions))

# Save model to file
# NOTE(review): the path is relative to the current working directory and any
# existing 'LMeans.pkl' there is overwritten. Pickle files can execute code on
# load, so only load this artifact from a trusted location.
model_filename = 'LMeans.pkl'
with open(model_filename, 'wb') as f:
    pickle.dump(clf, f)

{'test_accuracy': 0.8223404255319149,
 'test_f1': 0.7023172905525846,
 'test_precision': 0.7111913357400722,
 'test_recall': 0.6936619718309859}
              precision    recall  f1-score   support

           0       0.87      0.88      0.87       656
           1       0.71      0.69      0.70       284

    accuracy                           0.82       940
   macro avg       0.79      0.79      0.79       940
weighted avg       0.82      0.82      0.82       940

