In [1]:
%load_ext autoreload
%autoreload 2

In [98]:
from transformers import pipeline
from datasets import load_dataset
from sibyl import *
import random

### Dataset

In [20]:
dataset = load_dataset("glue", "sst2", split="train")
dataset = dataset.rename_column("sentence", "text")
original_text, original_labels = dataset['text'], dataset['label']

Reusing dataset glue (C:\Users\fabri\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


### Model

In [4]:
pipe = pipeline(task="sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


### Transform

In [5]:
transform = ChangeAntonym(task_name="sentiment", return_metadata=True)

### Feature 

In [6]:
def contains_question(s):
    return "?" in s

### Analysis

In [7]:
from sibyl import acc_at_k
from sklearn.metrics import accuracy_score

In [93]:
def extract_probs(results):
    return np.array([[1-r['score'], r['score']] if r['label'] == "POSITIVE" else [r['score'], 1-r['score']] for r in results])

def compute_accuracy(predictions, labels):
    if len(labels.shape) > 1:
        acc = acc_at_k(labels, predictions, k=2)       
    else:
        acc = accuracy_score(labels, np.argmax(predictions, -1))
    return acc

In [109]:
qs, nqs = [], []

for s, l in zip(original_text, original_labels):
    if contains_question(s):
        qs.append((s, l))
    else:
        nqs.append((s,l))
        
q_text, q_labels = zip(*qs)
nq_text, nq_labels = zip(*random.sample(nqs, len(q_text)))

q_text, q_labels = list(q_text), one_hot_encode(q_labels, 2)
nq_text, nq_labels = list(nq_text), one_hot_encode(q_labels, 2)

print(f"number of inputs with questions: {len(q_text)}")
print(f"number of inputs without questions: {len(nq_text)}")

number of inputs with questions: 186
number of inputs without questions: 186


In [110]:
transformed_q_text, transformed_q_labels = transform.transform_batch(batch=(q_text, q_labels))
transformed_nq_text, transformed_nq_labels = transform.transform_batch(batch=(nq_text, nq_labels))

transformed_q_labels = np.array(transformed_q_labels).squeeze()
transformed_nq_labels = np.array(transformed_nq_labels).squeeze()

In [111]:
original_q_preds = extract_probs(pipe(q_text))
original_nq_preds = extract_probs(pipe(nq_text))

transformed_q_preds = extract_probs(pipe(transformed_q_text))
transformed_nq_preds = extract_probs(pipe(transformed_nq_text))

In [112]:
original_q_acc = compute_accuracy(original_q_preds, np.argmax(q_labels, -1))
original_nq_acc = compute_accuracy(original_nq_preds, np.argmax(nq_labels, -1))

transformed_q_acc = compute_accuracy(transformed_q_preds, transformed_q_labels)
transformed_nq_acc = compute_accuracy(transformed_nq_preds, transformed_nq_labels)

In [114]:
print(f"original_q_acc: \t{round(original_q_acc, 2)}")
print(f"original_nq_acc: \t{round(original_nq_acc, 2)}")
print(f"transformed_q_acc: \t{round(transformed_q_acc, 2)}")
print(f"transformed_nq_acc: \t{round(transformed_nq_acc, 2)}")

original_q_acc: 	1.0
original_nq_acc: 	0.46
transformed_q_acc: 	0.87
transformed_nq_acc: 	0.6
