# Liar Dataset

Implement the preprocessing functions for whichever classifier you are training in this notebook, then import the functions you need into your own model notebook.

## General Imports

In [1]:
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


Labels \\
0 ~ false \\
1 ~ half-true \\
2 ~ mostly true \\
3 ~ true \\
4 ~ barely true \\
5 ~ pants-fire \\

## SVM Pre-processing

In [46]:
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from glove_embedding import embed_glove, load_glove, retrieve_unk
from preprocessing import set_idx

In [3]:
# Load the LIAR dataset; its 'train', 'validation' and 'test' splits are read further down
svm_dataset = load_dataset('liar')

### Objective: Train 6 SVMs ('one-vs-rest') to classify a Liar instance into one of the labels 0-5

Split the train/validation/test labels into 6 separate label sets (one per class), formatting each set's labels as binary

In [30]:
# Pull the three splits out of the loaded dataset in one unpacking step
train, val, test = (
    svm_dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
# Class ids used by LIAR (0 through 5)
labels = list(range(6))

# One accumulator per split; entry k holds the label column produced for class k
train_labels, val_labels, test_labels = [], [], []

# For every class id, re-map each split with set_idx and keep only the resulting
# 'label' column (intended to be a binary, one-vs-rest target — set_idx is defined
# in the preprocessing module, so confirm its exact semantics there)
for class_id in labels:
    relabel_args = {'idx': class_id}
    for split, collected in ((train, train_labels), (val, val_labels), (test, test_labels)):
        collected.append(split.map(set_idx, fn_kwargs=relabel_args)['label'])

In [None]:
# Load the GloVe embedding table from a local file; the sentences themselves are
# embedded in a later cell (filename suggests the 6B-token, 50-dimensional vectors — confirm)
embedding_dict = load_glove("glove.6B.50d.txt")

In [28]:
# Embedding used for out-of-vocabulary tokens (intended as the average of all
# embeddings in the vocabulary)
unknown_embedding = retrieve_unk(embedding_dict)

# Arbitrary pad length applied to every sentence
pad_len = 15

# Raw statements for the train and test splits
train_sentences, test_sentences = train['statement'], test['statement']

# Embed both splits with the same table, pad length and UNK embedding
X_train = embed_glove(train_sentences, embedding_dict, pad_len, unknown_embedding)
X_test = embed_glove(test_sentences, embedding_dict, pad_len, unknown_embedding)

In [45]:
# Fit one SVM per label and record its accuracy on the test split
scores = []

# Walk the per-label training targets alongside the matching test targets
for train_targets, test_targets in zip(train_labels, test_labels):
    # Features are standardised before being passed to the SVC
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, train_targets)
    preds = clf.predict(X_test)
    scores.append(accuracy_score(test_targets, preds))

print(scores)

[0.8051441932969603, 0.7918939984411536, 0.8059236165237724, 0.8355416991426344, 0.8332034294621979, 0.9282930631332814]
