# Liar Dataset

Implement preprocessing functions in this notebook depending on whatever classifier you're training and just import whichever functions you need into your own model notebook.

## General Imports

In [1]:
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


Labels \\
0 ~ false \\
1 ~ half-true \\
2 ~ mostly true \\
3 ~ true \\
4 ~ barely true \\
5 ~ pants-fire \\

## SVM Pre-processing

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from glove_embedding import embed_glove, load_glove, retrieve_unk
from preprocessing import set_idx, binarize

In [3]:
# Load the LIAR dataset through Hugging Face `datasets`; the cells below index
# the result by the split names 'train', 'validation' and 'test'.
svm_dataset = load_dataset('liar')

### Objective: Train 6 SVMs ('one-vs-rest') to classify a Liar instance into one of the labels 0-5

Split the train/validation/test datasets into 6 separate datasets, one per label, with the labels formatted as binary

In [4]:
# Pull the three LIAR splits out of the loaded dataset in a single unpacking.
train, val, test = (
    svm_dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [5]:
# Label columns for each split. Every list holds one label column per binary
# task; at the moment there is a single task, produced by `binarize`
# (a true-or-false target, per the original author's note).
#
# Alternative (disabled): one-vs-rest targets, i.e. one binary label column per
# LIAR class. To use it, start from empty lists and append per class instead:
# train_labels, val_labels, test_labels = [], [], []
# for i in [0, 1, 2, 3, 4, 5]:
#     kwargs = {'idx': i}
#     train_labels.append(train.map(set_idx, fn_kwargs=kwargs)['label'])
#     val_labels.append(val.map(set_idx, fn_kwargs=kwargs)['label'])
#     test_labels.append(test.map(set_idx, fn_kwargs=kwargs)['label'])

train_labels = [train.map(binarize)['label']]
val_labels = [val.map(binarize)['label']]
test_labels = [test.map(binarize)['label']]

Map:   0%|          | 0/10269 [00:00<?, ? examples/s]

Map:  10%|▉         | 1000/10269 [00:00<00:01, 7946.33 examples/s]

0
1
2
0
1
3
4
1
1
2
2
1
0
2
4
1
3
4
1
2
0
2
2
1
4
0
2
1
0
2
3
4
0
2
2
3
3
3
5
3
0
1
5
5
0
1
5
0
5
3
4
4
0
3
4
1
1
4
0
2
0
2
3
2
0
5
3
0
4
4
3
4
2
2
2
3
0
4
0
1
3
1
0
0
3
1
2
2
4
0
0
3
2
1
4
4
3
2
3
2
0
5
2
2
0
3
1
3
1
0
2
1
1
4
3
2
2
5
5
0
2
2
3
2
2
3
2
0
5
1
2
2
2
4
3
0
3
5
4
2
0
4
2
2
1
3
0
1
0
4
2
4
0
1
5
3
0
1
1
4
4
4
1
1
0
4
0
5
2
1
3
1
1
2
1
0
2
1
3
3
3
3
1
1
2
0
1
1
2
0
2
2
5
1
3
2
0
0
1
5
2
4
0
4
0
4
0
0
0
3
3
0
3
2
4
3
3
0
1
1
2
1
3
4
2
3
3
4
1
1
2
1
2
2
1
5
2
4
4
3
0
3
2
2
1
1
3
4
0
1
1
2
2
4
0
1
3
3
0
2
4
5
0
3
5
4
0
1
2
1
1
3
1
1
1
0
4
1
4
0
3
1
2
0
4
4
1
2
0
0
5
3
2
4
4
4
5
0
2
2
5
2
0
5
2
1
0
1
1
3
3
4
4
1
4
1
3
5
1
4
4
4
3
4
0
1
1
0
4
4
5
2
0
3
2
1
1
2
1
2
2
2
1
3
1
4
1
1
3
1
0
2
3
5
5
1
4
3
1
1
2
0
4
0
2
0
3
1
1
1
1
1
2
5
4
5
4
2
4
0
1
4
3
2
1
3
1
1
4
0
3
5
3
4
5
1
3
4
2
0
4
5
1
1
1
0
5
4
3
1
2
4
3
0
1
3
1
1
2
0
2
2
5
4
2
2
1
2
4
1
0
0
4
3
4
2
1
1
4
3
3
0
0
5
5
2
0
4
2
5
2
0
3
3
1
1
3
1
4
1
0
1
4
3
4
1
2
0
3
0
5
2
4
5
0
1
0
1
4
0
0
0
1
4
5
0
0
3
2
3
0
4
1
1
2
4
1
2
3
1


Map:  49%|████▊     | 5000/10269 [00:00<00:00, 12754.00 examples/s]

2
1
5
4
1
0
5
1
4
2
0
0
2
0
3
0
4
4
0
2
2
2
4
3
1
1
1
2
4
4
5
3
3
3
5
2
4
3
0
0
1
3
4
1
0
2
0
5
1
5
2
4
1
3
0
5
4
2
3
4
1
4
4
2
2
1
3
1
0
2
0
4
0
1
0
2
2
4
4
1
1
3
1
5
5
1
3
1
2
4
0
4
0
1
1
2
5
4
5
3
0
2
3
1
1
3
0
2
0
2
5
3
2
0
2
4
0
2
3
2
0
0
2
1
0
5
2
0
0
2
2
4
2
1
5
1
0
4
2
5
1
4
3
3
1
4
2
2
3
3
0
2
4
4
4
4
3
1
4
3
3
3
1
1
0
4
2
1
0
3
0
4
4
4
2
0
4
0
4
3
0
4
4
1
4
5
2
4
2
3
4
0
1
2
1
3
0
3
1
3
3
4
3
0
2
0
0
0
1
3
2
3
0
4
4
5
1
2
2
3
3
0
2
0
2
1
2
4
5
2
4
1
2
5
3
1
0
4
3
0
3
3
1
5
0
2
3
1
0
2
1
2
1
0
3
2
2
1
1
4
2
0
4
3
0
0
5
2
2
2
5
5
0
2
1
3
4
3
0
0
0
2
0
4
2
0
2
4
5
3
3
1
2
3
4
0
4
2
2
3
2
3
5
2
2
3
4
5
1
5
3
4
2
3
3
0
4
0
5
3
2
0
4
3
2
1
1
3
1
4
4
3
0
2
5
0
0
4
2
0
3
1
4
3
0
2
1
0
5
2
2
0
0
4
2
0
1
2
2
0
1
3
0
0
2
3
3
3
2
3
0
1
4
2
3
1
2
0
1
1
0
1
0
1
2
1
5
0
0
2
2
3
2
0
0
1
2
4
2
2
2
0
0
2
2
0
2
3
0
0
4
3
0
2
5
0
0
4
3
1
2
1
0
3
5
4
0
5
4
4
4
3
0
1
2
3
1
1
5
1
1
2
1
5
3
1
1
1
4
5
3
5
4
0
0
2
0
5
1
5
5
0
0
0
0
1
1
1
3
1
0
0
3
1
1
2
2
2
0
2
4
3
0
3
3
0
4
1
0
2
1
2
5
0
3
4
1
4
0
1


Map:  68%|██████▊   | 7000/10269 [00:00<00:00, 14509.29 examples/s]

2
4
2
1
1
4
4
2
0
5
1
1
1
0
2
4
1
4
1
0
0
2
4
2
1
1
3
2
4
0
0
4
5
4
3
2
3
4
5
0
2
3
4
2
4
0
2
3
0
3
3
5
3
1
5
1
2
1
5
2
1
1
1
4
2
3
4
3
2
3
1
3
1
0
4
1
4
0
1
2
0
3
2
1
0
2
2
2
2
1
4
0
2
3
2
3
1
1
5
1
4
3
2
2
1
2
1
2
0
5
1
2
0
4
2
1
1
5
3
4
5
2
3
3
2
4
1
3
3
3
1
3
1
1
0
2
1
4
3
5
1
1
4
2
0
2
5
5
1
4
0
0
5
3
3
5
4
3
3
3
0
3
4
0
4
1
0
2
0
1
0
3
4
1
0
0
4
1
2
0
3
3
4
5
4
0
1
2
1
3
3
2
4
1
3
1
2
2
3
1
0
4
1
3
2
4
1
2
3
2
5
0
1
3
1
1
4
2
4
3
5
2
1
0
1
1
1
2
1
4
3
2
2
3
0
4
0
0
3
3
1
2
3
4
5
4
3
1
5
3
2
1
1
1
1
3
3
3
4
1
3
4
3
5
4
4
1
3
2
3
0
4
2
2
5
0
2
4
3
2
4
0
1
4
0
1
3
2
2
1
3
0
5
2
2
0
2
1
0
0
4
3
0
4
3
3
2
1
4
2
4
4
2
2
0
2
5
1
2
0
2
5
2
2
0
3
4
5
3
4
2
4
0
1
4
3
2
5
0
1
1
0
1
3
4
5
4
5
1
3
0
0
4
1
1
2
4
4
3
0
1
1
5
4
2
5
2
4
3
0
2
2
3
1
0
4
1
1
2
5
2
0
2
0
1
1
1
2
4
1
1
0
1
2
1
2
0
4
4
1
4
3
2
5
3
3
2
1
2
1
1
3
1
1
3
3
4
1
0
1
0
4
2
2
4
1
5
0
1
4
1
0
0
2
2
1
0
2
0
3
3
0
2
5
3
3
1
4
4
0
0
0
5
2
2
2
3
4
4
2
0
0
3
3
1
2
2
0
5
3
4
2
2
2
1
2
2
3
4
0
1
5
2
3
4
0
4
0
2
3
1
2
3
0
2
2
1
0
2
4


Map: 100%|██████████| 10269/10269 [00:00<00:00, 13961.17 examples/s]


2
1
4
1
5
1
2
2
3
0
4
4
4
5
3
4
1
3
1
2
1
2
1
0
3
3
1
2
2
2
3
3
4
1
4
1
3
2
1
5
5
3
3
3
4
2
4
1
4
3
3
1
0
2
3
3
5
1
2
3
1
0
3
4
2
1
5
2
1
0
1
3
2
2
4
0
4
0
1
3
4
5
2
4
2
5
5
2
1
4
4
2
4
4
4
1
2
0
0
0
0
3
2
3
1
2
1
3
0
5
3
0
2
3
2
3
4
3
0
1
3
2
1
1
2
0
5
1
4
4
2
0
0
1
4
3
3
2
4
5
3
4
4
2
5
4
0
1
1
1
3
3
5
2
5
2
0
3
2
2
3
4
0
5
0
0
3
1
4
1
3
4
3
3
0
2
4
1
0
2
0
0
2
1
2
5
0
3
2
3
4
3
4
3
4
1
0
4
0
4
0
2
5
0
1
1
1
3
0
3
3
4
4
1
0
0
1
2
1
1
4
3
4
2
3
4
0
4
0
1
1
3
0
1
4
5
0
5
5
1
2
0
0
1
2
5
2
3
2
4
2
4
4
4
2
1
0
4
4
3
0
1
5
2
2
4
1
1
0
2
2
0
4
5
0
2
3
1
4
0
0
3
4
3
5
0
5
3
1
3
0
1
4
2
4
0
2
2
3
0
0
0
1
0
3
3
4
4
1
3
0
4
1
2
2
1
5
0
3
2
5
2
3
2
2
4
4
1
1
2
4
2
2
0
0
1
4
3
0
2
1
2
4
5
2
3
2
3
2
5
3
5
1
0
3
4
0
1
0
2
3
2
0
0
1
1
2
4
1
2
5
0
2
1
3
2
4
4
2
3
0
0
2
5
1
5
4
0
3
0
1
3
1
0
0
0
4
2
0
3
1
2
0
3
0
4
2
0
3
0
2
2
3
2
1
5
1
5
2
0
4
3
0
4
0
3
0
2
2
1
0
5
4
5
5
2
5
0
0
0
3
1
0
4
4
4
2
3
2
3
5
4
4
0
5
1
3
5
5
5
0
5
0
3
4
4
0
0
1
3
4
1
3
2
5
0
0
2
5
2
0
0
0
2
5
4
1
2
4
0
2
1
4
4
5
2
1
2
0
3


Map: 100%|██████████| 1284/1284 [00:00<00:00, 16656.11 examples/s]


4
5
0
1
1
0
3
0
3
1
1
0
1
3
4
1
1
2
4
5
1
5
3
5
1
4
2
5
2
5
0
0
4
4
0
2
3
1
4
1
2
0
3
4
0
3
1
2
2
3
0
3
1
0
2
2
1
0
5
2
1
1
1
4
4
4
4
0
1
2
4
0
4
3
4
0
2
0
1
3
4
4
4
4
1
4
2
4
5
1
1
5
1
2
0
4
0
4
5
0
4
2
0
2
3
5
0
1
2
3
1
1
3
1
1
5
3
0
0
2
3
0
2
0
3
0
4
1
0
2
4
3
2
3
5
1
3
5
2
3
3
0
0
0
3
3
4
2
1
4
4
1
5
4
1
1
0
0
3
1
4
5
5
1
0
5
2
2
1
2
1
0
3
0
4
3
2
1
4
3
0
0
0
5
5
0
3
2
3
2
4
0
0
0
5
1
0
2
0
4
4
3
5
4
1
3
3
5
2
1
0
4
0
0
4
4
5
0
5
0
2
2
4
2
2
4
3
1
4
5
0
0
1
5
3
2
0
1
1
3
0
3
2
1
2
1
4
4
5
4
4
2
2
4
2
5
4
0
4
2
1
2
0
1
4
0
3
5
5
3
1
0
4
3
1
3
3
0
2
2
0
1
5
5
2
0
4
2
2
5
2
4
5
4
4
1
0
2
1
2
4
2
0
4
0
4
1
1
3
1
1
1
1
5
2
1
0
3
4
3
4
0
5
1
4
2
4
1
1
1
5
2
0
4
4
0
0
2
0
4
2
0
2
1
0
5
3
4
4
3
1
0
0
4
2
5
4
4
0
0
2
1
1
1
1
2
0
5
3
0
2
4
0
0
2
1
2
2
3
3
2
4
0
3
1
2
2
0
1
0
2
1
4
1
0
1
2
0
2
2
4
1
5
1
4
2
2
3
0
3
3
0
2
3
5
4
4
2
0
1
4
2
4
1
1
2
4
0
5
2
0
3
0
3
0
2
2
3
0
4
0
0
2
1
4
2
1
4
1
4
2
4
2
1
2
2
3
2
0
4
2
4
4
4
5
0
5
1
2
2
0
2
3
1
1
0
2
1
5
1
4
1
3
3
3
3
1
5
2
1
1
3
1
3
0
4
2
3
2
2


Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

3
0
0
1
5
3
3
4
3
4
4
4
5
0
1
3
5
1
3
0
2
1
5
2
0
3
4
1
3
2
0
0
1
1
3
1
4
2
3
3
4
1
5
4
2
4
1
2
0
4
1
4
2
1
0
1
2
1
5
5
4
4
3
0
1
0
2
2
3
3
1
1
4
2
0
1
0
0
1
4
5
1
3
0
1
1
4
2
0
3
4
3
4
5
3
5
3
0
4
4
3
4
2
3
3
1
5
5
3
3
5
0
1
2
3
1
0
4
1
0
4
4
5
4
0
2
1
2
1
3
5
2
0
5
4
2
4
2
2
0
0
0
2
3
0
3
2
1
1
0
0
2
4
1
0
3
0
3
5
2
2
1
1
5
2
0
4
2
2
4
4
0
2
1
3
5
4
3
1
3
3
3
2
1
1
0
0
2
0
3
2
4
4
0
4
1
1
4
3
2
0
0
0
5
2
0
5
2
5
0
4
4
0
5
3
1
4
4
1
4
4
2
0
3
2
2
2
2
2
0
4
1
4
3
4
3
2
1
4
2
1
4
1
0
4
2
2
0
2
1
1
1
4
1
2
3
2
4
0
2
3
0
0
2
4
0
3
3
3
2
5
3
3
2
0
2
5
1
1
0
2
5
1
0
4
0
4
2
4
1
1
0
2
0
0
1
5
3
1
4
0
4
2
0
0
3
0
2
1
2
4
1
5
2
2
0
3
3
1
5
1
2
4
1
1
2
0
3
1
2
3
1
0
5
2
2
0
0
4
0
3
0
3
2
1
1
0
1
5
4
0
4
2
1
3
5
0
4
4
5
1
0
2
3
1
3
2
0
3
4
3
2
1
3
3
1
3
3
2
1
4
1
1
4
1
3
1
0
2
2
3
3
3
0
3
4
3
0
0
3
4
1
2
3
5
5
2
0
0
1
1
4
4
3
2
4
2
0
4
1
4
3
2
0
3
3
1
3
4
2
3
4
2
2
0
5
2
2
2
0
4
2
3
2
1
1
3
5
5
4
4
1
3
3
5
0
1
1
2
0
2
2
1
5
3
2
4
0
3
2
2
2
5
0
1
0
2
4
4
0
4
1
1
5
1
2
4
0
2
5
3
4
5
1
0
0
1
5
1
0


Map: 100%|██████████| 1283/1283 [00:00<00:00, 15594.42 examples/s]

4
4
2
2
0
0
2
4
1
0
0
0
0
2
4
2
2
0
2
1
1
2
1
3
2
1
4
1
3
4
0
1
5
2
1
5
0
2
2
0
2
2
4
1
5
0
4
5
4
5
2
3
2
1
2
0
1
4
1
4
1
1
3
5
1
2
3
3
0
1
2
1
0
3
1
4
0
0
3
3
5
4
3
4
3
1
4
3
1
5
0
0
4
2
1
4
0
1
3
3
2
0
2
4
0
4
1
5
0
3
0
2
3
4
0
0
1
1
2
2
3
0
0
5
2
1
4
1
0
3
2
4
0
2
1
0
1
0
2
0
0
4
2
2
3
0
4
0
0
3
2
3
4
3
4
1
2
0
0
4
2
3
1
0
3
2
4
3
2
2
1
3
0
2
4
2
2
1
5
0
4
4
2
4
0
0
1
4
0
4
2
1
4
4
2
3
1
0
2
3
4
1
5
3
0
0
1
0
4
2
4
4
4
3
4
4
3
1
0
1
4
1
2
3
4
1
2
1
5
3
0
0
1
3
0
0
2
0
0
1
4
3
3
3
0
2
3
3
1
5
5
3
3
2
5
5
5
3
1
4
2
4
4
3
4
4
1
2
2
3
2
0
5
5
1
4
0
5
1
4
4
4
0





In [6]:
# Load the pre-trained GloVe vectors from glove.6B.50d.txt (presumably the
# 50-dimensional set, going by the file name). The result is handed to
# retrieve_unk and embed_glove in the next cell.
embedding_dict = load_glove("glove.6B.50d.txt")

In [7]:
# Embedding used for unknown (UNK) tokens; per the original author's note it
# is the average of all embeddings in the vocabulary.
unknown_embedding = retrieve_unk(embedding_dict)

# Pad length applied to every sentence (arbitrary choice).
pad_len = 15

# Raw statement text for the train and test splits.
train_sentences = train['statement']
test_sentences = test['statement']

# Embed both splits with the same GloVe table, pad length and UNK vector.
X_train = embed_glove(train_sentences, embedding_dict, pad_len, unknown_embedding)
X_test = embed_glove(test_sentences, embedding_dict, pad_len, unknown_embedding)

In [8]:
# Fit one SVM per binary label column and record its accuracy on the test set.
scores = []

# Walk the training and test label columns in lockstep instead of indexing
# both lists by position.
for y_train, y_test in zip(train_labels, test_labels):
    # Standardize the embedded features before they reach the SVC.
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    scores.append(accuracy_score(y_test, preds))

# One accuracy value per label column, in the same order as train_labels.
print(scores)

[0.6383476227591582]
