Similar imports as before.

In [1]:
%matplotlib notebook
import pandas
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import time

In [2]:
truth_text_mapping = {
    'pants-fire': 0,
    'false': 1,
    'barely-true': 2,
    'half-true': 3,
    'mostly-true': 4,
    'true': 5
}


class Statement:
    def __init__(self, body, speaker, value, context):
        self.body = body
        self.speaker = speaker
        self.value = truth_text_mapping[value]
        self.context = context

    @staticmethod
    def from_row(row):
        return Statement(value=row[1],
                         body=row[2],
                         speaker=row[4],
                         context=row[13])

    def __repr__(self):
        arg_str = str(', '.join(['='.join([i[0], repr(i[1])])
                                 for i in vars(self).items()]))
        return "Statement({})".format(arg_str)

    def __str__(self):
        return repr(self)

    @property
    def features(self):
        return ' '.join([self.speaker, self.context, self.body])


import csv


def load_liar_data(path):
    statements = []
    with open(path) as data_file:
        reader = csv.reader(data_file, delimiter='\t', quotechar='"')
        for row in reader:
            try:
                statements.append(Statement.from_row(row))
            except IndexError:
                print(row, len(row))
    return statements


statements = load_liar_data("../datasets/LIAR/train.tsv")
# print out some statements to verify by eye.
len(statements)

10241

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

x = vectorizer.fit_transform([s.features for s in statements])
y = np.array([s.value for s in statements]).ravel()

# vocab
# statements[0].body
print('Vocab size:', x.shape)
x

Vocab size: (10241, 14552)


<10241x14552 sparse matrix of type '<class 'numpy.int64'>'
	with 221431 stored elements in Compressed Sparse Row format>

In [4]:
clf = linear_model.Ridge(fit_intercept=True, alpha=0.01)
clf.fit(x, y)
clf.score(x, y)

0.73764421048562223

In [5]:
test_statements = load_liar_data("../datasets/LIAR/test.tsv")
x_test = vectorizer.transform([s.body for s in test_statements])
y_test = np.array([s.value for s in test_statements]).ravel()
clf.score(x_test, y_test)

-0.15355250207925675

In [6]:
# print out some predicted vs. actual values
list(zip(clf.predict(x_test), y_test[:20]))

[(2.3190148093157847, 5),
 (2.5264375090142104, 1),
 (1.9545609280672682, 1),
 (2.2104322327193056, 3),
 (0.74627427433243954, 0),
 (3.2444147393076901, 5),
 (4.3541678624691773, 5),
 (0.55351543858837626, 2),
 (2.0689796475251283, 5),
 (4.1465117319822618, 2),
 (2.3783477202740246, 2),
 (0.57094086590817872, 2),
 (2.5052177980042387, 0),
 (2.3567443396935608, 1),
 (2.221714769257753, 3),
 (2.920319039580844, 5),
 (2.677090818797252, 0),
 (2.5799909333286504, 3),
 (2.3387620984333815, 5),
 (3.1518270417649581, 1)]

In [7]:
# find optimal alpha for linear Ridge model
best = None
for al in np.logspace(-2, 1, 10):
    clf.set_params(alpha=al)
    clf.fit(x, y)
    s = clf.score(x_test, y_test)
    if best is None or s > best[1]:
        best = (al, s)
    print(al, s)
print(best)

0.01 -0.153886870728
0.0215443469003 -0.151199543725
0.0464158883361 -0.147757688471
0.1 -0.141873438231
0.215443469003 -0.133104059989
0.464158883361 -0.117215704741
1.0 -0.0899746504741
2.15443469003 -0.0514309824428
4.64158883361 -0.00833071742524
10.0 0.0292901659787
(10.0, 0.029290165978745786)


In [8]:
clf.set_params(alpha=best[0])
clf.fit(x, y)

Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [9]:
clf.score(x_test, y_test)

0.029240617130987471

In [10]:
# print out some predicted vs. actual values
list(zip(clf.predict(x_test), y_test[-20:]))

[(2.3209795324701092, 5),
 (2.6715138179302662, 2),
 (2.0496519548811736, 2),
 (2.2188680457871435, 3),
 (1.3843752973352774, 4),
 (2.9208670822019425, 4),
 (3.6224864057170159, 5),
 (1.2337969189914391, 4),
 (2.3075057959753473, 1),
 (3.5117613394166924, 0),
 (2.5199625546045241, 0),
 (1.5839646421350715, 3),
 (2.5158981655332973, 2),
 (2.8834155462485995, 1),
 (2.635620040189552, 0),
 (2.8464316728044619, 3),
 (2.4783848179009453, 2),
 (2.3252019983155332, 2),
 (2.5958912519461395, 2),
 (3.1956225085754912, 1)]

Quantify the performance of other models.

In [11]:
from sklearn.svm import LinearSVC
model = LinearSVC
clf = model()
clf.fit(x, y)
clf.score(x_test, y_test)

0.22494080505130229

In [12]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
clf = GaussianNB()
clf.fit(x.toarray(), y)
clf.score(x_test.toarray(), y_test)

0.17442778216258878

In [13]:
clf = MultinomialNB()
clf.fit(x, y)
clf.score(x_test, y_test)

0.23441199684293607

In [14]:
clf = BernoulliNB()
clf.fit(x, y)
clf.score(x_test, y_test)

0.24704025256511444

In [15]:
clf = linear_model.Lasso(alpha=0.1)
clf.fit(x, y)
print(al, clf.score(x_test, y_test))

10.0 0.000830238172206


# Conclusion

The paper achieved similar accuracy to my experiments here at about 23%.