In [1]:
%matplotlib notebook
import pandas
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import time

In [2]:
truth_text_mapping = {
    'pants-fire':0,
    'false':1,
    'barely-true':2,
    'half-true':3,
    'mostly-true':4,
    'true':5
}
class Statement:
    def __init__(self, body, speaker, value,context):
        self.body = body
        self.speaker = speaker
        self.value = truth_text_mapping[value]
        self.context = context
    
    
    @staticmethod
    def from_row(row):
        return Statement(value=row[1],
                         body=row[2],
                         speaker=row[4],
                         context=row[13])
    
    
    def __repr__(self):
        arg_str =  str(', '.join(['='.join([i[0],repr(i[1])]) for i in vars(self).items()]))
        return "Statement({})".format(arg_str)
    
    
    def __str__(self):
        return repr(self)
    
import csv

def load_liar_data(path):
    statements = []
    with open(path) as data_file:
        reader = csv.reader(data_file, delimiter='\t', quotechar='"')
        for row in reader:
            try:
                statements.append(Statement.from_row(row))
            except IndexError:
                print(row,len(row))
    return statements
statements = load_liar_data("../datasets/LIAR/train.tsv") 
# print out some statements to verify by eye.
len(statements)

10241

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

x = vectorizer.fit_transform([s.body for s in statements])
y = np.array([s.value for s in statements]).ravel()

# vocab
# statements[0].body
print('Vocab size:', x.shape)
x

Vocab size: (10241, 12193)


<10241x12193 sparse matrix of type '<class 'numpy.int64'>'
	with 165946 stored elements in Compressed Sparse Row format>

In [4]:
clf = linear_model.Ridge(fit_intercept=True,alpha=0.01)
clf.fit(x,y)
clf.score(x,y)

0.69208850272798461

In [5]:
test_statements = load_liar_data("../datasets/LIAR/test.tsv")
x_test = vectorizer.transform([s.body for s in test_statements])
y_test = np.array([s.value for s in test_statements]).ravel()
clf.score(x_test, y_test)

-0.25244821760076208

In [6]:
# print out some predicted vs. actual values
list(zip(clf.predict(x_test),y_test[:20]))

[(2.0472749806946871, 5),
 (1.8261264104265367, 1),
 (1.9000081939441396, 1),
 (1.8558205728828341, 3),
 (-0.28196753738752323, 0),
 (3.0690968994741619, 5),
 (2.6283695546077488, 5),
 (-0.052834952638648236, 2),
 (3.6759680743491745, 5),
 (3.436455107929925, 2),
 (2.2065671404383846, 2),
 (0.17781644197468127, 2),
 (1.8644100543699185, 0),
 (2.2102025252411828, 1),
 (2.0172950523918116, 3),
 (2.907837241061034, 5),
 (2.7933826170026697, 0),
 (3.2269402540345919, 3),
 (2.7893831267023552, 5),
 (3.0685015906353001, 1)]

In [7]:
# find optimal alpha for linear Ridge model
best = None
for al in np.logspace(-2,1,10):
    clf.set_params(alpha=al)
    clf.fit(x,y)
    s = clf.score(x_test, y_test)
    if best is None or s > best[1]:
        best = (al, s)
    print(al, s)
print(best)

0.01 -0.252391607079
0.0215443469003 -0.25096959462
0.0464158883361 -0.247239113892
0.1 -0.239322771602
0.215443469003 -0.224873467611
0.464158883361 -0.19546217465
1.0 -0.141865218691
2.15443469003 -0.0776402248728
4.64158883361 -0.0169027384881
10.0 0.0265075093176
(10.0, 0.026507509317618139)


In [8]:
clf.set_params(alpha=best[0])
clf.fit(x,y)

Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [9]:
clf.score(x_test,y_test)

0.026455525070466557

In [10]:
# print out some predicted vs. actual values
list(zip(clf.predict(x_test),y_test[-20:]))

[(1.9850047380894333, 5),
 (2.3123691664532631, 2),
 (2.063866140562939, 2),
 (1.8996995282554476, 3),
 (0.83674901774951826, 4),
 (2.5424982396365383, 4),
 (2.9507323648308996, 5),
 (1.2123083357129312, 4),
 (3.5090653687138569, 1),
 (2.8730446033567505, 0),
 (2.5370943959479089, 0),
 (1.430908582682233, 3),
 (2.2927058622837384, 2),
 (2.6513606528981293, 1),
 (2.6868274802971746, 0),
 (2.7842100963194198, 3),
 (2.3475649551525319, 2),
 (2.61991345736971, 2),
 (2.748466602966269, 2),
 (3.1766634228519992, 1)]

In [11]:
from sklearn.svm import LinearSVC
model = LinearSVC
clf = model()
clf.fit(x,y)
clf.score(x_test,y_test)


0.23283346487766376

In [12]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
clf = GaussianNB()
clf.fit(x.toarray(),y)
clf.score(x_test.toarray(),y_test)


0.18310970797158643

In [13]:
clf = MultinomialNB()
clf.fit(x,y)
clf.score(x_test,y_test)


0.24388318863456984

In [14]:
for al in np.logspace(-3,-2,10):
    clf = linear_model.Lasso(alpha = al)
    clf.fit(x,y)
    print(al, clf.score(x_test,y_test))

0.001 0.0673758755054
0.00129154966501 0.0718589017622
0.0016681005372 0.0718275178189
0.00215443469003 0.0693462141626
0.00278255940221 0.0667271790784
0.0035938136638 0.0622470096344
0.00464158883361 0.0556508909774
0.00599484250319 0.0507454206886
0.00774263682681 0.0463782286907
0.01 0.0413154529219
