# LIAR DETECTION GROUP PROJECT - Neural BOW Models  


### CONTENTS  

Imports  
Load ISOT data from appropriate pickle file  
Load ISOT vocabulary from pickle file  (note: vocab contains both "title" and "text" words)  
Train/Dev/Test split ISOT data  
Load LIWC data for custom features  
Load LIAR data (for evaluating models)  

#### Neural BOW Models:
- Model_1: Initial run replicating settings from Assignment 2, but with ISOT "title" data.  
- Model_2: Use GloVe word embeddings rather than initializing embeddings with uniform random numbers.  
- Model_3: Random word embeddings, but custom LIWC features concatenated into the model. 
- Model_4: Incorporate GloVe embeddings as well as LIWC features. Still training with ISOT "title" data. 
- Model_5: Train using LIAR/Politifact data, GloVe embeddings, but NO LIWC features; then predict on same plus ISOT "title" data also.  
- Model_6: Train using LIAR/Politifact data, GloVe embeddings, and LIWC features; then predict on same plus ISOT "title" data also.  





    

In [2]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from functools import reduce
import unittest
from IPython.display import display, HTML
from sklearn.utils import shuffle
# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
#assert(tf.__version__.startswith("1.8"))

import pickle
import dill
# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz
from w266_common import patched_numpy_io
import timeit  #For timing


In [3]:
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [4]:
print('TensorFlow version:', tf.VERSION)

TensorFlow version: 1.10.1


### Load LIAR/Politifact data and vocabulary from pickle files  
Load the COMBINED LIAR and Politifact dataset.


In [5]:
# Read LIAR/Politifact data from pickle file.
all_data = pd.read_pickle('parsed_data/df_liarpolitifact_data_embed.pkl')  # 

all_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23258 entries, 0 to 23257
Data columns (total 6 columns):
target            23258 non-null int64
title             23258 non-null object
title_tokcan      23258 non-null object
title_POS         23258 non-null object
binary_target     23258 non-null int64
embedded_title    23258 non-null object
dtypes: int64(2), object(4)
memory usage: 19.1 MB


In [6]:
all_data.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,1,Says 31 percent of Texas physicians accept all...,"[says, <number>, percent, of, texas, physician...","[V, $, N, P, ^, N, V, D, A, ^, N, ,, R, P, $, ...",1,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
1,2,''Both Democrats and Republicans are advocatin...,"['', both, democrats, and, republicans, are, a...","[,, D, N, &, N, V, V, P, D, N, P, N, N, V, P, ...",-1,"[[0.0028594, 0.19457, -0.19449, -0.037583, 0.9..."
2,0,A Republican-led softening of firearms trainin...,"[a, republican-led, softening, of, firearms, t...","[D, A, N, P, N, N, N, V, D, A, N, V, V, V, P, ...",1,"[[0.21705, 0.46515, -0.46757, 0.10082, 1.0135,..."
3,5,The first tweet was sent from Austin.,"[the, first, tweet, was, sent, from, austin, .]","[D, A, N, V, V, P, ^, ,]",0,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
4,2,Georgia has the countrys second highest number...,"[georgia, has, the, countrys, second, highest,...","[^, V, D, N, A, A, N, P, A, N, N, N, ,]",-1,"[[-1.3427, 0.4592, 0.19281, 0.71305, -0.5934, ..."


In [7]:
all_data.title[0]

'Says 31 percent of Texas physicians accept all new Medicaid patients, down from 67 percent in 2000.'

In [8]:
all_data.title_tokcan[0]

['says',
 '<number>',
 'percent',
 'of',
 'texas',
 'physicians',
 'accept',
 'all',
 'new',
 'medicaid',
 'patients',
 ',',
 'down',
 'from',
 '<number>',
 'percent',
 'in',
 '<number>',
 '.']

In [9]:
## Remove duplicates
# A row counts as a repeat when an earlier row has the same (title, target)
# pair; only the first occurrence is kept, then the index is renumbered.
is_repeat = all_data.duplicated(subset=['title', 'target'])
all_data = all_data.loc[~is_repeat].reset_index(drop=True)
all_data.tail(5)

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
17668,1,Says Republican lieutenants to House Speaker ...,"[says, republican, lieutenants, to, house, spe...","[V, A, N, P, ^, ^, ^, ^, V, N, N, V, A, N, ^, ...",1,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
17669,2,"Texas state Reps. Dan Branch and Jim Pitts, li...","[texas, state, reps, ., dan, branch, and, jim,...","[^, N, N, ,, ^, ^, &, ^, ^, ,, N, P, ^, ^, ^, ...",-1,"[[-1.1389, 0.091966, 0.21446, 0.62652, -0.1462..."
17670,3,Powerful Houston Democrats Sylvia Garcia (Dem...,"[powerful, houston, democrats, sylvia, garcia,...","[A, ^, N, ^, ^, ,, ^, N, N, P, ^, ^, ,, &, ^, ...",0,"[[0.64857, -0.77899, 0.64121, 0.22266, 0.82489..."
17671,3,"Says Jeanne Shaheen ""got behind the idea of us...","["", says, jeanne, shaheen, """", got, behind, th...","[,, V, ^, ^, ,, V, P, D, N, P, V, D, ^, P, V, ...",0,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."
17672,3,"In Texas public schools, ""we spend an average ...","["", in, texas, public, schools, ,, """", we, spe...","[,, P, ^, A, N, ,, ,, O, V, D, N, P, G, $, P, ...",0,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."


In [10]:
# Report how many rows carry each binary_target value.
for _caption, _value in (('target=1 (real):', 1),
                         ('target=0 (fake):', 0),
                         ('target=-1 (half true; DROP):', -1)):
    print(_caption, len(all_data[all_data.binary_target == _value]))

target=1 (real): 6020
target=0 (fake): 7977
target=-1 (half true; DROP): 3676


In [11]:
# Read LIAR/Politifact (lp) vocab from pickle file.

vocab = pd.read_pickle('parsed_data/vocab_lp.pkl')  # COMBINED LIAR and Politifact data (CMU) tokenized and POS tags added

In [12]:
# Summarise the vocabulary: its size, the first 30 entries of the ordered word list,
# and the object repr.
# NOTE(review): this vocab is read from 'parsed_data/vocab_lp.pkl' (combined
# LIAR/Politifact data), so the earlier remark about ISOT "title"/"text" words
# looks stale - confirm.
print("{:,} words".format(vocab.size))
print("wordset: ",vocab.ordered_words()[:30])
print(vocab)

14,460 words
wordset:  ['<s>', '</s>', '<unk>', '.', 'the', ',', 'in', 'of', 'to', '<number>', 'a', '""', 'and', '"', 'says', 'for', '"""', 'that', 'is', 'on', 'has', 'have', 'percent', 'than', 'are', 'more', '$<number>', 'was', 'we', 'by']
<w266_common.vocabulary2.Vocabulary2 object at 0x7fb91cd946d8>


### Load LIAR and ISOT LIWC features from pickle file 
### (NOTE: will need to get new LIWC files to match vocab size of 14,460 words)   

In [13]:
liwc_liar = pd.read_pickle('parsed_data/liwc_liarpolitifact.pkl')

In [14]:
liwc_liar.head()

Unnamed: 0,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,...,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,Unnamed: 74
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
print(liwc_liar.shape)

(14460, 74)


In [16]:
liwc_isot = pd.read_pickle('parsed_data/liwc_isot2.pkl')

In [17]:
liwc_isot.head()

Unnamed: 0,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,...,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,Unnamed: 74
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
print(liwc_isot.values)
print(liwc_isot.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(152182, 74)


In [19]:
#print(liwc_isot.values[:10,:])

In [20]:
# Cast the ISOT LIWC feature table to float32 and show the resulting values.
# NOTE(review): `liwc` is built from liwc_isot (152,182 rows in the recorded run), not
# from liwc_liar (14,460 rows, the same as vocab.size) - confirm which table the
# LIAR/Politifact-trained model is meant to use.
#liwc = tf.to_float(liwc_isot.values)
liwc = liwc_isot.astype('float32')
print(liwc.values)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [21]:
print(np.array(liwc))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Train / Dev / Test Split LIAR/Politifact data

#### First, drop the data having binary_target = -1  

In [22]:
# Keep only rows with a non-negative binary label; rows labelled -1 are dropped.
_has_binary_label = all_data['binary_target'] >= 0
all_data = all_data[_has_binary_label]
print(all_data.shape)

# Re-count each class to confirm that no -1 rows remain.
for _caption, _value in (('target=1 (real):', 1),
                         ('target=0 (fake):', 0),
                         ('target=-1 (half true; DROP):', -1)):
    print(_caption, len(all_data[all_data.binary_target == _value]))


(13997, 6)
target=1 (real): 6020
target=0 (fake): 7977
target=-1 (half true; DROP): 0


In [23]:
# train/dev/test split
# NOTE(review): the split below is purely positional (first rows -> train, next -> dev,
# remainder -> test) and nothing in this cell shuffles all_data first; confirm the row
# order is not correlated with the label.

train_fract = 0.70
dev_fract = 0.15
test_fract = 0.15

# Compare with a tolerance: exact float equality can reject fractions that really do
# sum to one (for example 0.6 + 0.3 + 0.1 evaluates to 0.9999999999999999).
if np.isclose(train_fract + dev_fract + test_fract, 1.0):
    print('Split fractions add up to 1.0')
else:
    print('SPLIT FRACTIONS DO NOT ADD UP TO 1.0; PLEASE TRY AGAIN.............')

# Row positions at which the train and dev slices end.
n_rows = len(all_data)
train_end = int(n_rows * train_fract)
dev_end = int(n_rows * (train_fract + dev_fract))

train_set = all_data[:train_end].reset_index(drop=True)
dev_set = all_data[train_end:dev_end].reset_index(drop=True)
test_set = all_data[dev_end:].reset_index(drop=True)

print('training set: ',train_set.shape)
print('dev set: ',dev_set.shape)
print('test set: ',test_set.shape)

Split fractions add up to 1.0
training set:  (9797, 6)
dev set:  (2100, 6)
test set:  (2100, 6)


In [24]:
train_set.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,1,Says 31 percent of Texas physicians accept all...,"[says, <number>, percent, of, texas, physician...","[V, $, N, P, ^, N, V, D, A, ^, N, ,, R, P, $, ...",1,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
1,0,A Republican-led softening of firearms trainin...,"[a, republican-led, softening, of, firearms, t...","[D, A, N, P, N, N, N, V, D, A, N, V, V, V, P, ...",1,"[[0.21705, 0.46515, -0.46757, 0.10082, 1.0135,..."
2,5,The first tweet was sent from Austin.,"[the, first, tweet, was, sent, from, austin, .]","[D, A, N, V, V, P, ^, ,]",0,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
3,1,Florida has reduced its carbon emissions by 20...,"[florida, has, reduced, its, carbon, emissions...","[^, V, V, L, N, N, P, $, N, P, $, ,]",1,"[[-0.52717, 0.16878, 0.16146, 0.93858, -0.6549..."
4,1,Mt. Hood Community College is No. 1 on average...,"[mt, ., hood, community, college, is, no, ., <...","[^, ,, N, N, N, V, !, ,, $, P, A, &, A, N, N, ...",1,"[[-0.055441, 2.3025, 0.98466, -0.020482, -0.26..."


In [25]:
dev_set.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,0,Says Jeb Bush -- not Charlie Crist -- signed l...,"[says, jeb, bush, --, not, charlie, crist, --,...","[V, ^, ^, ,, R, ^, ^, ,, V, N, P, V, ^, N, V, ...",1,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
1,3,Estimates for adopting Obamacares Medicaid exp...,"[estimates, for, adopting, obamacares, medicai...","[N, P, V, ^, ^, N, N, O, V, V, ^, N, $, $, P, ...",0,"[[0.37704, -0.35551, 1.4605, -0.28685, 0.34969..."
2,3,"Just like Hillary Clinton, Russ Feingold had a...","[just, like, hillary, clinton, ,, russ, feingo...","[R, P, ^, ^, ,, ^, ^, V, D, N, P, N, &, N, P, ...",0,"[[0.17698, 0.065221, 0.28548, -0.4243, 0.7499,..."
3,0,Its been 17 years that weve had unemployment h...,"[its, been, <number>, years, that, weve, had, ...","[L, V, $, N, P, L, V, N, A, P, D, N, N, P, ^, ,]",1,"[[0.76719, 0.1239, -0.11119, 0.13355, 0.18356,..."
4,4,Of the roughly 15 percent of Americans who don...,"[of, the, roughly, <number>, percent, of, amer...","[P, D, R, $, N, P, ^, O, V, V, N, N, ,, N, P, ...",0,"[[0.70853, 0.57088, -0.4716, 0.18048, 0.54449,..."


In [26]:
# print out dev set
#dev_set.to_csv('isot_dev_set.csv', sep=',')

In [27]:
test_set.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,4,The economic impact of Atlanta's 2000 Super Bo...,"[the, economic, impact, of, atlanta's, <number...","[D, A, N, P, Z, $, ^, ^, V, $, N, ,]",0,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
1,4,"Health insurance companies deny ""1 out of 5 tr...","["", health, insurance, companies, deny, """", <n...","[,, N, N, N, V, ,, $, P, P, $, N, V, P, N, ,, ,]",0,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."
2,4,"Mitt Romney ""left Massachusetts $1 billion in ...","["", mitt, romney, """", left, massachusetts, $<n...","[,, ^, ^, ,, V, ^, $, $, P, N, ,, ,]",0,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."
3,3,Horse racing could boost Georgia's economy by ...,"[horse, racing, could, boost, georgia's, econo...","[N, N, V, V, Z, N, P, $, $, D, N, &, V, $, P, ...",0,"[[-0.20454, 0.23321, -0.59158, -0.29205, 0.293..."
4,0,"With only 67 bills or so passed into law, ""201...","["", with, only, <number>, bills, or, so, passe...","[,, P, A, $, N, &, R, V, P, N, ,, ,, $, V, D, ...",1,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."


### Select LIAR/Politifact features and (binary) labels for training model 

In [28]:
def _features_and_labels(frame):
    """Return (tokenised titles, integer binary labels) as NumPy arrays for one split."""
    return frame.title_tokcan.values, frame.binary_target.values.astype(int)


def _describe_split(data_caption, data, labels_caption, labels):
    """Print the shape and a small preview of one split's features and labels."""
    print(data_caption, data.shape)
    print(data[:1])
    print(labels_caption, labels.shape)
    print(labels)


train_data, train_labels = _features_and_labels(train_set)
dev_data, dev_labels = _features_and_labels(dev_set)
test_data, test_labels = _features_and_labels(test_set)

_describe_split('train_data shape:', train_data, 'train_labels shape:', train_labels)
print()
_describe_split('dev_data shape:', dev_data, 'dev_labels shape:', dev_labels)
print()
_describe_split('test_data shape:', test_data, 'test_labels shape:', test_labels)


train_data shape: (9797,)
[list(['says', '<number>', 'percent', 'of', 'texas', 'physicians', 'accept', 'all', 'new', 'medicaid', 'patients', ',', 'down', 'from', '<number>', 'percent', 'in', '<number>', '.'])]
train_labels shape: (9797,)
[1 1 0 ... 0 0 0]

dev_data shape: (2100,)
[list(['says', 'jeb', 'bush', '--', 'not', 'charlie', 'crist', '--', 'signed', 'legislation', 'that', 'let', 'duke', 'energy', 'collect', 'money', 'for', 'nuclear', 'projects', '.'])]
dev_labels shape: (2100,)
[1 0 0 ... 0 0 0]

test_data shape: (2100,)
[list(['the', 'economic', 'impact', 'of', "atlanta's", '<number>', 'super', 'bowl', 'was', '$<number>', 'million', '.'])]
test_labels shape: (2100,)
[0 0 0 ... 0 0 0]


In [29]:
# Token count of every training example, and the 95th percentile of those counts.

lengths = [len(tokens) for tokens in train_data]

a = np.asarray(lengths)
p = np.percentile(a, 95)  # 95th percentile of the example lengths
print('95th percentile:', p)

95th percentile: 36.0


In [30]:
# Bokeh for plotting.
import bokeh.plotting as bp
from bokeh.models import HoverTool
bp.output_notebook()

# Helper code for plotting histograms
def plot_length_histogram(lengths, x_range=(0, 100), bins=40, normed=True):
    """Show a Bokeh bar chart of the distribution of example lengths.

    Args:
        lengths: sequence of example lengths (number of tokens per example).
        x_range: (low, high) pair limiting the histogram range; values outside
            it are ignored by np.histogram.
        bins: bin specification forwarded to np.histogram.
        normed: if true, bar heights are a probability density rather than
            raw counts.

    Returns:
        None. The figure is displayed with bp.show().
    """
    # `normed` is kept in this function's signature for existing callers, but it is
    # forwarded as `density`: np.histogram's own `normed` keyword is deprecated and
    # absent from recent NumPy releases.
    hist, bin_edges = np.histogram(a=lengths, bins=bins, density=normed, range=x_range)
    bin_centers = (bin_edges[1:] + bin_edges[:-1])/2
    bin_widths =  (bin_edges[1:] - bin_edges[:-1])

    # The "count" tooltip shows the bar height, which is a density when normed is true.
    hover = HoverTool(tooltips=[("bucket", "@x"), ("count", "@top")], mode="vline")
    fig = bp.figure(plot_width=800, plot_height=400, tools=[hover])
    fig.vbar(x=bin_centers, width=bin_widths, top=hist, hover_fill_color="firebrick")
    fig.y_range.start = 0
    fig.x_range.start = 0
    fig.xaxis.axis_label = "Example length (number of tokens)"
    fig.yaxis.axis_label = "Frequency"
    bp.show(fig)

In [31]:
plot_length_histogram(lengths)

  


### Load ISOT data to evaluate various models below.  

In [32]:
# Read ISOT data from pickle file.
isot_data = pd.read_pickle('parsed_data/df_alldata2.pkl')  # ISOT data (CMU) tokenized and POS tags added

isot_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 9 columns):
title           44898 non-null object
text            44898 non-null object
subject         44898 non-null object
date            44898 non-null object
target          44898 non-null object
title_tokcan    44898 non-null object
title_POS       44898 non-null object
text_tokcan     44898 non-null object
text_POS        44898 non-null object
dtypes: object(9)
memory usage: 527.8 MB


In [33]:
isot_data.head(10)

Unnamed: 0,title,text,subject,date,target,title_tokcan,title_POS,text_tokcan,text_POS
0,BRAINIAC Gets Rejected After Trying To Buy BMW...,Does anyone else out there see a future BMW ca...,Government News,"Mar 20, 2016",0,"[brainiac<allcaps>, gets, rejected, after, try...","[N, V, V, P, V, P, V, ^, P, ^, ^, ,, O, V, A, ...","[does, anyone, else, out, there, see, a, futur...","[V, N, R, P, R, V, D, A, ^, N, N, P, D, N, ,, ..."
1,Windows 10 is Stealing Your Bandwidth (You Mig...,21st Century Wire says We ve heard a lot of no...,US_News,"April 7, 2016",0,"[windows, <number>, is, stealing, your, bandwi...","[^, $, V, V, D, N, ,, O, V, V, P, V, O, ,]","[<number>st, century, wire, says, we, ve, hear...","[A, N, ^, V, O, V, V, D, N, P, R, R, A, N, P, ..."
2,STUNNING STORY The Media And Democrats Hid Fro...,"In an email sent on April 15, 2011, our upstan...",left-news,"Mar 2, 2017",0,"[stunning<allcaps>, story<allcaps>, the, media...","[A, N, D, N, &, N, V, P, ^, ,, R, Z, ^, ^, N, ...","[in, an, email, sent, on, april, <number>, ,, ...","[P, D, N, V, P, ^, $, ,, $, ,, D, A, N, A, ^, ..."
3,North Korea's Kim Jong Un fetes nuclear scient...,SEOUL (Reuters) - North Korean leader Kim Jong...,worldnews,"September 10, 2017",1,"[north, korea's, kim, jong, un, fetes, nuclear...","[^, Z, ^, ^, ^, V, A, N, ,, V, N, N]","[seoul<allcaps>, (, reuters, ), -, north, kore...","[^, ,, ^, ,, ,, ^, ^, N, ^, ^, ^, V, D, A, N, ..."
4,White House developing comprehensive biosecuri...,"ASPEN, Colorado (Reuters) - The Trump administ...",politicsNews,"July 20, 2017",1,"[white, house, developing, comprehensive, bios...","[A, N, V, A, N, N, ,, A]","[aspen<allcaps>, ,, colorado, (, reuters, ), -...","[^, ,, ^, ,, ^, ,, ,, D, ^, N, V, V, D, A, A, ..."
5,LOL! GEORGE LOPEZ Booed Off Stage At Children’...,George Lopez was hired to be the emcee for the...,politics,"Oct 14, 2017",0,"[lol<allcaps>, !, george<allcaps>, lopez<allca...","[!, ,, ^, ^, V, P, N, P, ^, ^, N, P, N, V, D, ...","[george, lopez, was, hired, to, be, the, emcee...","[^, ^, V, V, P, V, D, N, P, D, ^, G, N, N, G, ..."
6,HILLARY CLINTON CRONYISM VIOLATES FEDERAL RULE...,Former Secretary of State Hillary Clinton soug...,politics,"Oct 6, 2016",0,"[hillary<allcaps>, clinton<allcaps>, cronyism<...","[^, ^, N, V, A, N, ,, Z, ,, A, N, ,, V, N, P, ...","[former, secretary, of, state, hillary, clinto...","[A, N, P, ^, ^, ^, V, P, V, ^, &, ^, ^, N, N, ..."
7,Republican Senator Alexander to consult on bip...,WASHINGTON (Reuters) - U.S. Republican Senator...,politicsNews,"September 26, 2017",1,"[republican, senator, alexander, to, consult, ...","[A, N, ^, P, V, P, A, N, N]","[washington<allcaps>, (, reuters, ), -, u.s., ...","[^, ,, ^, ,, ,, ^, ^, ^, ^, ^, V, ^, P, O, V, ..."
8,Kellyanne Conway Announces Trump’s HUGE ‘Than...,Kellyanne Conway accidentally announced exactl...,News,"January 9, 2017",0,"[kellyanne, conway, announces, trump’s, huge<a...","[^, ^, V, Z, A, ,, V, O, ,, N, P, ^, ,, &, L, ...","[kellyanne, conway, accidentally, announced, e...","[^, ^, R, V, R, R, ^, ^, V, P, V, ^, ^, P, D, ..."
9,"Zimbabwe's army seizes power, Mugabe confined ...",HARARE (Reuters) - Zimbabwe s military seized ...,worldnews,"November 15, 2017",1,"["", zimbabwe's, army, seizes, power, ,, mugabe...","[,, Z, N, N, N, ,, ^, V, &, ,, A, ,]","[harare<allcaps>, (, reuters, ), -, zimbabwe, ...","[^, ,, ^, ,, ,, ^, G, A, A, N, P, ^, V, O, V, ..."


In [34]:
# Tokenised ISOT titles and their labels, kept for evaluating the models.
# The ISOT `target` column is stored with object dtype, hence the explicit cast to int.
isot_title_tokans = isot_data.title_tokcan.values
isot_labels = isot_data.target.values.astype(int)

# Preview both arrays.
print('isot titles:', isot_title_tokans)
print('isot labels:', isot_labels)

isot titles: [list(['brainiac<allcaps>', 'gets', 'rejected', 'after', 'trying', 'to', 'buy', 'bmw<allcaps>', 'with', 'ebt<allcaps>', 'card', '…', 'what', 'happens', 'next', 'is', 'hysterical<allcaps>', '!'])
 list(['windows', '<number>', 'is', 'stealing', 'your', 'bandwidth', '(', 'you', 'might', 'want', 'to', 'delete', 'it', ')'])
 list(['stunning<allcaps>', 'story<allcaps>', 'the', 'media', 'and', 'democrats', 'hid', 'from', 'public', ':', 'how', 'obama<allcaps>’s', 'ag<allcaps>', 'eric', 'holder', 'used', 'taxpayer<allcaps>', 'dollars', 'to', 'organize', 'street', 'mobs', 'against', 'george', 'zimmerman', ',', 'take', 'down', 'police', 'chief'])
 ...
 list(['(', 'video<allcaps>', ')', 'the<allcaps>', 'great<allcaps>', 'divider<allcaps>', ':', 'obama<allcaps>', 'pulls<allcaps>', 'out<allcaps>', 'the<allcaps>', 'straw<allcaps>', 'man<allcaps>', 'argument<allcaps>', 'at<allcaps>', 'the<allcaps>', 'poverty<allcaps>', 'summit<allcaps>'])
 list(['seasons<allcaps>', 'beatings<allcaps>', '!

In [35]:
print('ISOT size:', isot_title_tokans.shape)

ISOT size: (44898,)


## CNN: LIAR/Politifact data with GloVe embeddings and LIWC features

#### Include reference functions for viewing convenience

In [36]:
# May need this info (from utils.py)
'''
def build_vocab(corpus, V=10000, **kw):
    from . import vocabulary
    if isinstance(corpus, list):
        token_feed = (canonicalize_word(w) for w in corpus)
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)
    else:
        token_feed = (canonicalize_word(w) for w in corpus.words())
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)

    print("Vocabulary: {:,} types".format(vocab.size))
    return vocab

# Window and batch functions
def pad_np_array(example_ids, max_len=250, pad_id=0):
    """Pad a list of lists of ids into a rectangular NumPy array.

    Longer sequences will be truncated to max_len ids, while shorter ones will
    be padded with pad_id.

    Args:
        example_ids: list(list(int)), sequence of ids for each example
        max_len: maximum sequence length
        pad_id: id to pad shorter sequences with

    Returns: (x, ns)
        x: [num_examples, max_len] NumPy array of integer ids
        ns: [num_examples] NumPy array of sequence lengths (<= max_len)
    """
    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def id_lists_to_sparse_bow(id_lists, vocab_size):
    """Convert a list-of-lists-of-ids to a sparse bag-of-words matrix.

    Args:
        id_lists: (list(list(int))) list of lists of word ids
        vocab_size: (int) vocab size; must be greater than the largest word id
            in id_lists.

    Returns:
        (scipy.sparse.csr_matrix) where each row is a sparse vector of word
        counts for the corresponding example.
    """
    from scipy import sparse
    ii = []  # row indices (example ids)
    jj = []  # column indices (token ids)
    for row_id, ids in enumerate(id_lists):
        ii.extend([row_id]*len(ids))
        jj.extend(ids)
    x = sparse.csr_matrix((np.ones_like(ii), (ii, jj)),
                          shape=[len(id_lists), vocab_size])
    return x
'''

'\ndef build_vocab(corpus, V=10000, **kw):\n    from . import vocabulary\n    if isinstance(corpus, list):\n        token_feed = (canonicalize_word(w) for w in corpus)\n        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)\n    else:\n        token_feed = (canonicalize_word(w) for w in corpus.words())\n        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)\n\n    print("Vocabulary: {:,} types".format(vocab.size))\n    return vocab\n\n# Window and batch functions\ndef pad_np_array(example_ids, max_len=250, pad_id=0):\n    """Pad a list of lists of ids into a rectangular NumPy array.\n\n    Longer sequences will be truncated to max_len ids, while shorter ones will\n    be padded with pad_id.\n\n    Args:\n        example_ids: list(list(int)), sequence of ids for each example\n        max_len: maximum sequence length\n        pad_id: id to pad shorter sequences with\n\n    Returns: (x, ns)\n        x: [num_examples, max_len] NumPy array of integer ids\n        ns: [num_

In [37]:
# These are functions that were in the "SSTDataset" class in sst.py from A2
'''
def get_filtered_split(split='train', df_idxs=None, root_only=False):
    if not hasattr(split):
        raise ValueError("Invalid split name '%s'" % name)
    df = getattr(split)
    if df_idxs is not None:
        df = df.loc[df_idxs]
    #if root_only:          # Should not need in Final Project.
        #df = df[df.is_root]
    return df

def as_padded_array(split='train', max_len=40, pad_id=0,
                    root_only=False, df_idxs=None):
    """Return the dataset as a (padded) NumPy array.
    Longer sequences will be truncated to max_len ids, while shorter ones
    will be padded with pad_id.
    Args:
      split: 'train' or 'test'
      max_len: maximum sequence length
      pad_id: id to pad shorter sequences with
      root_only: if true, will only export root phrases
      df_idxs: (optional) custom list of indices to export
    Returns: (x, ns, y)
      x: [num_examples, max_len] NumPy array of integer ids
      ns: [num_examples] NumPy array of sequence lengths (<= max_len)
      y: [num_examples] NumPy array of target ids
    """
    df = get_filtered_split(split, df_idxs, root_only)
    x, ns = utils.pad_np_array(df.ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(df.label, dtype=np.int32)

def as_sparse_bow(split='train', root_only=False, df_idxs=None):
    from scipy import sparse
    df = get_filtered_split(split, df_idxs, root_only)
    x = utils.id_lists_to_sparse_bow(df['ids'], self.vocab.size)
    y = np.array(df.label, dtype=np.int32)
    return x, y
'''

'\ndef get_filtered_split(split=\'train\', df_idxs=None, root_only=False):\n    if not hasattr(split):\n        raise ValueError("Invalid split name \'%s\'" % name)\n    df = getattr(split)\n    if df_idxs is not None:\n        df = df.loc[df_idxs]\n    #if root_only:          # Should not need in Final Project.\n        #df = df[df.is_root]\n    return df\n\ndef as_padded_array(split=\'train\', max_len=40, pad_id=0,\n                    root_only=False, df_idxs=None):\n    """Return the dataset as a (padded) NumPy array.\n    Longer sequences will be truncated to max_len ids, while shorter ones\n    will be padded with pad_id.\n    Args:\n      split: \'train\' or \'test\'\n      max_len: maximum sequence length\n      pad_id: id to pad shorter sequences with\n      root_only: if true, will only export root phrases\n      df_idxs: (optional) custom list of indices to export\n    Returns: (x, ns, y)\n      x: [num_examples, max_len] NumPy array of integer ids\n      ns: [num_examples] 

#### Construct train, dev, test data arrays  

In [38]:
## Training data

# Map every tokenised training title to its list of vocabulary ids.
all_train_ids = [vocab.words_to_ids(tokens) for tokens in train_data]
print(all_train_ids[:5])

# Pad/truncate every example to 40 ids. The training titles here are LIAR/Politifact
# statements; their 95th-percentile length in the recorded run is 36 tokens, so 40
# truncates only a small fraction of examples.
max_len = 40
train_x, train_ns = utils.pad_np_array(all_train_ids, max_len=max_len)
print()
print(train_x[:2])
print()
print(train_ns[:2])

train_y = train_labels
print(train_y[:2])

[[14, 9, 22, 7, 73, 4446, 1590, 79, 60, 306, 1257, 5, 145, 34, 9, 22, 6, 9, 3], [10, 4447, 7509, 7, 1070, 1292, 668, 766, 17, 5483, 926, 50, 51, 489, 8, 896, 446, 35, 10, 32, 1552, 3], [4, 108, 7510, 27, 545, 34, 390, 3], [124, 20, 639, 110, 1214, 1648, 29, 9, 22, 90, 9, 3], [7512, 3, 2642, 701, 253, 18, 82, 3, 9, 19, 132, 12, 291, 2643, 102, 12, 328, 3]]

[[  14    9   22    7   73 4446 1590   79   60  306 1257    5  145   34
     9   22    6    9    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  10 4447 7509    7 1070 1292  668  766   17 5483  926   50   51  489
     8  896  446   35   10   32 1552    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]

[19 22]
[1 1]


In [39]:
## Dev data

# Map every tokenised dev title to its list of vocabulary ids.
all_dev_ids = [vocab.words_to_ids(tokens) for tokens in dev_data]
print(all_dev_ids[:5])

# Same padded length as the training arrays, so every split shares one input width.
max_len = 40
dev_x, dev_ns = utils.pad_np_array(all_dev_ids, max_len=max_len)
print()
print(dev_x[:2])
print()
print(dev_ns[:2])

dev_y = dev_labels
print(dev_y[:2])

[[14, 1013, 173, 118, 39, 534, 562, 118, 484, 288, 17, 582, 4474, 337, 1936, 103, 15, 431, 755, 3], [1141, 15, 5110, 7497, 306, 782, 557, 33, 58, 157, 339, 336, 26, 52, 258, 9, 3], [123, 207, 206, 143, 5, 1365, 1316, 93, 10, 14252, 35, 2819, 12, 2496, 29, 690, 10, 812, 3417, 281, 118, 14253, 4, 1316, 1215, 118, 156, 7283, 581, 493, 15, 70, 121, 3], [110, 65, 9, 42, 17, 315, 93, 152, 243, 23, 4, 138, 132, 6, 355, 3], [7, 4, 762, 9, 22, 7, 116, 68, 226, 21, 40, 119, 5, 188, 7, 158, 238, 25, 23, 85, 9, 10, 46, 3]]

[[  14 1013  173  118   39  534  562  118  484  288   17  582 4474  337
  1936  103   15  431  755    3    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [1141   15 5110 7497  306  782  557   33   58  157  339  336   26   52
   258    9    3    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]

[20 17]
[1 0]


In [40]:
## Test data

# Map every tokenised test title to its list of vocabulary ids.
all_test_ids = [vocab.words_to_ids(tokens) for tokens in test_data]
print(all_test_ids[:5])

# Same padded length as the training arrays, so every split shares one input width.
max_len = 40
test_x, test_ns = utils.pad_np_array(all_test_ids, max_len=max_len)
print()
print(test_x[:2])
print()
print(test_ns[:2])

test_y = test_labels
print(test_y[:2])

[[4, 293, 944, 7, 12775, 9, 1254, 1857, 27, 26, 52, 3], [13, 40, 119, 263, 1169, 11, 9, 81, 7, 9, 2441, 4252, 29, 823, 3, 16], [13, 213, 184, 11, 403, 412, 26, 75, 6, 128, 3, 16], [2978, 4564, 162, 7015, 4191, 222, 29, 26, 75, 10, 46, 12, 386, 9, 8, 9, 53, 3], [13, 35, 83, 9, 437, 67, 198, 231, 115, 97, 5, 11, 9, 27, 4, 387, 5419, 46, 6, 383, 189, 90, 315, 65, 2113, 300, 3, 16]]

[[    4   293   944     7 12775     9  1254  1857    27    26    52     3
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [   13    40   119   263  1169    11     9    81     7     9  2441  4252
     29   823     3    16     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]

[12 16]
[0 0]


In [41]:
# Sanity check: show the first padded examples, their unpadded lengths and labels,
# then decode the first example back to words with and without its padding.
# NOTE(review): in the recorded output the padding id 0 decodes as '<s>', i.e. padding
# shares an id with the sentence-start token - confirm this is intended.
print("Examples:\n", train_x[:3])
print("Original sequence lengths: ", train_ns[:3])
print("Target labels: ", train_y[:3])
print("")
print("Padded:\n", " ".join(vocab.ids_to_words(train_x[0])))
print("Un-padded:\n", " ".join(vocab.ids_to_words(train_x[0,:train_ns[0]])))

Examples:
 [[  14    9   22    7   73 4446 1590   79   60  306 1257    5  145   34
     9   22    6    9    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  10 4447 7509    7 1070 1292  668  766   17 5483  926   50   51  489
     8  896  446   35   10   32 1552    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   4  108 7510   27  545   34  390    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]
Original sequence lengths:  [19 22  8]
Target labels:  [1 1 0]

Padded:
 says <number> percent of texas physicians accept all new medicaid patients , down from <number> percent in <number> . <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s>
Un-padded:
 says <number> percent of texas physicians accept all new medicaid patients , dow

### Use the tf.estimator API along with cnn_models_2.py 

#### Things to consider:  
- Start w/ 2 epochs (20 was original)       
- Consider use of dropouts in fully-connected layers     
-  Use embed_dim = 300 rather than 50??   
- xx  
...  


In [42]:
print('vocab size:', vocab.size)

vocab size: 14460


In [91]:
## Set up the model framework
## (The module imported below must be the model file you intend to use; this cell uses cnn_models_2.py.)

# Re-import on every run so edits to cnn_models_2.py are picked up without restarting the kernel.
import cnn_models_2; reload(cnn_models_2)

# Model hyperparameters, passed to model_fn through the Estimator's `params`.
# NOTE(review): an earlier note here said "Use embed_dim2=74 for all LIWC" and asked whether
# a `liwc_dim` parameter should be added; neither parameter is passed below - confirm how
# the LIWC features are meant to reach the model.

model_params = dict(V=vocab.size, embed_dim=300, filters=100, kernel_sizes=[2,4,7], 
                    hidden_dims=[25,75], num_classes=2, encoder_type='cnn', input_data = 'liar', 
                    dropout_rate=0.5, lr=0.1, optimizer='adagrad', beta=0.01)  # optimizer may be 'adagrad' or 'adam' ('adam' is slower here)

# Checkpoint directory named after the current minute; an existing directory with the
# same name is deleted first.
checkpoint_dir = "/tmp/tf_cnn_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
# Write vocabulary to file, so TensorBoard can label embeddings.
# creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv
vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

# Build the Estimator around the classifier model_fn from cnn_models_2.
model = tf.estimator.Estimator(model_fn=cnn_models_2.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (14,460 words) written to '/tmp/tf_cnn_20181209-0550/metadata.tsv'
Projector config written to /tmp/tf_cnn_20181209-0550/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_cnn_20181209-0550', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb8828c2a20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_cnn_20181209-0550' --port 6006

Then in your browser, open: http://localhost:600

In [92]:
## Train the model, evaluating on the dev set after every round

# Settings used only in this cell, by the input functions and the loop below.
train_params = {'batch_size': 32, 'total_epochs': 10, 'eval_every': 1}

# total_epochs has to split evenly into rounds of eval_every epochs each.
num_rounds, leftover = divmod(train_params['total_epochs'], train_params['eval_every'])
assert leftover == 0

# Training batches: shuffled, and one call to model.train() consumes
# eval_every epochs of data. Checkpoints go to the model's checkpoint directory.
# NOTE: patched_numpy_io.numpy_input_fn is used here in place of
# tf.estimator.inputs.numpy_input_fn.
train_input_fn = patched_numpy_io.numpy_input_fn(
    x={"ids": train_x, "ns": train_ns},
    y=train_y,
    batch_size=train_params['batch_size'],
    num_epochs=train_params['eval_every'],
    shuffle=True,
    seed=42,
)

# Dev batches: same features, but kept in their original order and iterated
# exactly once per evaluation.
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"ids": dev_x, "ns": dev_ns},
    y=dev_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

# Alternate between a round of training and an evaluation on the dev set;
# eval_metrics ends up holding the metrics from the last round.
for _round in range(num_rounds):
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tf_cnn_20181209-0550/model.ckpt.
INFO:tensorflow:loss = 116.93371, step = 1
INFO:tensorflow:global_step/sec: 8.23974
INFO:tensorflow:loss = 63.3027, step = 101 (12.138 sec)
INFO:tensorflow:global_step/sec: 6.75318
INFO:tensorflow:loss = 56.32376, step = 201 (14.810 sec)
INFO:tensorflow:global_step/sec: 6.82484
INFO:tensorflow:loss = 51.992393, step = 301 (14.650 sec)
INFO:tensorflow:Saving checkpoints for 307 into /tmp/tf_cnn_20181209-0550/model.ckpt.
INFO:tensorflow:Loss for final step: 8.551152.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-09-05:51:40
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters fro

INFO:tensorflow:loss = 25.54451, step = 1836 (7.562 sec)
INFO:tensorflow:Saving checkpoints for 1842 into /tmp/tf_cnn_20181209-0550/model.ckpt.
INFO:tensorflow:Loss for final step: 4.3430767.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-09-05:54:34
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_cnn_20181209-0550/model.ckpt-1842
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-09-05:54:35
INFO:tensorflow:Saving dict for global step 1842: accuracy = 0.6566667, cross_entropy_loss = 0.6073793, global_step = 1842, loss = 103.32674
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1842: /tmp/tf_cnn_20181209-0550/model.ckpt-1842
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:

In [93]:
## Evaluate model on the held-out test split
## NOTE(review): this header originally said "(ISOT)", but the ISOT arrays are
## built and evaluated in later cells; test_x looks like the test split of the
## training corpus -- confirm.

# One ordered pass over the test set: no shuffling, a single epoch.
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"ids": test_x, "ns": test_ns},
    y=test_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")

print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))
# Bare last expression so the full metrics dict is displayed as the cell output.
eval_metrics

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-09-05:56:52
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_cnn_20181209-0550/model.ckpt-3070
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-09-05:56:53
INFO:tensorflow:Saving dict for global step 3070: accuracy = 0.72333336, cross_entropy_loss = 0.551084, global_step = 3070, loss = 81.44126
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3070: /tmp/tf_cnn_20181209-0550/model.ckpt-3070
Accuracy on test set: 72.33%


{'accuracy': 0.72333336,
 'cross_entropy_loss': 0.551084,
 'loss': 81.44126,
 'global_step': 3070}

In [94]:
## We can also evaluate the old-fashioned way, by calling model.predict(...) and working with the predicted labels directly:

from sklearn.metrics import accuracy_score
predictions = list(model.predict(test_input_fn))  # list of dicts
# Keep the predicted labels as an array so they can be compared elementwise
# with test_y (e.g. to select the misclassified examples).
y_pred = np.array([p['max'] for p in predictions])
# scikit-learn's signature is accuracy_score(y_true, y_pred): gold labels first.
acc = accuracy_score(test_y, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_cnn_20181209-0550/model.ckpt-3070
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Accuracy on test set: 72.33%


##### Accuracy of 72% is better than the baseline NB result of 62%.  

### Create padded ISOT data and apply prediction function to ISOT data.  



In [95]:
## ISOT data padding

# Convert every tokenized ISOT title (the words of a single sentence) into a
# list of vocabulary ids. (`isot_title_tokans` is spelled as it is defined
# earlier in the notebook.)
all_isot_ids = [vocab.words_to_ids(tokens) for tokens in isot_title_tokans]
print(all_isot_ids[:5])

max_len = 40   # Retain this setting, since it fits the ISOT "title" length distribution quite well.
# isot_x holds the id rows padded to max_len; isot_ns holds the original lengths.
isot_x, isot_ns = utils.pad_np_array(all_isot_ids, max_len=max_len)
print()
print(isot_x[:2])
print()
print(isot_ns[:2])

isot_y = isot_labels
print(isot_y[:2])

[[2, 831, 2434, 183, 618, 8, 382, 2, 35, 2, 1234, 2, 165, 2859, 363, 18, 2, 425], [6364, 9, 18, 8701, 177, 2, 30, 61, 1364, 329, 8, 14235, 33, 31], [2, 2, 4, 977, 12, 169, 8032, 34, 117, 325, 421, 2, 2, 1120, 3443, 294, 2, 155, 8, 8419, 469, 2, 129, 289, 8421, 5, 254, 145, 374, 1433], [591, 2, 4383, 2, 12000, 2, 431, 1658, 5, 3299, 2, 2], [282, 136, 3248, 3308, 2, 3606, 325, 1466]]

[[    2   831  2434   183   618     8   382     2    35     2  1234     2
    165  2859   363    18     2   425     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [ 6364     9    18  8701   177     2    30    61  1364   329     8 14235
     33    31     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]

[18 14]
[0 0]


In [96]:
## Evaluate model on ISOT data


# One ordered pass over the padded ISOT titles: no shuffling, a single epoch.
test_input_fn_isot = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": isot_x, "ns": isot_ns}, y=isot_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

# Use a distinct evaluation name: name="test" is already used for the
# test-split evaluation, and evaluations sharing a name write their metrics
# to the same folder / TensorBoard run.
eval_metrics = model.evaluate(input_fn=test_input_fn_isot, name="isot")

print("Accuracy on ISOT set: {:.02%}".format(eval_metrics['accuracy']))
# Bare last expression so the full metrics dict is displayed as the cell output.
eval_metrics

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-09-05:56:57
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_cnn_20181209-0550/model.ckpt-3070
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-09-05:57:19
INFO:tensorflow:Saving dict for global step 3070: accuracy = 0.5747027, cross_entropy_loss = 0.7497699, global_step = 3070, loss = 166.81721
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3070: /tmp/tf_cnn_20181209-0550/model.ckpt-3070
Accuracy on ISOT set: 57.47%


{'accuracy': 0.5747027,
 'cross_entropy_loss': 0.7497699,
 'loss': 166.81721,
 'global_step': 3070}

#### Prediction accuracy of 57% for ISOT data.  

In [101]:
# Rows of test_x whose predicted label differs from the gold label. Both sides
# are coerced to arrays so that `!=` always yields an elementwise boolean mask
# (comparing two plain Python lists would give a single bool instead).
test_x[np.asarray(y_pred) != np.asarray(test_y)]

array([[  13,   40,  119, ...,    0,    0,    0],
       [  13,   35,   83, ...,    0,    0,    0],
       [ 532,  500,   25, ...,    0,    0,    0],
       ...,
       [  13,   14, 2515, ...,    0,    0,    0],
       [  13,    4,    9, ...,    0,    0,    0],
       [  13, 2594, 5723, ...,    0,    0,    0]], dtype=int32)