# LIAR DETECTION GROUP PROJECT - Neural BOW Models  


### CONTENTS  

Imports  
Load LIAR/Politifact data from pickle file  
Load LIAR/Politifact vocabulary from pickle file  
Load LIWC data for custom features (LIAR and ISOT)  
Train/Dev/Test split LIAR/Politifact data  
Load ISOT data (for evaluating models)  

#### Neural BOW Models:
- Model_1: Initial run replicating settings from Assignment 2, but with ISOT "title" data.  
- Model_2: Use GloVe word embeddings rather than initializing embeddings with uniform random numbers.  
- Model_3: Random word embeddings, but custom LIWC features concatenated into the model. 
- Model_4: Incorporate GloVe embeddings as well as LIWC features. Still training with ISOT "title" data. 
- Model_5: Train using LIAR/Politifact data, GloVe embeddings, but NO LIWC features; then predict on same plus ISOT "title" data also.  
- Model_6: Train using LIAR/Politifact data, GloVe embeddings, and LIWC features; then predict on same plus ISOT "title" data also.  





    

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from functools import reduce
import unittest
from IPython.display import display, HTML
from sklearn.utils import shuffle
# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
#assert(tf.__version__.startswith("1.8"))

import pickle
import dill
# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz
from w266_common import patched_numpy_io
import timeit  #For timing


In [2]:
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [3]:
# tf.__version__ is available in every TensorFlow release; the tf.VERSION alias exists only in TF 1.x.
print('TensorFlow version:', tf.__version__)

TensorFlow version: 1.10.1


### Load LIAR/Politifact data and vocabulary from pickle files  
Load the COMBINED LIAR and Politifact dataset.


In [5]:
# Read LIAR/Politifact data from pickle file.
all_data = pd.read_pickle('parsed_data/df_liarpolitifact_data_embed.pkl')  # 

all_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23258 entries, 0 to 23257
Data columns (total 6 columns):
target            23258 non-null int64
title             23258 non-null object
title_tokcan      23258 non-null object
title_POS         23258 non-null object
binary_target     23258 non-null int64
embedded_title    23258 non-null object
dtypes: int64(2), object(4)
memory usage: 19.1 MB


In [6]:
all_data.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,1,Says 31 percent of Texas physicians accept all...,"[says, <number>, percent, of, texas, physician...","[V, $, N, P, ^, N, V, D, A, ^, N, ,, R, P, $, ...",1,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
1,2,''Both Democrats and Republicans are advocatin...,"['', both, democrats, and, republicans, are, a...","[,, D, N, &, N, V, V, P, D, N, P, N, N, V, P, ...",-1,"[[0.0028594, 0.19457, -0.19449, -0.037583, 0.9..."
2,0,A Republican-led softening of firearms trainin...,"[a, republican-led, softening, of, firearms, t...","[D, A, N, P, N, N, N, V, D, A, N, V, V, V, P, ...",1,"[[0.21705, 0.46515, -0.46757, 0.10082, 1.0135,..."
3,5,The first tweet was sent from Austin.,"[the, first, tweet, was, sent, from, austin, .]","[D, A, N, V, V, P, ^, ,]",0,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
4,2,Georgia has the countrys second highest number...,"[georgia, has, the, countrys, second, highest,...","[^, V, D, N, A, A, N, P, A, N, N, N, ,]",-1,"[[-1.3427, 0.4592, 0.19281, 0.71305, -0.5934, ..."


In [7]:
all_data.title[0]

'Says 31 percent of Texas physicians accept all new Medicaid patients, down from 67 percent in 2000.'

In [8]:
all_data.title_tokcan[0]

['says',
 '<number>',
 'percent',
 'of',
 'texas',
 'physicians',
 'accept',
 'all',
 'new',
 'medicaid',
 'patients',
 ',',
 'down',
 'from',
 '<number>',
 'percent',
 'in',
 '<number>',
 '.']

In [9]:
print('target=1 (real):', len(all_data[all_data.binary_target == 1]))
print('target=0 (fake):', len(all_data[all_data.binary_target == 0]))
print('target=-1 (half true; DROP):', len(all_data[all_data.binary_target == -1]))

target=1 (real): 8283
target=0 (fake): 10199
target=-1 (half true; DROP): 4776


In [10]:
# Read LIAR/Politifact (lp) vocab from pickle file.

vocab = pd.read_pickle('parsed_data/vocab_lp.pkl')  # COMBINED LIAR and Politifact data (CMU) tokenized and POS tags added

In [11]:
# Summarize the vocabulary loaded above from 'parsed_data/vocab_lp.pkl':
# its size, the first 30 entries of ordered_words(), and the object repr.
# NOTE(review): the earlier inline note described this as the ISOT "title" AND "text"
# vocabulary, but the pickle loaded here is the LIAR/Politifact one -- confirm its contents.
print("{:,} words".format(vocab.size))
print("wordset: ",vocab.ordered_words()[:30])
print(vocab)

14,460 words
wordset:  ['<s>', '</s>', '<unk>', '.', 'the', ',', 'in', 'of', 'to', '<number>', 'a', '""', 'and', '"', 'says', 'for', '"""', 'that', 'is', 'on', 'has', 'have', 'percent', 'than', 'are', 'more', '$<number>', 'was', 'we', 'by']
<w266_common.vocabulary2.Vocabulary2 object at 0x7fdd5e578c88>


### Load LIAR and ISOT LIWC features from pickle file 
### (NOTE: will need to get new LIWC files to match vocab size of 14,460 words)   

In [12]:
liwc_liar = pd.read_pickle('parsed_data/liwc_liar2.pkl')

In [13]:
liwc_liar.head()

Unnamed: 0,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,...,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,Unnamed: 74
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
print(liwc_liar.shape)

(152036, 74)


In [15]:
liwc_isot = pd.read_pickle('parsed_data/liwc_isot2.pkl')

In [16]:
liwc_isot.head()

Unnamed: 0,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,...,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,Unnamed: 74
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
print(liwc_isot.values)
print(liwc_isot.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(152182, 74)


In [23]:
#print(liwc_isot.values[:10,:])

In [17]:
#liwc = tf.to_float(liwc_isot.values)
liwc = liwc_isot.astype('float32')
print(liwc.values)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [18]:
print(np.array(liwc))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Train / Dev / Test Split LIAR/Politifact data

#### First, drop the data having binary_target = -1  

In [17]:
all_data = all_data[all_data.binary_target >= 0]
print(all_data.shape)
print('target=1 (real):', len(all_data[all_data.binary_target == 1]))
print('target=0 (fake):', len(all_data[all_data.binary_target == 0]))
print('target=-1 (half true; DROP):', len(all_data[all_data.binary_target == -1]))


(18482, 6)
target=1 (real): 8283
target=0 (fake): 10199
target=-1 (half true; DROP): 0


In [18]:
# Train/dev/test split: three contiguous slices of all_data, taken in its existing row order.

train_fract = 0.70
dev_fract = 0.15
test_fract = 0.15

# Compare with a tolerance rather than ==: sums of binary floats are often a hair off
# (e.g. 0.6 + 0.3 + 0.1 evaluates to 0.9999999999999999), which would wrongly flag valid fractions.
if np.isclose(train_fract + dev_fract + test_fract, 1.0):
    print('Split fractions add up to 1.0')
else:
    print('SPLIT FRACTIONS DO NOT ADD UP TO 1.0; PLEASE TRY AGAIN.............')

# Row offsets where the train and dev slices end; the test slice takes whatever remains.
n_rows = len(all_data)
train_end = int(n_rows * train_fract)
dev_end = int(n_rows * (train_fract + dev_fract))

# NOTE(review): rows are not shuffled before slicing, so each split inherits the ordering of the
# pickled DataFrame -- confirm that ordering is not correlated with the label.
train_set = all_data[:train_end].reset_index(drop=True)
dev_set = all_data[train_end:dev_end].reset_index(drop=True)
test_set = all_data[dev_end:].reset_index(drop=True)

print('training set: ',train_set.shape)
print('dev set: ',dev_set.shape)
print('test set: ',test_set.shape)

Split fractions add up to 1.0
training set:  (12937, 6)
dev set:  (2772, 6)
test set:  (2773, 6)


In [19]:
train_set.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,1,Says 31 percent of Texas physicians accept all...,"[says, <number>, percent, of, texas, physician...","[V, $, N, P, ^, N, V, D, A, ^, N, ,, R, P, $, ...",1,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
1,0,A Republican-led softening of firearms trainin...,"[a, republican-led, softening, of, firearms, t...","[D, A, N, P, N, N, N, V, D, A, N, V, V, V, P, ...",1,"[[0.21705, 0.46515, -0.46757, 0.10082, 1.0135,..."
2,5,The first tweet was sent from Austin.,"[the, first, tweet, was, sent, from, austin, .]","[D, A, N, V, V, P, ^, ,]",0,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
3,1,Florida has reduced its carbon emissions by 20...,"[florida, has, reduced, its, carbon, emissions...","[^, V, V, L, N, N, P, $, N, P, $, ,]",1,"[[-0.52717, 0.16878, 0.16146, 0.93858, -0.6549..."
4,1,Mt. Hood Community College is No. 1 on average...,"[mt, ., hood, community, college, is, no, ., <...","[^, ,, N, N, N, V, !, ,, $, P, A, &, A, N, N, ...",1,"[[-0.055441, 2.3025, 0.98466, -0.020482, -0.26..."


In [20]:
dev_set.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,3,"Under Obamacare, people who ""have a doctor the...","["", under, obamacare, ,, people, who, """", have...","[,, P, ^, ,, N, O, ,, V, D, N, L, V, V, P, D, ...",0,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."
1,3,Even the employees union for the IRS -- the ve...,"[even, the, employees, union, for, the, irs<al...","[R, D, N, N, P, D, ^, ,, D, A, N, P, N, P, V, ...",0,"[[0.38336, -0.095871, 0.12229, -0.51625, 0.349..."
2,3,"Ch Guevara ""wrote extensively about the superi...","["", ch, guevara, """", wrote, extensively, about...","[,, ^, ^, ,, V, R, P, D, N, P, A, N, P, N, P, ...",0,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."
3,3,The Chinese government provides their people n...,"[the, chinese, government, provides, their, pe...","[D, A, N, V, D, N, D, N, P, D, N, ,]",0,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
4,3,Agriculture has always required a significant ...,"[agriculture, has, always, required, a, signif...","[N, V, R, V, D, A, N, P, R, ,]",0,"[[-0.43918, -0.47441, -0.78644, 0.18404, -0.11..."


In [21]:
# print out dev set
#dev_set.to_csv('isot_dev_set.csv', sep=',')

In [22]:
test_set.head()

Unnamed: 0,target,title,title_tokcan,title_POS,binary_target,embedded_title
0,1,"We've lost over 5,000 Americans over there in ...","[we've, lost, over, <number>, americans, over,...","[L, V, P, $, N, P, R, P, ^, ,, P, ^, ,, &, V, ...",1,"[[-0.79149, 0.86617, 0.11998, 0.00092287, 0.27..."
1,3,Says he has nearly 200 delegates bound to supp...,"[says, he, has, nearly, <number>, delegates, b...","[V, O, V, R, $, N, V, P, V, D, N, P, D, ^, ^, ...",0,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
2,3,The Fed created $15 trillion in the bailout pr...,"[the, fed, created, $<number>, trillion, in, t...","[D, A, V, $, $, P, D, N, N, &, $, $, V, A, ,]",0,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
3,3,Says the Department of Defense changed its def...,"[says, the, department, of, defense, changed, ...","[V, D, N, P, N, V, D, N, P, ^, &, D, ^, V, O, ...",0,"[[0.11797, 0.21126, 0.29075, -0.021211, 0.7819..."
4,3,"Says Rick Perry wrote a letter ""supporting Hil...","["", says, rick, perry, wrote, a, letter, """", s...","[,, V, ^, ^, V, D, N, ,, V, ^, ,, ,]",0,"[[0.25769, 0.45629, -0.76974, -0.37679, 0.5927..."


### Select LIAR/Politifact features and (binary) labels for training model 

In [23]:
# Inputs are the canonicalized title tokens of each split.
train_data = train_set['title_tokcan'].values
dev_data = dev_set['title_tokcan'].values
test_data = test_set['title_tokcan'].values

# Targets are the binary labels of each split, cast to int.
train_labels = train_set['binary_target'].values.astype(int)
dev_labels = dev_set['binary_target'].values.astype(int)
test_labels = test_set['binary_target'].values.astype(int)


def _print_split_summary(split_name, tokens, labels):
    """Print the shapes of one split plus its first example and its label array."""
    print('{}_data shape:'.format(split_name), tokens.shape)
    print(tokens[:1])
    print('{}_labels shape:'.format(split_name), labels.shape)
    print(labels)


_print_split_summary('train', train_data, train_labels)
print()
_print_split_summary('dev', dev_data, dev_labels)
print()
_print_split_summary('test', test_data, test_labels)


train_data shape: (12937,)
[list(['says', '<number>', 'percent', 'of', 'texas', 'physicians', 'accept', 'all', 'new', 'medicaid', 'patients', ',', 'down', 'from', '<number>', 'percent', 'in', '<number>', '.'])]
train_labels shape: (12937,)
[1 1 0 ... 0 0 0]

dev_data shape: (2772,)
[list(['"', 'under', 'obamacare', ',', 'people', 'who', '""', 'have', 'a', 'doctor', 'theyve', 'been', 'seeing', 'for', 'the', 'last', '<number>', 'or', '<number>', 'years', ',', 'they', 'wont', 'be', 'able', 'to', 'keep', 'going', 'to', 'that', 'doctor', '.', '"""'])]
dev_labels shape: (2772,)
[0 0 0 ... 1 1 1]

test_data shape: (2773,)
[list(["we've", 'lost', 'over', '<number>', 'americans', 'over', 'there', 'in', 'afghanistan', ',', 'in', 'iraq', ',', 'and', 'plus', 'the', 'civilians', 'killed', '.'])]
test_labels shape: (2773,)
[1 0 0 ... 0 0 0]


In [24]:
# Number of tokens in every training example.
lengths = list(map(len, train_data))

# 95th percentile of those lengths.
a = np.asarray(lengths)
p = np.percentile(a, q=95)
print('95th percentile:', p)

95th percentile: 36.0


In [25]:
# Bokeh for plotting.
import bokeh.plotting as bp
from bokeh.models import HoverTool
bp.output_notebook()

# Helper code for plotting histograms
def plot_length_histogram(lengths, x_range=(0, 100), bins=40, normed=True):
    """Show a Bokeh bar-chart histogram of example lengths.

    Args:
        lengths: sequence of example lengths (number of tokens per example).
        x_range: (low, high) range passed to np.histogram.
        bins: number of bins passed to np.histogram.
        normed: if True, bar heights are a probability density; if False,
            they are raw counts.
    """
    # `normed` stays as this function's argument name for existing callers, but it is
    # forwarded as `density`: np.histogram deprecated and later removed its own `normed` keyword.
    hist, bin_edges = np.histogram(a=lengths, bins=bins, density=normed, range=x_range)
    bin_centers = (bin_edges[1:] + bin_edges[:-1])/2
    bin_widths =  (bin_edges[1:] - bin_edges[:-1])

    # The "count" tooltip shows the bar height, i.e. a density value when normed=True.
    hover = HoverTool(tooltips=[("bucket", "@x"), ("count", "@top")], mode="vline")
    fig = bp.figure(plot_width=800, plot_height=400, tools=[hover])
    fig.vbar(x=bin_centers, width=bin_widths, top=hist, hover_fill_color="firebrick")
    fig.y_range.start = 0
    fig.x_range.start = 0
    fig.xaxis.axis_label = "Example length (number of tokens)"
    fig.yaxis.axis_label = "Frequency"
    bp.show(fig)

In [26]:
plot_length_histogram(lengths)

  


### Load ISOT data to evaluate various models below.  

In [27]:
# Read ISOT data from pickle file.
isot_data = pd.read_pickle('parsed_data/df_alldata2.pkl')  # ISOT data (CMU) tokenized and POS tags added

isot_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 9 columns):
title           44898 non-null object
text            44898 non-null object
subject         44898 non-null object
date            44898 non-null object
target          44898 non-null object
title_tokcan    44898 non-null object
title_POS       44898 non-null object
text_tokcan     44898 non-null object
text_POS        44898 non-null object
dtypes: object(9)
memory usage: 527.8 MB


In [28]:
isot_data.head(10)

Unnamed: 0,title,text,subject,date,target,title_tokcan,title_POS,text_tokcan,text_POS
0,BRAINIAC Gets Rejected After Trying To Buy BMW...,Does anyone else out there see a future BMW ca...,Government News,"Mar 20, 2016",0,"[brainiac<allcaps>, gets, rejected, after, try...","[N, V, V, P, V, P, V, ^, P, ^, ^, ,, O, V, A, ...","[does, anyone, else, out, there, see, a, futur...","[V, N, R, P, R, V, D, A, ^, N, N, P, D, N, ,, ..."
1,Windows 10 is Stealing Your Bandwidth (You Mig...,21st Century Wire says We ve heard a lot of no...,US_News,"April 7, 2016",0,"[windows, <number>, is, stealing, your, bandwi...","[^, $, V, V, D, N, ,, O, V, V, P, V, O, ,]","[<number>st, century, wire, says, we, ve, hear...","[A, N, ^, V, O, V, V, D, N, P, R, R, A, N, P, ..."
2,STUNNING STORY The Media And Democrats Hid Fro...,"In an email sent on April 15, 2011, our upstan...",left-news,"Mar 2, 2017",0,"[stunning<allcaps>, story<allcaps>, the, media...","[A, N, D, N, &, N, V, P, ^, ,, R, Z, ^, ^, N, ...","[in, an, email, sent, on, april, <number>, ,, ...","[P, D, N, V, P, ^, $, ,, $, ,, D, A, N, A, ^, ..."
3,North Korea's Kim Jong Un fetes nuclear scient...,SEOUL (Reuters) - North Korean leader Kim Jong...,worldnews,"September 10, 2017",1,"[north, korea's, kim, jong, un, fetes, nuclear...","[^, Z, ^, ^, ^, V, A, N, ,, V, N, N]","[seoul<allcaps>, (, reuters, ), -, north, kore...","[^, ,, ^, ,, ,, ^, ^, N, ^, ^, ^, V, D, A, N, ..."
4,White House developing comprehensive biosecuri...,"ASPEN, Colorado (Reuters) - The Trump administ...",politicsNews,"July 20, 2017",1,"[white, house, developing, comprehensive, bios...","[A, N, V, A, N, N, ,, A]","[aspen<allcaps>, ,, colorado, (, reuters, ), -...","[^, ,, ^, ,, ^, ,, ,, D, ^, N, V, V, D, A, A, ..."
5,LOL! GEORGE LOPEZ Booed Off Stage At Children’...,George Lopez was hired to be the emcee for the...,politics,"Oct 14, 2017",0,"[lol<allcaps>, !, george<allcaps>, lopez<allca...","[!, ,, ^, ^, V, P, N, P, ^, ^, N, P, N, V, D, ...","[george, lopez, was, hired, to, be, the, emcee...","[^, ^, V, V, P, V, D, N, P, D, ^, G, N, N, G, ..."
6,HILLARY CLINTON CRONYISM VIOLATES FEDERAL RULE...,Former Secretary of State Hillary Clinton soug...,politics,"Oct 6, 2016",0,"[hillary<allcaps>, clinton<allcaps>, cronyism<...","[^, ^, N, V, A, N, ,, Z, ,, A, N, ,, V, N, P, ...","[former, secretary, of, state, hillary, clinto...","[A, N, P, ^, ^, ^, V, P, V, ^, &, ^, ^, N, N, ..."
7,Republican Senator Alexander to consult on bip...,WASHINGTON (Reuters) - U.S. Republican Senator...,politicsNews,"September 26, 2017",1,"[republican, senator, alexander, to, consult, ...","[A, N, ^, P, V, P, A, N, N]","[washington<allcaps>, (, reuters, ), -, u.s., ...","[^, ,, ^, ,, ,, ^, ^, ^, ^, ^, V, ^, P, O, V, ..."
8,Kellyanne Conway Announces Trump’s HUGE ‘Than...,Kellyanne Conway accidentally announced exactl...,News,"January 9, 2017",0,"[kellyanne, conway, announces, trump’s, huge<a...","[^, ^, V, Z, A, ,, V, O, ,, N, P, ^, ,, &, L, ...","[kellyanne, conway, accidentally, announced, e...","[^, ^, R, V, R, R, ^, ^, V, P, V, ^, ^, P, D, ..."
9,"Zimbabwe's army seizes power, Mugabe confined ...",HARARE (Reuters) - Zimbabwe s military seized ...,worldnews,"November 15, 2017",1,"["", zimbabwe's, army, seizes, power, ,, mugabe...","[,, Z, N, N, N, ,, ^, V, &, ,, A, ,]","[harare<allcaps>, (, reuters, ), -, zimbabwe, ...","[^, ,, ^, ,, ,, ^, G, A, A, N, P, ^, V, O, V, ..."


In [51]:
isot_title_tokans = isot_data.title_tokcan.values
isot_labels = isot_data.target.values.astype(int)

print('isot titles:', isot_title_tokans)
print('isot labels:', isot_labels)

isot titles: [list(['brainiac<allcaps>', 'gets', 'rejected', 'after', 'trying', 'to', 'buy', 'bmw<allcaps>', 'with', 'ebt<allcaps>', 'card', '…', 'what', 'happens', 'next', 'is', 'hysterical<allcaps>', '!'])
 list(['windows', '<number>', 'is', 'stealing', 'your', 'bandwidth', '(', 'you', 'might', 'want', 'to', 'delete', 'it', ')'])
 list(['stunning<allcaps>', 'story<allcaps>', 'the', 'media', 'and', 'democrats', 'hid', 'from', 'public', ':', 'how', 'obama<allcaps>’s', 'ag<allcaps>', 'eric', 'holder', 'used', 'taxpayer<allcaps>', 'dollars', 'to', 'organize', 'street', 'mobs', 'against', 'george', 'zimmerman', ',', 'take', 'down', 'police', 'chief'])
 ...
 list(['(', 'video<allcaps>', ')', 'the<allcaps>', 'great<allcaps>', 'divider<allcaps>', ':', 'obama<allcaps>', 'pulls<allcaps>', 'out<allcaps>', 'the<allcaps>', 'straw<allcaps>', 'man<allcaps>', 'argument<allcaps>', 'at<allcaps>', 'the<allcaps>', 'poverty<allcaps>', 'summit<allcaps>'])
 list(['seasons<allcaps>', 'beatings<allcaps>', '!

## Neural BOW Model 5: LIAR/Politifact data WITH GloVe embeddings but NO LIWC features

#### Include reference functions for viewing convenience

In [None]:
# May need this info (from utils.py)
'''
def build_vocab(corpus, V=10000, **kw):
    from . import vocabulary
    if isinstance(corpus, list):
        token_feed = (canonicalize_word(w) for w in corpus)
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)
    else:
        token_feed = (canonicalize_word(w) for w in corpus.words())
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)

    print("Vocabulary: {:,} types".format(vocab.size))
    return vocab

# Window and batch functions
def pad_np_array(example_ids, max_len=250, pad_id=0):
    """Pad a list of lists of ids into a rectangular NumPy array.

    Longer sequences will be truncated to max_len ids, while shorter ones will
    be padded with pad_id.

    Args:
        example_ids: list(list(int)), sequence of ids for each example
        max_len: maximum sequence length
        pad_id: id to pad shorter sequences with

    Returns: (x, ns)
        x: [num_examples, max_len] NumPy array of integer ids
        ns: [num_examples] NumPy array of sequence lengths (<= max_len)
    """
    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def id_lists_to_sparse_bow(id_lists, vocab_size):
    """Convert a list-of-lists-of-ids to a sparse bag-of-words matrix.

    Args:
        id_lists: (list(list(int))) list of lists of word ids
        vocab_size: (int) vocab size; must be greater than the largest word id
            in id_lists.

    Returns:
        (scipy.sparse.csr_matrix) where each row is a sparse vector of word
        counts for the corresponding example.
    """
    from scipy import sparse
    ii = []  # row indices (example ids)
    jj = []  # column indices (token ids)
    for row_id, ids in enumerate(id_lists):
        ii.extend([row_id]*len(ids))
        jj.extend(ids)
    x = sparse.csr_matrix((np.ones_like(ii), (ii, jj)),
                          shape=[len(id_lists), vocab_size])
    return x
'''

In [None]:
# These are functions that were in the "SSTDataset" class in sst.py from A2
'''
def get_filtered_split(split='train', df_idxs=None, root_only=False):
    if not hasattr(split):
        raise ValueError("Invalid split name '%s'" % name)
    df = getattr(split)
    if df_idxs is not None:
        df = df.loc[df_idxs]
    #if root_only:          # Should not need in Final Project.
        #df = df[df.is_root]
    return df

def as_padded_array(split='train', max_len=40, pad_id=0,
                    root_only=False, df_idxs=None):
    """Return the dataset as a (padded) NumPy array.
    Longer sequences will be truncated to max_len ids, while shorter ones
    will be padded with pad_id.
    Args:
      split: 'train' or 'test'
      max_len: maximum sequence length
      pad_id: id to pad shorter sequences with
      root_only: if true, will only export root phrases
      df_idxs: (optional) custom list of indices to export
    Returns: (x, ns, y)
      x: [num_examples, max_len] NumPy array of integer ids
      ns: [num_examples] NumPy array of sequence lengths (<= max_len)
      y: [num_examples] NumPy array of target ids
    """
    df = get_filtered_split(split, df_idxs, root_only)
    x, ns = utils.pad_np_array(df.ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(df.label, dtype=np.int32)

def as_sparse_bow(split='train', root_only=False, df_idxs=None):
    from scipy import sparse
    df = get_filtered_split(split, df_idxs, root_only)
    x = utils.id_lists_to_sparse_bow(df['ids'], self.vocab.size)
    y = np.array(df.label, dtype=np.int32)
    return x, y
'''

#### Construct train, dev, test data arrays  

In [29]:
## Training data

# Map every tokenized training sentence to its list of vocabulary ids.
all_train_ids = [vocab.words_to_ids(sentence) for sentence in train_data]
print(all_train_ids[:5])

# Fixed width for the padded id matrix. The value is retained from the ISOT "title" runs;
# the saved output earlier in this notebook shows a 95th-percentile training length of 36.
max_len = 40
train_x, train_ns = utils.pad_np_array(all_train_ids, max_len=max_len)
train_y = train_labels

print()
print(train_x[:2])
print()
print(train_ns[:2])
print(train_y[:2])

[[14, 9, 22, 7, 73, 4446, 1590, 79, 60, 306, 1257, 5, 145, 34, 9, 22, 6, 9, 3], [10, 4447, 7509, 7, 1070, 1292, 668, 766, 17, 5483, 926, 50, 51, 489, 8, 896, 446, 35, 10, 32, 1552, 3], [4, 108, 7510, 27, 545, 34, 390, 3], [124, 20, 639, 110, 1214, 1648, 29, 9, 22, 90, 9, 3], [7512, 3, 2642, 701, 253, 18, 82, 3, 9, 19, 132, 12, 291, 2643, 102, 12, 328, 3]]

[[  14    9   22    7   73 4446 1590   79   60  306 1257    5  145   34
     9   22    6    9    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  10 4447 7509    7 1070 1292  668  766   17 5483  926   50   51  489
     8  896  446   35   10   32 1552    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]

[19 22]
[1 1]


In [30]:
## Dev data

# Map every tokenized dev sentence to its list of vocabulary ids.
all_dev_ids = [vocab.words_to_ids(sentence) for sentence in dev_data]
print(all_dev_ids[:5])

# Same fixed width as the training matrix (value retained from the ISOT "title" runs).
max_len = 40
dev_x, dev_ns = utils.pad_np_array(all_dev_ids, max_len=max_len)
dev_y = dev_labels

print()
print(dev_x[:2])
print()
print(dev_ns[:2])
print(dev_y[:2])

[[13, 77, 125, 5, 44, 68, 11, 21, 10, 2023, 1057, 65, 1651, 15, 4, 96, 9, 67, 9, 42, 5, 66, 1354, 51, 638, 8, 351, 160, 8, 17, 2023, 3, 16], [127, 4, 267, 463, 15, 4, 691, 118, 4, 322, 44, 6, 754, 7, 4556, 59, 97, 118, 24, 4535, 8, 51, 582, 81, 34, 77, 59, 97, 3], [13, 9623, 4827, 11, 912, 5410, 69, 4, 6248, 7, 282, 6167, 62, 44, 7, 1213, 6398, 3, 16], [4, 1009, 95, 1251, 57, 44, 82, 632, 8, 4, 1208, 3], [1456, 20, 1031, 680, 10, 1104, 965, 34, 3793, 3]]

[[  13   77  125    5   44   68   11   21   10 2023 1057   65 1651   15
     4   96    9   67    9   42    5   66 1354   51  638    8  351  160
     8   17 2023    3   16    0    0    0    0    0    0    0]
 [ 127    4  267  463   15    4  691  118    4  322   44    6  754    7
  4556   59   97  118   24 4535    8   51  582   81   34   77   59   97
     3    0    0    0    0    0    0    0    0    0    0    0]]

[33 29]
[0 0]


In [31]:
## Test data

# Map every tokenized test sentence to its list of vocabulary ids.
all_test_ids = [vocab.words_to_ids(sentence) for sentence in test_data]
print(all_test_ids[:5])

# Same fixed width as the training matrix (value retained from the ISOT "title" runs).
max_len = 40
test_x, test_ns = utils.pad_np_array(all_test_ids, max_len=max_len)
test_y = test_labels

print()
print(test_x[:2])
print()
print(test_ns[:2])
print(test_y[:2])

[[532, 223, 62, 9, 116, 62, 86, 6, 663, 5, 6, 290, 5, 12, 1944, 4, 4932, 477, 3], [14, 45, 20, 149, 9, 2578, 6503, 8, 214, 56, 3355, 48, 4, 120, 138, 1258, 3], [4, 3139, 192, 26, 217, 6, 4, 1303, 1386, 12, 26, 217, 264, 843, 3], [14, 4, 257, 7, 501, 713, 110, 11911, 7, 11912, 12, 4, 4185, 375, 33, 198, 224, 1226, 139, 51, 11913, 1671, 35, 4, 836, 3], [13, 14, 181, 385, 912, 10, 1392, 11, 1101, 6455, 3, 16]]

[[ 532  223   62    9  116   62   86    6  663    5    6  290    5   12
  1944    4 4932  477    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  14   45   20  149    9 2578 6503    8  214   56 3355   48    4  120
   138 1258    3    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]

[19 17]
[1 0]


In [32]:
print("Examples:\n", train_x[:3])
print("Original sequence lengths: ", train_ns[:3])
print("Target labels: ", train_y[:3])
print("")
print("Padded:\n", " ".join(vocab.ids_to_words(train_x[0])))
print("Un-padded:\n", " ".join(vocab.ids_to_words(train_x[0,:train_ns[0]])))

Examples:
 [[  14    9   22    7   73 4446 1590   79   60  306 1257    5  145   34
     9   22    6    9    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  10 4447 7509    7 1070 1292  668  766   17 5483  926   50   51  489
     8  896  446   35   10   32 1552    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   4  108 7510   27  545   34  390    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]
Original sequence lengths:  [19 22  8]
Target labels:  [1 1 0]

Padded:
 says <number> percent of texas physicians accept all new medicaid patients , down from <number> percent in <number> . <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s>
Un-padded:
 says <number> percent of texas physicians accept all new medicaid patients , dow

### Use tf.Estimator API along with nbow_models_x.py 

#### Things to consider:  
- Start with 2 epochs (the original setting was 20)  
- Consider using dropout in the fully-connected layers  
- Use embed_dim = 300 rather than 50?  
- (further ideas to be added)  


In [33]:
print('vocab size:', vocab.size)

vocab size: 14460


In [37]:
## Setup model framework
## (Must specify correct nbow_model_x name in this cell to use the correct nbow_model_x.py file.)

import nbow_model_5
reload(nbow_model_5)

# Hyperparameters handed to the model_fn through the Estimator's `params`.
# Notes kept from the earlier version of this cell: embed_dim2=74 would cover all LIWC
# features, and adding a liwc_dim parameter is still an open idea.
model_params = {
    'V': vocab.size,
    'embed_dim': 300,
    'hidden_dims': [25],
    'num_classes': 2,
    'encoder_type': 'bow',
    'lr': 0.1,
    'optimizer': 'adagrad',  # 'adam' is the alternative, noted as slower here
    'beta': 0.01,
}

# Timestamped checkpoint directory; a leftover directory with the same name is removed first.
checkpoint_dir = "/tmp/tf_nbow_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)

# Write vocabulary to file, so TensorBoard can label embeddings
# (creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv).
vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(
    model_fn=nbow_model_5.classifier_model_fn,
    params=model_params,
    model_dir=checkpoint_dir,
)

print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (14,460 words) written to '/tmp/tf_nbow_20181203-0359/metadata.tsv'
Projector config written to /tmp/tf_nbow_20181203-0359/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_nbow_20181203-0359', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdd0a29ea58>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_nbow_20181203-0359' --port 6006

Then in your browser, open: http://localhost

In [38]:
## Train the model, evaluating on the dev data after every round of training

# Settings used only in this cell to build the input_fn-s:
# 10 epochs in total, with a dev evaluation after each single epoch.
train_params = {"batch_size": 32, "total_epochs": 10, "eval_every": 1}
assert train_params['total_epochs'] % train_params['eval_every'] == 0
num_rounds = train_params['total_epochs'] // train_params['eval_every']

# Training batches: shuffled with a fixed seed. Each input_fn pass covers
# 'eval_every' epochs, so one model.train() call trains for that many epochs.
# NOTE: uses patched_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
    x=dict(ids=train_x, ns=train_ns),
    y=train_y,
    batch_size=train_params['batch_size'],
    num_epochs=train_params['eval_every'],
    shuffle=True,
    seed=42,
)

# Dev batches: same feature layout as above, but in fixed order and for exactly one epoch.
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=dict(ids=dev_x, ns=dev_ns),
    y=dev_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

# Alternate between training and dev evaluation; eval_metrics keeps the latest dev result.
for _ in range(num_rounds):
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tf_nbow_20181203-0359/model.ckpt.
INFO:tensorflow:loss = 108.4771, step = 1
INFO:tensorflow:global_step/sec: 151.514
INFO:tensorflow:loss = 68.38412, step = 101 (0.662 sec)
INFO:tensorflow:global_step/sec: 184.833
INFO:tensorflow:loss = 60.927498, step = 201 (0.541 sec)
INFO:tensorflow:global_step/sec: 189.04
INFO:tensorflow:loss = 46.856995, step = 301 (0.529 sec)
INFO:tensorflow:global_step/sec: 188.349
INFO:tensorflow:loss = 44.080967, step = 401 (0.531 sec)
INFO:tensorflow:Saving checkpoints for 405 into /tmp/tf_nbow_20181203-0359/model.ckpt.
INFO:tensorflow:Loss for final step: 10.575054.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at

INFO:tensorflow:Restoring parameters from /tmp/tf_nbow_20181203-0359/model.ckpt-2025
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2025 into /tmp/tf_nbow_20181203-0359/model.ckpt.
INFO:tensorflow:loss = 20.98408, step = 2026
INFO:tensorflow:global_step/sec: 160.922
INFO:tensorflow:loss = 22.141224, step = 2126 (0.623 sec)
INFO:tensorflow:global_step/sec: 190.555
INFO:tensorflow:loss = 26.584892, step = 2226 (0.527 sec)
INFO:tensorflow:global_step/sec: 187.923
INFO:tensorflow:loss = 21.485044, step = 2326 (0.532 sec)
INFO:tensorflow:global_step/sec: 199.607
INFO:tensorflow:loss = 22.993546, step = 2426 (0.502 sec)
INFO:tensorflow:Saving checkpoints for 2430 into /tmp/tf_nbow_20181203-0359/model.ckpt.
INFO:tensorflow:Loss for final step: 4.9619527.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-03-04:00:33
INFO:tensorflow:Graph was finalized

In [39]:
## Evaluate the trained model on the held-out test data
# NOTE(review): this cell was originally labelled "(ISOT) Test data", but ISOT is
# evaluated separately from isot_x in a later cell -- confirm which corpus test_x comes from.

# One ordered pass over the test examples.
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=dict(ids=test_x, ns=test_ns),
    y=test_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")

print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-03-04:01:15
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_nbow_20181203-0359/model.ckpt-4050
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-03-04:01:15
INFO:tensorflow:Saving dict for global step 4050: accuracy = 0.69852144, cross_entropy_loss = 0.59404063, global_step = 4050, loss = 67.47291
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 4050: /tmp/tf_nbow_20181203-0359/model.ckpt-4050
Accuracy on test set: 69.85%


{'accuracy': 0.69852144,
 'cross_entropy_loss': 0.59404063,
 'loss': 67.47291,
 'global_step': 4050}

In [40]:
## We can also evaluate the old-fashioned way, by calling model.predict(...) and working with the predicted labels directly.
## (accuracy_score is already imported in the imports section at the top of the notebook.)

# model.predict yields one dict per example; the 'max' entry (a key defined by the
# model_fn) is used here as the predicted label.
predictions = list(model.predict(test_input_fn))  # list of dicts
y_pred = [p['max'] for p in predictions]

# accuracy_score's signature is (y_true, y_pred): pass the gold labels first so the
# call stays correct if it is later swapped for an asymmetric metric.
acc = accuracy_score(test_y, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_nbow_20181203-0359/model.ckpt-4050
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Accuracy on test set: 69.85%


##### Accuracy of 70% is better than the baseline NB result of 62%.  

### Create padded ISOT data and apply prediction function to ISOT data.  



In [53]:
## ISOT data padding

# Convert every ISOT title (the list of word tokens for one sentence) into vocabulary ids.
# `isot_title_tokans` (sic) is the name used where the tokens are defined earlier in the notebook.
all_isot_ids = [vocab.words_to_ids(tokens) for tokens in isot_title_tokans]
print(all_isot_ids[:5])

max_len = 40   # Retain this setting, since it fits the ISOT "title" length distribution quite well.
# pad_np_array returns the padded id matrix together with the per-row sequence lengths.
isot_x, isot_ns = utils.pad_np_array(all_isot_ids, max_len=max_len)
print()
print(isot_x[:2])
print()
print(isot_ns[:2])

isot_y = isot_labels
# Every padded row needs exactly one length and one label, otherwise the
# evaluation input_fn would pair features with the wrong targets.
assert len(isot_x) == len(isot_ns) == len(isot_y), "ISOT features and labels differ in length"
print(isot_y[:2])

[[2, 831, 2434, 183, 618, 8, 382, 2, 35, 2, 1234, 2, 165, 2859, 363, 18, 2, 425], [6364, 9, 18, 8701, 177, 2, 30, 61, 1364, 329, 8, 14235, 33, 31], [2, 2, 4, 977, 12, 169, 8032, 34, 117, 325, 421, 2, 2, 1120, 3443, 294, 2, 155, 8, 8419, 469, 2, 129, 289, 8421, 5, 254, 145, 374, 1433], [591, 2, 4383, 2, 12000, 2, 431, 1658, 5, 3299, 2, 2], [282, 136, 3248, 3308, 2, 3606, 325, 1466]]

[[    2   831  2434   183   618     8   382     2    35     2  1234     2
    165  2859   363    18     2   425     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [ 6364     9    18  8701   177     2    30    61  1364   329     8 14235
     33    31     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]

[18 14]
[0 0]


### NOTE: STOP! The next step requires manually editing the liwc_features function in the nbow_model_5.py file to set the correct LIWC type.


##############################
# PROCEED ONLY AFTER UPDATING nbow_model_5.py file!!!!!
##############################

In [54]:
## Evaluate model on ISOT "title" data

#### STOP! ####
### NOTE: the LIWC file to use MUST be selected inside the nbow_model_x.py file,
### specifically in its liwc_features function.

##############################
# PROCEED ONLY AFTER UPDATING nbow_model_5.py file!
##############################

reload(nbow_model_5)   # re-execute the module after the manual edit described above

# One ordered pass over the padded ISOT examples.
test_input_fn_isot = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": isot_x, "ns": isot_ns}, y=isot_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

# Use a dedicated evaluation name: reusing name="test" would store the ISOT metrics
# in the same evaluation folder as the earlier test-set evaluation.
eval_metrics = model.evaluate(input_fn=test_input_fn_isot, name="isot")

print("Accuracy on ISOT set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-03-04:34:22
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_nbow_20181203-0359/model.ckpt-4050
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-03-04:34:25
INFO:tensorflow:Saving dict for global step 4050: accuracy = 0.55229634, cross_entropy_loss = 0.95396733, global_step = 4050, loss = 158.98058
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 4050: /tmp/tf_nbow_20181203-0359/model.ckpt-4050
Accuracy on ISOT set: 55.23%


{'accuracy': 0.55229634,
 'cross_entropy_loss': 0.95396733,
 'loss': 158.98058,
 'global_step': 4050}

#### Prediction accuracy of 55% for ISOT data.  