# LIAR DETECTION GROUP PROJECT - Neural BOW Models  


### CONTENTS  

Imports  
Load ISOT data from appropriate pickle file  
Load ISOT vocabulary from pickle file  (note: vocab contains both "title" and "text" words)  
Train/Dev/Test split ISOT data  
Load LIAR data (for evaluating models)  

#### Neural BOW Models:
- Model_1: Initial run replicating settings from Assignment 2, but with ISOT "title" data. 
- Model_1a: Same as Model_1 except don't sum the xs_'s.  NOT YET IMPLEMENTED...




    

In [None]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from functools import reduce
import unittest
from IPython.display import display, HTML
from sklearn.utils import shuffle
# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
#assert(tf.__version__.startswith("1.8"))

import pickle
import dill
# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz
from w266_common import patched_numpy_io
import timeit  #For timing


In [None]:
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# tf.__version__ is defined by every TensorFlow release, whereas tf.VERSION only exists in 1.x.
print('TensorFlow version:', tf.__version__)

### Load ISOT data and vocabulary from pickle files  
This section loads the dataset published by the Information Security and Object Technology (ISOT) research lab at the University of Victoria School of Engineering.

The ISOT Fake News Dataset is a compilation of several thousand fake-news and truthful articles, obtained from different legitimate news sites and from sites flagged as unreliable by politifact.com.

In [None]:
# Read ISOT data from pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files produced by this project.
all_data = pd.read_pickle('parsed_data/df_alldata2.pkl')  # ISOT data (CMU) tokenized and POS tags added
# Print per-column info and the "deep" memory footprint of the loaded frame.
all_data.info(memory_usage='deep', verbose=True)

In [None]:
all_data.head()

In [None]:
all_data.title[0]

In [None]:
all_data.title_tokcan[0]

In [None]:
# Read ISOT vocab from pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files produced by this project.

vocab = pd.read_pickle('parsed_data/vocab.pkl')  # pickled vocabulary object (covers words from both the ISOT "title" and "text" fields)

In [None]:
print("{:,} words".format(vocab.size))  # Note: this combines words from ISOT "title" AND "text" fields!
print("wordset: ",vocab.ordered_words()[:30])
print(vocab)

In [None]:
print('ISOT ALL target=real:', len(all_data.target[all_data.target == '1']))
print('ISOT ALL target=fake:', len(all_data.target[all_data.target == '0']))

### Train / Dev / Test Split ISOT data

In [None]:
# Train/dev/test split: three contiguous slices of all_data, taken in its existing row order.
# NOTE(review): nothing is shuffled here -- confirm the pickled frame is already shuffled,
# otherwise the three splits may end up with different class balances.

train_fract = 0.70
dev_fract = 0.15
test_fract = 0.15

# Compare with a tolerance: exact float equality fails for perfectly valid
# fractions that are not representable in binary (e.g. 0.6 + 0.3 + 0.1 != 1.0).
if np.isclose(train_fract + dev_fract + test_fract, 1.0):
    print('Split fractions add up to 1.0')
else:
    print('SPLIT FRACTIONS DO NOT ADD UP TO 1.0; PLEASE TRY AGAIN.............')

# Compute the two cut points once so adjacent slices always share the same boundary.
n_examples = len(all_data)
train_end = int(n_examples * train_fract)
dev_end = int(n_examples * (train_fract + dev_fract))

train_set = all_data[:train_end].reset_index(drop=True)
dev_set = all_data[train_end:dev_end].reset_index(drop=True)
test_set = all_data[dev_end:].reset_index(drop=True)

print('training set: ',train_set.shape)
print('dev set: ',dev_set.shape)
print('test set: ',test_set.shape)

In [None]:
train_set.head()

In [None]:
dev_set.head()

In [None]:
test_set.head()

### Select ISOT features and labels for training model 

In [None]:
# Features come from the `title_tokcan` column of each split.
train_data = train_set.title_tokcan.values
dev_data = dev_set.title_tokcan.values
test_data = test_set.title_tokcan.values

# Labels come from the `target` column, cast to int.
train_labels = train_set.target.values.astype(int)
dev_labels = dev_set.target.values.astype(int)
test_labels = test_set.target.values.astype(int)

# Report every split the same way: shapes, the first example, and the label array,
# with a blank line separating consecutive splits.
split_summaries = (
    ('train', train_data, train_labels),
    ('dev', dev_data, dev_labels),
    ('test', test_data, test_labels),
)
for position, (split_name, split_x, split_y) in enumerate(split_summaries):
    if position > 0:
        print()
    print('{}_data shape:'.format(split_name), split_x.shape)
    print(split_x[:1])
    print('{}_labels shape:'.format(split_name), split_y.shape)
    print(split_y)


In [None]:
# Characterize the length of the documents in train_data.

# Iterate the examples directly instead of indexing through range(shape[0]).
lengths = [len(doc) for doc in train_data]

a = np.array(lengths)
p = np.percentile(a, 95)  # 95th percentile of the training-example lengths
print('95th percentile:', p)

In [None]:
# Bokeh for plotting.
import bokeh.plotting as bp
from bokeh.models import HoverTool
bp.output_notebook()

# Helper code for plotting histograms
def plot_length_histogram(lengths, x_range=(0, 100), bins=40, normed=True):
    """Render a Bokeh bar-chart histogram of example lengths in the notebook.

    Args:
        lengths: sequence of numeric lengths, one entry per example.
        x_range: (low, high) range handed to np.histogram as `range`.
        bins: bin specification handed to np.histogram (40 bins by default).
        normed: if True, bar heights are a probability density instead of
            raw counts.

    Returns:
        None; the figure is displayed via bp.show().
    """
    # `density` is the supported spelling of np.histogram's old `normed`
    # keyword, which was deprecated and then removed from NumPy.
    hist, bin_edges = np.histogram(a=lengths, bins=bins, density=normed, range=x_range)
    bin_centers = (bin_edges[1:] + bin_edges[:-1])/2
    bin_widths =  (bin_edges[1:] - bin_edges[:-1])

    hover = HoverTool(tooltips=[("bucket", "@x"), ("count", "@top")], mode="vline")
    # NOTE(review): plot_width/plot_height are the names used by the Bokeh version
    # this notebook was written against -- confirm before upgrading Bokeh.
    fig = bp.figure(plot_width=800, plot_height=400, tools=[hover])
    fig.vbar(x=bin_centers, width=bin_widths, top=hist, hover_fill_color="firebrick")
    fig.y_range.start = 0
    fig.x_range.start = 0
    fig.xaxis.axis_label = "Example length (number of tokens)"
    fig.yaxis.axis_label = "Frequency"
    bp.show(fig)

In [None]:
plot_length_histogram(lengths)

### Load LIAR data to evaluate various models below.  

In [None]:
# Do NOT read LIAR data from this pickle file:
#liar_data = pd.read_pickle('parsed_data/df_liardata2.pkl')  # data (CMU) tokenized and POS tags added
# Heads-up on df_liardata2.pkl: it looks like "mostly-false" was used in place of
# "barely_true" while the file was built, which would mean the "barely_true"
# items were omitted from the pickled file.


# Use the binary-label version of the LIAR data instead.
liar_data = pd.read_pickle('parsed_data/df_liardata2binary.pkl')  # data (CMU) tokenized and POS tags added



liar_data.info(memory_usage='deep', verbose=True)

In [None]:
liar_data.head(10)

In [None]:
# List the distinct values of the binary label column, then count the rows for each one.
binary_targets = liar_data.binary_target.unique()
print(binary_targets)

print('\nbinary_target,  number of examples')
for binary_target in binary_targets:
    is_match = liar_data.binary_target.eq(binary_target)
    print(binary_target, int(is_match.sum()))

In [None]:
liar_data_binary = liar_data[liar_data.binary_target >= 0]  ## discard "half-true"!!!!
liar_data_binary.head(10)

In [None]:
liar_data_binary = liar_data_binary.reset_index(drop=True)
liar_data_binary.head(10)

In [None]:
# Report the class balance of the filtered frame (liar_data_binary), since that is
# the data the models are evaluated on.
print('LIAR true:', len(liar_data_binary[liar_data_binary.binary_target == 1]))
print('LIAR false:', len(liar_data_binary[liar_data_binary.binary_target == 0]))

In [None]:
# Extract the LIAR labels and the `title_tokcan` column as NumPy arrays.
liar_labels = liar_data_binary['binary_target'].values
liar_title_tokans = liar_data_binary['title_tokcan'].values

print('liar titles:', liar_title_tokans)
print('liar labels:', liar_labels)

## Neural BOW Model 1: ISOT "title" data without GloVe embeddings

#### Add in reference functions for viewing convenience

In [None]:
# May need this info (from utils.py)
'''
def build_vocab(corpus, V=10000, **kw):
    from . import vocabulary
    if isinstance(corpus, list):
        token_feed = (canonicalize_word(w) for w in corpus)
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)
    else:
        token_feed = (canonicalize_word(w) for w in corpus.words())
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)

    print("Vocabulary: {:,} types".format(vocab.size))
    return vocab

# Window and batch functions
def pad_np_array(example_ids, max_len=250, pad_id=0):
    """Pad a list of lists of ids into a rectangular NumPy array.

    Longer sequences will be truncated to max_len ids, while shorter ones will
    be padded with pad_id.

    Args:
        example_ids: list(list(int)), sequence of ids for each example
        max_len: maximum sequence length
        pad_id: id to pad shorter sequences with

    Returns: (x, ns)
        x: [num_examples, max_len] NumPy array of integer ids
        ns: [num_examples] NumPy array of sequence lengths (<= max_len)
    """
    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def id_lists_to_sparse_bow(id_lists, vocab_size):
    """Convert a list-of-lists-of-ids to a sparse bag-of-words matrix.

    Args:
        id_lists: (list(list(int))) list of lists of word ids
        vocab_size: (int) vocab size; must be greater than the largest word id
            in id_lists.

    Returns:
        (scipy.sparse.csr_matrix) where each row is a sparse vector of word
        counts for the corresponding example.
    """
    from scipy import sparse
    ii = []  # row indices (example ids)
    jj = []  # column indices (token ids)
    for row_id, ids in enumerate(id_lists):
        ii.extend([row_id]*len(ids))
        jj.extend(ids)
    x = sparse.csr_matrix((np.ones_like(ii), (ii, jj)),
                          shape=[len(id_lists), vocab_size])
    return x
'''

In [None]:
# These are functions that were in the "SSTDataset" class in sst.py from A2
'''
def get_filtered_split(split='train', df_idxs=None, root_only=False):
    if not hasattr(split):
        raise ValueError("Invalid split name '%s'" % name)
    df = getattr(split)
    if df_idxs is not None:
        df = df.loc[df_idxs]
    #if root_only:          # Should not need in Final Project.
        #df = df[df.is_root]
    return df

def as_padded_array(split='train', max_len=40, pad_id=0,
                    root_only=False, df_idxs=None):
    """Return the dataset as a (padded) NumPy array.
    Longer sequences will be truncated to max_len ids, while shorter ones
    will be padded with pad_id.
    Args:
      split: 'train' or 'test'
      max_len: maximum sequence length
      pad_id: id to pad shorter sequences with
      root_only: if true, will only export root phrases
      df_idxs: (optional) custom list of indices to export
    Returns: (x, ns, y)
      x: [num_examples, max_len] NumPy array of integer ids
      ns: [num_examples] NumPy array of sequence lengths (<= max_len)
      y: [num_examples] NumPy array of target ids
    """
    df = get_filtered_split(split, df_idxs, root_only)
    x, ns = utils.pad_np_array(df.ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(df.label, dtype=np.int32)

def as_sparse_bow(split='train', root_only=False, df_idxs=None):
    from scipy import sparse
    df = get_filtered_split(split, df_idxs, root_only)
    x = utils.id_lists_to_sparse_bow(df['ids'], self.vocab.size)
    y = np.array(df.label, dtype=np.int32)
    return x, y
'''

#### Construct train, dev, test data arrays  

In [None]:
## Training data

# Convert every example (a sequence of tokens) into its list of vocabulary ids.
all_train_ids = [vocab.words_to_ids(tokens) for tokens in train_data]
print(all_train_ids[:5])

max_len = 40   # Retain this setting, since it fits the ISOT "title" length distribution quite well.
# Pad/truncate to max_len ids per example; the second return value holds the sequence lengths.
train_x, train_ns = utils.pad_np_array(all_train_ids, max_len=max_len)
print()
print(train_x[:2])
print()
print(train_ns[:2])

train_y = train_labels
print(train_y[:2])

In [None]:
## Dev data

# Convert every example (a sequence of tokens) into its list of vocabulary ids.
all_dev_ids = [vocab.words_to_ids(tokens) for tokens in dev_data]
print(all_dev_ids[:5])

max_len = 40   # Retain this setting, since it fits the ISOT "title" length distribution quite well.
# Pad/truncate to max_len ids per example; the second return value holds the sequence lengths.
dev_x, dev_ns = utils.pad_np_array(all_dev_ids, max_len=max_len)
print()
print(dev_x[:2])
print()
print(dev_ns[:2])

dev_y = dev_labels
print(dev_y[:2])

In [None]:
## Test data

# Convert every example (a sequence of tokens) into its list of vocabulary ids.
all_test_ids = [vocab.words_to_ids(tokens) for tokens in test_data]
print(all_test_ids[:5])

max_len = 40   # Retain this setting, since it fits the ISOT "title" length distribution quite well.
# Pad/truncate to max_len ids per example; the second return value holds the sequence lengths.
test_x, test_ns = utils.pad_np_array(all_test_ids, max_len=max_len)
print()
print(test_x[:2])
print()
print(test_ns[:2])

test_y = test_labels
print(test_y[:2])

In [None]:
print("Examples:\n", train_x[:3])
print("Original sequence lengths: ", train_ns[:3])
print("Target labels: ", train_y[:3])
print("")
print("Padded:\n", " ".join(vocab.ids_to_words(train_x[0])))
print("Un-padded:\n", " ".join(vocab.ids_to_words(train_x[0,:train_ns[0]])))

### Use tf.Estimator API along with nbow_models.py 

#### Things to consider:  
- Start w/ 2 epochs (20 was original)       
- Consider use of dropouts in fully-connected layers     
-  Use embed_dim = 300 rather than 50??  
- Try to fix the TensorBoard display issue (http://localhost:6006 is not working: "This site can't be reached; ERR_CONNECTION_REFUSED").  
- xx  
...  


In [None]:
## Setup model framework

# Import the model module and reload it, so edits to nbow_model_1.py take effect without restarting the kernel.
import nbow_model_1; reload(nbow_model_1)

# Specify model hyperparameters as used by model_fn
# NOTE(review): the meaning of each key (e.g. `beta`) is defined by nbow_model_1.classifier_model_fn,
# which is not visible here -- confirm there before changing values.
model_params = dict(V=vocab.size, embed_dim=50, hidden_dims=[25], num_classes=2,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)  # can set optimizer to 'adagrad' or 'adam', which is slower here

# The directory name is a timestamp with minute resolution; if a directory with
# that name already exists, it is deleted so the run starts from a clean state.
checkpoint_dir = "/tmp/tf_nbow_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
# Write vocabulary to file, so TensorBoard can label embeddings.
# creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv
# NOTE(review): checkpoint_dir may not exist at this point; presumably
# write_projector_config creates it -- confirm in w266_common.vocabulary.
vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

# Build the Estimator around the model function; checkpoints and summaries go to checkpoint_dir.
model = tf.estimator.Estimator(model_fn=nbow_model_1.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

In [None]:
## Train model and Evaluate on Dev data

# Training params, just used in this cell for the input_fn-s
# total_epochs: total number of training epochs; eval_every: epochs trained between dev evaluations.
train_params = dict(batch_size=32, total_epochs=10, eval_every=1)
# total_epochs must be a whole multiple of eval_every, because the loop below
# trains in chunks of eval_every epochs.
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)

# Construct and train the model, saving checkpoints to the directory above.
# Input function for training set batches
# Do 'eval_every' epochs at once, followed by evaluating on the dev set.
# NOTE: use patched_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns}, y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )

# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": dev_x, "ns": dev_ns}, y=dev_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

# Each iteration trains for eval_every epochs (the num_epochs of train_input_fn)
# and then evaluates on dev. eval_metrics is overwritten on every pass, so only
# the metrics from the final evaluation remain after the loop.
for _ in range(train_params['total_epochs'] // train_params['eval_every']):
    # Train for a few epochs, then evaluate on dev
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

In [None]:
## Evaluate model on (ISOT) Test data

# One ordered pass over the held-out ISOT test split.
test_features = {"ids": test_x, "ns": test_ns}
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=test_features,
    y=test_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")

test_accuracy = eval_metrics['accuracy']
print("Accuracy on test set: {:.02%}".format(test_accuracy))
eval_metrics

In [None]:
## We can also evaluate the old-fashioned way, by calling model.predict(...) and working with the predicted labels directly:

from sklearn.metrics import accuracy_score
predictions = list(model.predict(test_input_fn))  # list of dicts
# 'max' is the key read from each prediction dict as the predicted label.
y_pred = [p['max'] for p in predictions]
# scikit-learn's documented argument order is (y_true, y_pred). Accuracy happens to be
# symmetric, but keeping the order right matters when other metrics are swapped in.
acc = accuracy_score(test_y, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

##### Accuracy on the ISOT test set is higher for the Neural BOW model than for the baseline NB model: 98% vs. 95%. (Note that we achieve the same accuracy with Neural BOW on the ISOT set regardless of whether we use 50 or 300 for the embedding dimension.)

#### Create padded LIAR data and apply prediction function to LIAR data.

In [None]:
## LIAR data padding

# Convert every LIAR example (a sequence of tokens) into ids from the ISOT vocabulary.
all_liar_ids = [vocab.words_to_ids(tokens) for tokens in liar_title_tokans]
print(all_liar_ids[:5])

max_len = 40   # Retain this setting, since it fits the ISOT "title" length distribution quite well.
# Pad/truncate to max_len ids per example; the second return value holds the sequence lengths.
liar_x, liar_ns = utils.pad_np_array(all_liar_ids, max_len=max_len)
print()
print(liar_x[:2])
print()
print(liar_ns[:2])

liar_y = liar_labels
print(liar_y[:2])

In [None]:
## Evaluate model on LIAR data

# One ordered pass over the LIAR examples.
test_input_fn_liar = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": liar_x, "ns": liar_ns}, y=liar_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

# Use a distinct evaluation name: the Estimator stores metrics per name, so reusing
# "test" would mix the LIAR results into the ISOT test-set results in TensorBoard.
eval_metrics = model.evaluate(input_fn=test_input_fn_liar, name="liar")

print("Accuracy on LIAR set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

##### Prediction accuracy on LIAR data is poor, and slightly worse than the baseline NB model: 52% vs. 53%.