# LIAR DETECTION GROUP PROJECT

Run the cell below to import packages.

In [7]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML
from sklearn.utils import shuffle
# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz



### Load data
Loading the "Fake News" dataset from the Information security and object technology (ISOT) Research lab at the University of Victoria School of Engineering.

The ISOT Fake News Dataset is a compilation of several thousands fake news and truthful articles, obtained from different legitimate news sites and sites flagged as unreliable by politifact.com.

In [57]:
# define each downloaded file
FAKE_FILENAME = 'fake.csv'
TRUE_FILENAME = 'true.csv'

# define the downloaded file path 
DATAPATH = './datasets/ISOT FakeNews/'

def get_data(filename):
    '''Read CSV file into a pandas dataframe'''
      
    filepath = DATAPATH + filename
    return pd.read_csv(filepath, header=0, sep=',', quotechar='"')


fake_data = get_data(FAKE_FILENAME)
true_data = get_data(TRUE_FILENAME)



# add a label column to the data
fake_data.loc[:,'target'] = '0'
true_data['target'] = '1'

#append the datasets and shuffle them
all_data = true_data.append(fake_data, ignore_index=True)
all_data = all_data.sample(frac=1).reset_index(drop=True)

all_data.describe()

Unnamed: 0,title,text,subject,date,target
count,44898,44898.0,44898,44898,44898
unique,38729,38646.0,8,2397,2
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",0
freq,14,627.0,11272,182,23481


### Cleanup
Check for NA values.

May not want the dataset to contain the 'subject' since all the true news data comes from "Reuters"

In [71]:
all_data.isna().sum()

title      0
text       0
subject    0
date       0
target     0
dtype: int64

In [72]:
all_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
title      44898 non-null object
text       44898 non-null object
subject    44898 non-null object
date       44898 non-null object
target     44898 non-null object
dtypes: object(5)
memory usage: 151.8 MB


### Tokenize and Canonicalize Text

Need to work on Tokenize and Canonicalizing text. Words like "Obama's" need to be corrected. Do we need to mark of sentences within a text? Might want to use some regex code from camron.

In [101]:
def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"

def build_vocab(corpus, V=None, **kw):
    if isinstance(corpus, list):
        token_feed = (utils.canonicalize_word(w) for w in corpus)
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)
    print("Vocabulary: {:,} types".format(vocab.size))
    return vocab

teststring = 'My name is Abhishek. I have no clue. Learning the back-portion that I never cared for. Obama''s nephew.'
#utils.canonicalize_word(teststring.split())
vocab=build_vocab(teststring.split())
print("{:,} words".format(vocab.size))
print("wordset: ",vocab.ordered_words())



Vocabulary: 20 types
20 words
wordset:  ['<s>', '</s>', '<unk>', 'i', 'my', 'name', 'is', 'abhishek.', 'have', 'no', 'clue.', 'learning', 'the', 'back-portion', 'that', 'never', 'cared', 'for.', 'obamas', 'nephew.']


### Train and Test Split

In [69]:
#train and dev split
train_dev_split = 0.8

train_data = all_data[:int(len(all_data)*train_dev_split)].reset_index(drop=True)
dev_data = all_data[int(len(all_data)*train_dev_split):].reset_index(drop=True)

print('training data: ',train_data.shape)
print('dev data: ',dev_data.shape)

training data:  (35918, 5)
dev data:  (8980, 5)


In [70]:
train_data.head(5)

Unnamed: 0,title,text,subject,date,target
0,U.S. Senate support for Trump education nomine...,WASHINGTON (Reuters) - Public refusals on Wedn...,politicsNews,"February 1, 2017",1
1,Trump's Indian-American fan spreads the word b...,NEW DELHI (Reuters) - Donald Trump sympathizes...,politicsNews,"October 7, 2016",1
2,ERIC HOLDER DOUBLES DOWN with Second Threat Ag...,Eric Holder just doubled down on threats this ...,politics,"Dec 17, 2017",0
3,U.S. plans to update self-driving guidelines i...,DETROIT/WASHINGTON (Reuters) - President Donal...,politicsNews,"June 5, 2017",1
4,"Leaks of Manchester bomb probe 'reprehensible,...",LONDON (Reuters) - The most senior U.S. diplom...,politicsNews,"May 25, 2017",1


### Sandbox

delete eveything below when notebook complete

In [76]:
#df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
#df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))



all_data.iloc[1]['text']



'NEW DELHI (Reuters) - Donald Trump sympathizes with India in its recent escalation of tensions with Pakistan and supports skilled immigration, an adviser said on Friday, portraying the U.S. presidential hopeful as a friend of India and Indian Americans. Trump, a real estate billionaire, has earned a reputation of hostility toward minorities with proposals such as “extreme vetting” of potential immigrants and building a wall along the Mexican border to stop illegal immigration.  The Republican nominee has proposed a ban on immigration from countries where vetting would be difficult, such as nations faced with Islamic militancy. Some Indian officials worry the United States could become more isolationist under Trump, leaving allies like New Delhi without the support it has enjoyed under President Barack Obama against China’s growing regional influence.      Shalabh Kumar, a Chicago-based businessman of Indian origin tasked by the Trump campaign with reaching out to Asian-Americans, said