# LIAR DETECTION GROUP PROJECT

Run the cell below to import packages.

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML
from sklearn.utils import shuffle
# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz


### Load data
Loading the "Fake News" dataset from the Information security and object technology (ISOT) Research lab at the University of Victoria School of Engineering.

The ISOT Fake News Dataset is a compilation of several thousand fake and truthful news articles, obtained from different legitimate news sites and from sites flagged as unreliable by politifact.com.

In [2]:
# define each downloaded file
FAKE_FILENAME = 'Fake.csv'
TRUE_FILENAME = 'True.csv'

# define the downloaded file path
DATAPATH = './datasets/ISOT_FakeNews/'

def get_data(filename, datapath=None):
    '''Read a CSV file into a pandas dataframe.

    Args:
        filename: name of the CSV file inside the data directory
            (e.g. FAKE_FILENAME or TRUE_FILENAME).
        datapath: directory that contains the file. Defaults to DATAPATH
            when omitted, so existing one-argument calls are unchanged.

    Returns:
        A DataFrame whose column names come from the first row of the file.
    '''
    if datapath is None:
        datapath = DATAPATH
    # os.path.join copes with a directory given with or without a trailing
    # separator, unlike plain string concatenation.
    filepath = os.path.join(datapath, filename)
    return pd.read_csv(filepath, header=0, sep=',', quotechar='"')


# Seed for the shuffle below, so that the row order (and any split that is
# later sliced out of all_data) is the same after every kernel restart.
RANDOM_SEED = 42

fake_data = get_data(FAKE_FILENAME)
true_data = get_data(TRUE_FILENAME)

# add a label column to the data with the target values
# ('0' marks rows from the fake file, '1' rows from the true file)
fake_data['target'] = '0'
true_data['target'] = '1'

# concatenate the datasets and shuffle them
# (pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.0)
all_data = pd.concat([true_data, fake_data], ignore_index=True)
all_data = all_data.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

all_data.describe()

Unnamed: 0,title,text,subject,date,target
count,44898,44898.0,44898,44898,44898
unique,38729,38646.0,8,2397,2
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",0
freq,14,627.0,11272,182,23481


### Cleanup
Check for NA values.

We may want to drop the 'subject' column from the dataset, since all of the true news data comes from "Reuters".

In [3]:
# Count the missing (NA) values in each column
all_data.isna().sum()

title      0
text       0
subject    0
date       0
target     0
dtype: int64

In [4]:
# Show per-column dtypes and non-null counts, plus the deep memory footprint
all_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
title      44898 non-null object
text       44898 non-null object
subject    44898 non-null object
date       44898 non-null object
target     44898 non-null object
dtypes: object(5)
memory usage: 151.9 MB


### Tokenize and Canonicalize Text

We still need to work on tokenizing and canonicalizing the text. Words like "Obama's" need to be handled correctly. Do we need to mark off sentences within a text? We might want to use some regex code from camron.

In [5]:
"""
Source:  https://gist.github.com/tokestermw/cb87a97113da12acb388
"""

# Flags shared by every substitution performed in tokenize()
FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    '''Replacement callback for a regex match on a "#hashtag".

    Args:
        text: match object whose matched string starts with '#'.

    Returns:
        The lower-cased body padded with spaces when the body is all upper
        case; otherwise "<hashtag>" followed by the body split in front of
        each capital letter, joined with single spaces.
    '''
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        # findall gives the same pieces on every Python version; re.split on
        # a zero-width lookahead only splits on Python 3.7+ (and then also
        # yields a leading empty string).
        pieces = re.findall(r"[A-Z][^A-Z]*|[^A-Z]+", hashtag_body)
        result = " ".join(["<hashtag>"] + pieces)
    return result

def allcaps(text):
    '''Replacement callback: lower-case the match and append " <allcaps>".'''
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    '''Canonicalize a string and split it into a list of lower-case tokens.

    URLs, @mentions, emoticons, "<3", numbers, repeated punctuation,
    elongated words and runs of capitals are replaced by (or tagged with)
    placeholder tokens such as <url>, <user>, <smile>, <number>, <repeat>,
    <elong> and <allcaps>. The text is then lower-cased, split on whitespace,
    and punctuation is split off into separate tokens.

    Args:
        text: the string to tokenize.

    Returns:
        List of non-empty token strings.
    '''
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    def emoticon(body):
        # Only accept an emoticon that is not glued to a word character on
        # either side. Without this guard ordinary prose is mangled, e.g.
        # "said:" became "sai<smile>" and "8-day" became "<smile>ay".
        return r"(?<!\w)(?:{})(?!\w)".format(body)

    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = [
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"@\w+", "<user>"),
        (emoticon(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes)), "<smile>"),
        (emoticon(r"{}{}p+".format(eyes, nose)), "<lolface>"),
        (emoticon(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes)), "<sadface>"),
        (emoticon(r"{}{}[\/|l*]".format(eyes, nose)), "<neutralface>"),
        (r"/", " / "),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        # hashtag handling is currently disabled: (r"#\S+", hashtag)
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        (r"([A-Z]){2,}", allcaps),
    ]
    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text, flags=FLAGS)

    words = text.lower().split()
    # Split punctuation into its own tokens, keeping < and > attached so the
    # placeholder tokens stay in one piece
    pieces = itertools.chain.from_iterable(
        re.split(r'([^\w<>])', word) for word in words)
    return [piece for piece in pieces if piece != '']  # drop blank strings

# Sample sentence used to sanity-check the tokenizer; later cells reuse it
# for the tagger input file and the vocabulary check
teststring = "My name is Abhishek. I have no clue. Learning the back-portion that I never cared for. Obama's nephew. @random"
tokenize(teststring)

['my',
 'name',
 'is',
 'abhishek',
 '.',
 'i',
 'have',
 'no',
 'clue',
 '.',
 'learning',
 'the',
 'back',
 '-',
 'portion',
 'that',
 'i',
 'never',
 'cared',
 'for',
 '.',
 'obama',
 "'",
 's',
 'nephew',
 '.',
 '<user>']

In [11]:
'''tokenizer, and part-of-speech tagger from Carnegie Mellon
created by Olutobi Owoputi, Brendan O'Connor, Kevin Gimpel, Nathan Schneider, Chris Dyer, Dipanjan Das, Daniel Mills, 
Jacob Eisenstein, Michael Heilman, Dani Yogatama, Jeffrey Flanigan, and Noah Smith'''
'''RunTagger [options] [ExamplesFilename]
  runs the CMU ARK Twitter tagger on tweets from ExamplesFilename, 
  writing taggings to standard output. Listens on stdin if no input filename.

Options:
  --model <Filename>        Specify model filename. (Else use built-in.)
  --just-tokenize           Only run the tokenizer; no POS tags.
  --quiet                   Quiet: no output
  --input-format <Format>   Default: auto
                            Options: json, text, conll
  --output-format <Format>  Default: automatically decide from input format.
                            Options: pretsv, conll
  --input-field NUM         Default: 1
                            Which tab-separated field contains the input
                            (1-indexed, like unix 'cut')
                            Only for {json, text} input formats.
  --word-clusters <File>    Alternate word clusters file (see FeatureExtractor)
  --no-confidence           Don't output confidence probabilities
  --decoder <Decoder>       Change the decoding algorithm (default: greedy)

Tweet-per-line input formats:
   json: Every input line has a JSON object containing the tweet,
         as per the Streaming API. (The 'text' field is used.)
   text: Every input line has the text for one tweet.
We actually assume input lines are TSV and the tweet data is one field.
(Therefore tab characters are not allowed in tweets.
Twitter's own JSON formats guarantee this;
if you extract the text yourself, you must remove tabs and newlines.)
Tweet-per-line output format is
   pretsv: Prepend the tokenization and tagging as new TSV fields, 
           so the output includes a complete copy of the input.
By default, three TSV fields are prepended:
   Tokenization \t POSTags \t Confidences \t (original data...)
The tokenization and tags are parallel space-separated lists.
The 'conll' format is token-per-line, blank spaces separating tweets.'''

file = open("teststring.txt", "w") 
file.write(teststring) 
file.close() 

#! ./ark-tweet-nlp-0.3.2/runTagger.sh ./ark-tweet-nlp-0.3.2/examples/example_tweets.txt
#! ./ark-tweet-nlp-0.3.2/twokenize.sh --output-format pretsv ./ark-tweet-nlp-0.3.2/examples/casual.txt
test1 = ! ./ark-tweet-nlp-0.3.2/runTagger.sh --output-format conll teststring.txt
test1_list = list([re.split(r'([\t])',x) for x in test1])
test1_list = [[ item for item in word if item != '\t' ] for word in test1_list]
pd_test = pd.DataFrame(test1_list[1:-2], columns = ['word','tag','confidence'] )
pd_test

Unnamed: 0,word,tag,confidence
0,My,D,0.9984
1,name,N,0.9996
2,is,V,0.9973
3,Abhishek,^,0.9628
4,.,",",0.9975
5,I,O,0.998
6,have,V,0.9999
7,no,D,0.9911
8,clue,N,0.9998
9,.,",",0.9985


In [116]:
#Make new column with tokenized, canonicalized text
# NOTE(review): this adds the column to all_data itself; the train/dev/test
# shapes printed further down show only 5 columns, so that split appears to
# have been taken before this cell was run -- re-run in order to confirm.
all_data['text_tokcan'] = all_data['text'].apply(tokenize)
all_data.tail(5)

Unnamed: 0,title,text,subject,date,target,text_tokcan
44893,U.N. political affairs chief to visit North Ko...,UNITED NATIONS (Reuters) - The United Nations ...,worldnews,"December 4, 2017",1,"[united, <allcaps>, nations, <allcaps>, (, reu..."
44894,NAILS IT! MIKE ROWE On Why Trump Won…Hillary S...,This is fantastic! Mike Rowe tells a fan why T...,left-news,"Nov 11, 2016",0,"[this, is, fantastic, !, mike, rowe, tells, a,..."
44895,Coalition of 13 states to challenge Trump on v...,WASHINGTON (Reuters) - New York State’s attorn...,politicsNews,"June 9, 2017",1,"[washington, <allcaps>, (, reuters, ), -, new,..."
44896,VERY FUNNY VIDEO: SARAH PALIN ADVISES TRUMP TO...,Just released hysterical video of Sarah Palin...,politics,"Aug 14, 2015",0,"[just, released, hysterical, video, of, sarah,..."
44897,Indicted Texas mayor arrested for disrupting m...,SAN ANTONIO (Reuters) - The mayor of the south...,politicsNews,"February 17, 2016",1,"[san, <allcaps>, antonio, <allcaps>, (, reuter..."


In [13]:

def build_vocab(corpus, V=None, **kw):
    '''Build a vocabulary from a flat sequence of tokens.

    Args:
        corpus: iterable of token strings, e.g. the list returned by
            tokenize(). Each token is passed through
            utils.canonicalize_word before being counted.
        V: optional vocabulary size, forwarded to vocabulary.Vocabulary as
            its ``size`` argument.
        **kw: extra keyword arguments forwarded to vocabulary.Vocabulary.

    Returns:
        The vocabulary.Vocabulary built from the tokens.

    Raises:
        TypeError: if corpus is a single string or is not iterable.
            (Previously any non-list input failed with an UnboundLocalError
            because ``vocab`` was never assigned.)
    '''
    # A bare string would be iterated character by character, which is never
    # what the caller wants, so reject it explicitly.
    if isinstance(corpus, str) or not hasattr(corpus, '__iter__'):
        raise TypeError(
            "corpus must be an iterable of tokens, got {}".format(type(corpus).__name__))
    token_feed = (utils.canonicalize_word(w) for w in corpus)
    vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)
    print("Vocabulary: {:,} types".format(vocab.size))
    return vocab


#utils.canonicalize_word(teststring.split())
# Smoke-test build_vocab on the tokenized sample sentence and list its words
vocab=build_vocab(tokenize(teststring))
print("{:,} words".format(vocab.size))
print("wordset: ",vocab.ordered_words())



Vocabulary: 26 types
26 words
wordset:  ['<s>', '</s>', '<unk>', '.', 'i', 'my', 'name', 'is', 'abhishek', 'have', 'no', 'clue', 'learning', 'the', 'back', '-', 'portion', 'that', 'never', 'cared', 'for', 'obama', "'", 's', 'nephew', '<user>']


### Train/Dev/Test Split

In [6]:
# train/dev/test split

train_fract = 0.70
dev_fract = 0.15
test_fract = 0.15

# Compare with a tolerance: floating-point rounding can make a valid set of
# fractions sum to a value slightly different from 1.0, which an exact
# `== 1.0` test would wrongly reject.
if np.isclose(train_fract + dev_fract + test_fract, 1.0):
    print('Split fractions add up to 1.0')
else:
    print('SPLIT FRACTIONS DO NOT ADD UP TO 1.0; PLEASE TRY AGAIN.............')

# Compute each boundary once so that adjacent slices share exactly the same
# cut point. all_data was shuffled when it was built, so contiguous slices
# are random subsets.
n_rows = len(all_data)
train_end = int(n_rows * train_fract)
dev_end = int(n_rows * (train_fract + dev_fract))

train_set = all_data[:train_end].reset_index(drop=True)
dev_set = all_data[train_end:dev_end].reset_index(drop=True)
test_set = all_data[dev_end:].reset_index(drop=True)

print('training set: ',train_set.shape)
print('dev set: ',dev_set.shape)
print('test set: ',test_set.shape)

Split fractions add up to 1.0
training set:  (31428, 5)
dev set:  (6735, 5)
test set:  (6735, 5)


In [7]:
# Preview the first five rows of the training split
train_set.head(5)

Unnamed: 0,title,text,subject,date,target
0,TRUMP WINS! Supreme Court Rules On Travel Ban ...,This is gonna be a tough pill for the left to ...,politics,"Jun 26, 2017",0
1,U.S. State Department says 'very concerned' ab...,WASHINGTON (Reuters) - The U.S. State Departme...,worldnews,"October 16, 2017",1
2,Head of Germany's FDP offers Macron 'bitterswe...,BERLIN (Reuters) - A leading candidate to be G...,worldnews,"October 19, 2017",1
3,Pro-Damascus alliance declares Syria offensive...,BEIRUT (Reuters) - A military alliance fightin...,worldnews,"September 16, 2017",1
4,Trump’s Labor Pick Belonged To Group That FOR...,"Andrew Puzder, Trump s nominee for Secretary o...",News,"January 25, 2017",0


### Sandbox

Delete everything below when the notebook is complete.

In [76]:
#df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
#df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))



# Scratch: look at the raw 'text' value of the row at position 1
all_data.iloc[1]['text']



'NEW DELHI (Reuters) - Donald Trump sympathizes with India in its recent escalation of tensions with Pakistan and supports skilled immigration, an adviser said on Friday, portraying the U.S. presidential hopeful as a friend of India and Indian Americans. Trump, a real estate billionaire, has earned a reputation of hostility toward minorities with proposals such as “extreme vetting” of potential immigrants and building a wall along the Mexican border to stop illegal immigration.  The Republican nominee has proposed a ban on immigration from countries where vetting would be difficult, such as nations faced with Islamic militancy. Some Indian officials worry the United States could become more isolationist under Trump, leaving allies like New Delhi without the support it has enjoyed under President Barack Obama against China’s growing regional influence.      Shalabh Kumar, a Chicago-based businessman of Indian origin tasked by the Trump campaign with reaching out to Asian-Americans, said

In [52]:
#re.split(r'([^\w<>])', teststring)
# Scratch: split every raw tagger output line on non-word characters
# (keeping < and >) and flatten the per-line pieces into a single list
list(itertools.chain.from_iterable(
    re.split(r'([^\w<>])', raw_line) for raw_line in test1))

['Detected',
 ' ',
 'text',
 ' ',
 'input',
 ' ',
 'format',
 'My',
 '\t',
 'D',
 '\t',
 '0',
 '.',
 '9984',
 'name',
 '\t',
 'N',
 '\t',
 '0',
 '.',
 '9996',
 'is',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9973',
 'Abhishek',
 '\t',
 '',
 '^',
 '',
 '\t',
 '0',
 '.',
 '9628',
 '',
 '.',
 '',
 '\t',
 '',
 ',',
 '',
 '\t',
 '0',
 '.',
 '9975',
 'I',
 '\t',
 'O',
 '\t',
 '0',
 '.',
 '9980',
 'have',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9999',
 'no',
 '\t',
 'D',
 '\t',
 '0',
 '.',
 '9911',
 'clue',
 '\t',
 'N',
 '\t',
 '0',
 '.',
 '9998',
 '',
 '.',
 '',
 '\t',
 '',
 ',',
 '',
 '\t',
 '0',
 '.',
 '9985',
 'Learning',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9957',
 'the',
 '\t',
 'D',
 '\t',
 '0',
 '.',
 '9960',
 'back',
 '-',
 'portion',
 '\t',
 'N',
 '\t',
 '0',
 '.',
 '8394',
 'that',
 '\t',
 'P',
 '\t',
 '0',
 '.',
 '9530',
 'I',
 '\t',
 'O',
 '\t',
 '0',
 '.',
 '9989',
 'never',
 '\t',
 'R',
 '\t',
 '0',
 '.',
 '9922',
 'cared',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9976',
 'for',
 '\t',
 'P',
 '\t',
 '

In [109]:
# Scratch: break each tab-separated tagger output line into its fields.
# Splitting on the tab directly yields the same fields as splitting with a
# captured tab and then discarding the tab items.
test1_list = [raw_line.split('\t') for raw_line in test1]


[['My', 'D', '0.9984'],
 ['name', 'N', '0.9996'],
 ['is', 'V', '0.9973'],
 ['Abhishek', '^', '0.9628'],
 ['.', ',', '0.9975'],
 ['I', 'O', '0.9980'],
 ['have', 'V', '0.9999'],
 ['no', 'D', '0.9911'],
 ['clue', 'N', '0.9998'],
 ['.', ',', '0.9985'],
 ['Learning', 'V', '0.9957'],
 ['the', 'D', '0.9960'],
 ['back-portion', 'N', '0.8394'],
 ['that', 'P', '0.9530'],
 ['I', 'O', '0.9989'],
 ['never', 'R', '0.9922'],
 ['cared', 'V', '0.9976'],
 ['for', 'P', '0.9806'],
 ['.', ',', '0.9916'],
 ["Obama's", 'Z', '0.8890'],
 ['nephew', 'N', '0.9582'],
 ['.', ',', '0.9976'],
 ['@random', '@', '0.9960']]

In [14]:
#pdtest2 = pd.DataFrame(test1_list[1:-2], columns = ['word','tag','confidence'] )
# Scratch: list the tag column of the tagged sample sentence
pd_test['tag'].tolist()

['D',
 'N',
 'V',
 '^',
 ',',
 'O',
 'V',
 'D',
 'N',
 ',',
 'V',
 'D',
 'N',
 'P',
 'O',
 'R',
 'V',
 'P',
 ',',
 'Z',
 'N',
 ',',
 '@']