# LIAR DETECTION GROUP PROJECT

Run the cell below to import packages.

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML
from sklearn.utils import shuffle
# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz
#from ark-tweet-nlp-0.3.2 import 


### Load data
Loading the "Fake News" dataset from the Information security and object technology (ISOT) Research lab at the University of Victoria School of Engineering.

The ISOT Fake News Dataset is a compilation of several thousands fake news and truthful articles, obtained from different legitimate news sites and sites flagged as unreliable by politifact.com.

In [2]:
# define each downloaded file
FAKE_FILENAME = 'Fake.csv'
TRUE_FILENAME = 'True.csv'

# define the downloaded file path 
DATAPATH = './datasets/ISOT_FakeNews/'

def get_data(filename):
    '''Read CSV file into a pandas dataframe'''
      
    filepath = DATAPATH + filename
    return pd.read_csv(filepath, header=0, sep=',', quotechar='"')


fake_data = get_data(FAKE_FILENAME)
true_data = get_data(TRUE_FILENAME)



# add a label column to the data with the target values
fake_data.loc[:,'target'] = '0'
true_data['target'] = '1'

#append the datasets and shuffle them
all_data = true_data.append(fake_data, ignore_index=True)
all_data = all_data.sample(frac=1).reset_index(drop=True)

all_data.describe()

Unnamed: 0,title,text,subject,date,target
count,44898,44898.0,44898,44898,44898
unique,38729,38646.0,8,2397,2
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",0
freq,14,627.0,11272,182,23481


In [3]:
#fake_data.head(15)
#true_data.head(16)
all_data.head(15)

Unnamed: 0,title,text,subject,date,target
0,Trump to nominate Raytheon lobbyist for Army s...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"July 19, 2017",1
1,Thailand approves $2.2 billion in help for ric...,BANGKOK (Reuters) - Thailand s government on F...,worldnews,"September 1, 2017",1
2,White House confident it will 'prevail' on tra...,ON BOARD AIR FORCE ONE (Reuters) - The White H...,politicsNews,"February 6, 2017",1
3,Republican establishment bails on Alabama cand...,(Reuters) - The national campaign wing for U.S...,politicsNews,"November 10, 2017",1
4,Trump’s Administration Was Just Caught Up In ...,While it can be said that no one is immune fro...,News,"April 5, 2017",0
5,A young Chinese rebel feels the pull of family...,"CHENGDU, China (Reuters) - Her stepfather was ...",worldnews,"October 17, 2017",1
6,Democratic National Committee apologizes to Sa...,(Reuters) - The Democratic National Committee ...,politicsNews,"July 25, 2016",1
7,The Children Of Christian Fundamentalists Are...,Christian Fundamentalism is putting American c...,News,"March 9, 2016",0
8,U.S. urges Venezuela to release U.S. citizen h...,WASHINGTON (Reuters) - The U.S. State Departme...,worldnews,"November 30, 2017",1
9,Factbox: Trump on Twitter (Aug 1) - Stock mark...,The following statements were posted to the ve...,politicsNews,"August 1, 2017",1


In [4]:

print(fake_data.title[0])
print('\n', fake_data.text[0])

 Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing

 Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this

### Cleanup
Check for NA values.

May not want the dataset to contain the 'subject' since all the true news data comes from "Reuters"

In [5]:
all_data.isna().sum()

title      0
text       0
subject    0
date       0
target     0
dtype: int64

In [6]:
all_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
title      44898 non-null object
text       44898 non-null object
subject    44898 non-null object
date       44898 non-null object
target     44898 non-null object
dtypes: object(5)
memory usage: 151.9 MB


### Tokenize and Canonicalize Text

Need to work on Tokenize and Canonicalizing text. Words like "Obama's" need to be corrected. Do we need to mark of sentences within a text? Might want to use some regex code from camron.

In [7]:
"""
Source:  https://gist.github.com/tokestermw/cb87a97113da12acb388
"""

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
   # text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    text = re_sub(r"([A-Z]){2,}", allcaps)

       
    output = text.lower().split()
    output = list(itertools.chain(*[re.split(r'([^\w<>])', x) for x in output]))  #Splits punctuation, keeping < and >
    return [item for item in output if item != '']  #Removes blank strings from list

teststring = "My name is Abhishek. I have no clue. Learning the back-portion that I never cared for. Obama's nephew. @random"
tokenize(teststring)

['my',
 'name',
 'is',
 'abhishek',
 '.',
 'i',
 'have',
 'no',
 'clue',
 '.',
 'learning',
 'the',
 'back',
 '-',
 'portion',
 'that',
 'i',
 'never',
 'cared',
 'for',
 '.',
 'obama',
 "'",
 's',
 'nephew',
 '.',
 '<user>']

In [111]:
'''tokenizer, and part-of-speech tagger from Carnegie Mellon
created by Olutobi Owoputi, Brendan O'Connor, Kevin Gimpel, Nathan Schneider, Chris Dyer, Dipanjan Das, Daniel Mills, 
Jacob Eisenstein, Michael Heilman, Dani Yogatama, Jeffrey Flanigan, and Noah Smith'''
'''RunTagger [options] [ExamplesFilename]
  runs the CMU ARK Twitter tagger on tweets from ExamplesFilename, 
  writing taggings to standard output. Listens on stdin if no input filename.

Options:
  --model <Filename>        Specify model filename. (Else use built-in.)
  --just-tokenize           Only run the tokenizer; no POS tags.
  --quiet                   Quiet: no output
  --input-format <Format>   Default: auto
                            Options: json, text, conll
  --output-format <Format>  Default: automatically decide from input format.
                            Options: pretsv, conll
  --input-field NUM         Default: 1
                            Which tab-separated field contains the input
                            (1-indexed, like unix 'cut')
                            Only for {json, text} input formats.
  --word-clusters <File>    Alternate word clusters file (see FeatureExtractor)
  --no-confidence           Don't output confidence probabilities
  --decoder <Decoder>       Change the decoding algorithm (default: greedy)

Tweet-per-line input formats:
   json: Every input line has a JSON object containing the tweet,
         as per the Streaming API. (The 'text' field is used.)
   text: Every input line has the text for one tweet.
We actually assume input lines are TSV and the tweet data is one field.
(Therefore tab characters are not allowed in tweets.
Twitter's own JSON formats guarantee this;
if you extract the text yourself, you must remove tabs and newlines.)
Tweet-per-line output format is
   pretsv: Prepend the tokenization and tagging as new TSV fields, 
           so the output includes a complete copy of the input.
By default, three TSV fields are prepended:
   Tokenization \t POSTags \t Confidences \t (original data...)
The tokenization and tags are parallel space-separated lists.
The 'conll' format is token-per-line, blank spaces separating tweets.'''

file = open("teststring.txt", "w") 
file.write(teststring) 
file.close() 

#! ./ark-tweet-nlp-0.3.2/runTagger.sh ./ark-tweet-nlp-0.3.2/examples/example_tweets.txt
#! ./ark-tweet-nlp-0.3.2/twokenize.sh --output-format pretsv ./ark-tweet-nlp-0.3.2/examples/casual.txt
test1 = ! ./ark-tweet-nlp-0.3.2/runTagger.sh --output-format conll teststring.txt
test1_list = list([re.split(r'([\t])',x) for x in test1])
test1_list = [[ item for item in word if item != '\t' ] for word in test1_list]
pd_test = pd.DataFrame(test1_list[1:-2], columns = ['word','tag','confidence'] )
pd_test

Unnamed: 0,word,tag,confidence
0,My,D,0.9984
1,name,N,0.9996
2,is,V,0.9973
3,Abhishek,^,0.9628
4,.,",",0.9975
5,I,O,0.998
6,have,V,0.9999
7,no,D,0.9911
8,clue,N,0.9998
9,.,",",0.9985


In [8]:
#Make new column with tokenized, canonicalized text
all_data['text_tokcan'] = all_data['text'].apply(tokenize)
all_data.tail(5)

Unnamed: 0,title,text,subject,date,target,text_tokcan
44893,Trump's tax cut won't be the biggest in U.S. h...,WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,"November 2, 2017",1,"[washington, <allcaps>, (, reuters, ), -, pres..."
44894,U.S. House ethics panel investigating allegati...,WASHINGTON (Reuters) - The U.S. House of Repre...,politicsNews,"November 21, 2017",1,"[washington, <allcaps>, (, reuters, ), -, the,..."
44895,"U.S., EU set meeting on airline security, elec...",WASHINGTON/BRUSSELS (Reuters) - U.S. and Europ...,politicsNews,"May 12, 2017",1,"[washington, <allcaps>, /, brussels, <allcaps>..."
44896,German coalition talks: 'Road to Jamaica is long',BERLIN (Reuters) - German politicians seeking ...,worldnews,"October 18, 2017",1,"[berlin, <allcaps>, (, reuters, ), -, german, ..."
44897,SAY WHAT? #BlackLivesMatter TEXTBOOKS TO BE US...,When will American citizens stop being afraid ...,politics,"Aug 24, 2015",0,"[when, will, american, citizens, stop, being, ..."


In [9]:

def build_vocab(corpus, V=None, **kw):
    if isinstance(corpus, list):
        token_feed = (utils.canonicalize_word(w) for w in corpus)
        vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)
    print("Vocabulary: {:,} types".format(vocab.size))
    return vocab


#utils.canonicalize_word(teststring.split())
vocab=build_vocab(tokenize(teststring))
print("{:,} words".format(vocab.size))
print("wordset: ",vocab.ordered_words())



Vocabulary: 26 types
26 words
wordset:  ['<s>', '</s>', '<unk>', '.', 'i', 'my', 'name', 'is', 'abhishek', 'have', 'no', 'clue', 'learning', 'the', 'back', '-', 'portion', 'that', 'never', 'cared', 'for', 'obama', "'", 's', 'nephew', '<user>']


### Train / Dev / Test Split

In [10]:
#train/dev/train split
#train_dev_split = 0.8

train_fract = 0.70
dev_fract = 0.15
test_fract = 0.15

if (train_fract+dev_fract+test_fract) == 1.0:
    print('Split fractions add up to 1.0')
else:
    print('SPLIT FRACTIONS DO NOT ADD UP TO 1.0; PLEASE TRY AGAIN.............')

#train_data = all_data[:int(len(all_data)*train_dev_split)].reset_index(drop=True)
#dev_data = all_data[int(len(all_data)*train_dev_split):].reset_index(drop=True)

train_set = all_data[ :int(len(all_data)*train_fract)].reset_index(drop=True)
dev_set = all_data[int(len(all_data)*(train_fract)) : int(len(all_data)*(train_fract+dev_fract))].reset_index(drop=True)
test_set = all_data[int(len(all_data)*(train_fract+dev_fract)) : ].reset_index(drop=True)

print('training set: ',train_set.shape)
print('dev set: ',dev_set.shape)
print('test set: ',test_set.shape)

Split fractions add up to 1.0
training set:  (31428, 6)
dev set:  (6735, 6)
test set:  (6735, 6)


In [11]:
train_set.head(5)

Unnamed: 0,title,text,subject,date,target,text_tokcan
0,Trump to nominate Raytheon lobbyist for Army s...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"July 19, 2017",1,"[washington, <allcaps>, (, reuters, ), -, u, ...."
1,Thailand approves $2.2 billion in help for ric...,BANGKOK (Reuters) - Thailand s government on F...,worldnews,"September 1, 2017",1,"[bangkok, <allcaps>, (, reuters, ), -, thailan..."
2,White House confident it will 'prevail' on tra...,ON BOARD AIR FORCE ONE (Reuters) - The White H...,politicsNews,"February 6, 2017",1,"[on, <allcaps>, board, <allcaps>, air, <allcap..."
3,Republican establishment bails on Alabama cand...,(Reuters) - The national campaign wing for U.S...,politicsNews,"November 10, 2017",1,"[(, reuters, ), -, the, national, campaign, wi..."
4,Trump’s Administration Was Just Caught Up In ...,While it can be said that no one is immune fro...,News,"April 5, 2017",0,"[while, it, can, be, said, that, no, one, is, ..."


In [12]:
dev_set.head(5)

Unnamed: 0,title,text,subject,date,target,text_tokcan
0,OBAMA TRIES TO CONDEMN TRUMP SUPPORTERS But Ge...,How dare Obama the worst president and most di...,politics,"Mar 15, 2016",0,"[how, dare, obama, the, worst, president, and,..."
1,Israeli ambassador backs Trump pledge to move ...,WASHINGTON (Reuters) - Israel’s ambassador to ...,politicsNews,"December 21, 2016",1,"[washington, <allcaps>, (, reuters, ), -, isra..."
2,BREAKING NEWS: RAND PAUL Has “Suspended” His C...,Sen. Rand Paul of Kentucky announced Wednesday...,politics,"Feb 3, 2016",0,"[sen, ., rand, paul, of, kentucky, announced, ..."
3,Texas Newspapers Humiliate Ted Cruz By Endors...,Ted Cruz is so hated that even his home state ...,News,"February 15, 2016",0,"[ted, cruz, is, so, hated, that, even, his, ho..."
4,NEW DOCUMENTS: Hillary And State Department Ai...,Hillary Clinton and her aides must have felt l...,Government News,"Aug 19, 2016",0,"[hillary, clinton, and, her, aides, must, have..."


In [13]:
test_set.head(5)

Unnamed: 0,title,text,subject,date,target,text_tokcan
0,"Dad Of Murdered Reporter Hits Trump, GOP BEAU...",The father of a Virginia reporter who was shot...,News,"August 27, 2016",0,"[the, father, of, a, virginia, reporter, who, ..."
1,Vote recount effort races forward despite Trum...,WASHINGTON (Reuters) - Donald Trump’s transiti...,politicsNews,"November 28, 2016",1,"[washington, <allcaps>, (, reuters, ), -, dona..."
2,Boiler Room EP #119 – Zombie Disneyland & The ...,Tune in to the Alternate Current Radio Network...,US_News,"July 29, 2017",0,"[tune, in, to, the, alternate, current, radio,..."
3,Trump National Security Pick Monica Crowley’s...,Conservative columnist Monica Crowley is set f...,News,"January 10, 2017",0,"[conservative, columnist, monica, crowley, is,..."
4,Stock futures dip after North Korea nuclear test,NEW YORK (Reuters) - U.S. equity index futures...,worldnews,"September 3, 2017",1,"[new, <allcaps>, york, <allcaps>, (, reuters, ..."


## Baseline Model: Naive Bayes Classifier

### Classify full text

In [76]:
##
from sklearn.naive_bayes import BernoulliNB
# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
#from sklearn.grid_search import GridSearchCV   # THIS HAS BEEN DEPRECATED
from sklearn.model_selection import GridSearchCV
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report


train_data, train_labels = train_set.text.values, train_set.target.values
dev_data, dev_labels = dev_set.text.values, dev_set.target.values

train_labels = train_labels.astype(int)
dev_labels = dev_labels.astype(int)

print('train_data shape:', train_data.shape)
#print(train_data[0].shape)
print(train_data[:1])
print('\ntrain_labels shape:', train_labels.shape)
print(train_labels)
print(type(train_labels[0]))
#train_labels.head()
#dev_data.head()
#dev_labels.head()


train_data shape: (31428,)
['WASHINGTON (Reuters) - U.S. President Donald Trump intends to nominate a Raytheon Co (RTN.N) lobbyist, Mark Esper, for the position of secretary of the Army, the White House said on Wednesday. The position has been challenging for Trump to fill. Two previous nominees withdrew their names from consideration. Before Esper joined U.S. missile maker Raytheon in 2010 as vice president government relations he held posts at industry advocacy groups like the Aerospace Industries Association and the U.S. Chamber of Commerce. Esper graduated from the United States Military Academy at West Point, retired from the U.S. Army as a lieutenant colonel and is a veteran of the Gulf War, according to a Raytheon memo announcing his hiring. ']

train_labels shape: (31428,)
[1 1 1 ... 1 0 0]
<class 'numpy.int64'>


In [77]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_data)
X

<31428x104692 sparse matrix of type '<class 'numpy.int64'>'
	with 6562980 stored elements in Compressed Sparse Row format>

In [78]:
#print(X[0])

In [79]:
print('X.shape:', X.shape) # (). There are x documents (rows) in the corpus, with y features (unique words = vocabulary)
print('Vocabulary size (number of features or columns):', X.shape[1])  # 
print('Non-zero elements in matrix (X.nnz):', X.nnz)   # This indicates that there are z non-zero elements in the matrix.
print('Average number of non-zero features per example (per document): %.3f' %(X.nnz/X.shape[0]))  # non-zero elements in matrix / documents = xxx
print('Fraction of non-zero elements in matrix: %.4f' %( X.nnz/(X.shape[0] * X.shape[1])) )   # Fraction of entries in the matrix that are non-zero = X.nnz/(rows*columns) = 0.xxx 


# What are the 0th and last feature strings (in alphabetical order)?
print('0th feature string:', vectorizer.get_feature_names()[0])   # 
print('last feature string:', vectorizer.get_feature_names()[X.shape[1]-1])    # 

X.shape: (31428, 104692)
Vocabulary size (number of features or columns): 104692
Non-zero elements in matrix (X.nnz): 6562980
Average number of non-zero features per example (per document): 208.826
Fraction of non-zero elements in matrix: 0.0020
0th feature string: 00
last feature string: zzzzaaaacccchhh


In [80]:
# Using the standard CountVectorizer, what fraction of the words in the dev data are missing from the vocabulary? 
vectorizer_dev = CountVectorizer()
X_dev = vectorizer_dev.fit_transform(dev_data)  # Independently build a vocabulary using dev_data.
print('X_dev.shape:', X_dev.shape)
print('Vocabulary using train data:', X.shape[1])  # 
print('Vocabulary using dev data:', X_dev.shape[1])                # 

# Feed dev_data into the vectorizer fit using training data.
X_dev_transformed = vectorizer.transform(dev_data)
print('X_dev_transformed shape:', X_dev_transformed.shape)
#print('X_dev_transformed.shape', X_dev_transformed.shape)  # (676, 26,879)  EXPECT .shape[1] equal to original number of features
#print('non-zero indices in X_dev_transformed:', X_dev_transformed.nonzero())  # could also use this to check which features missing...

''' This is way too slow; use set intersection instead!!
# Look at each feature (vocabulary word) in X_dev and see if it is a feature in X.
count = 0
for i in range(X_dev.shape[1]):
    if vectorizer_dev.get_feature_names()[i] in vectorizer.get_feature_names():
        count += 1
print('Count of words (features) in X_dev also in X:', count)   
print('Fraction of words in dev data missing from training vocabulary: %.3f' %((X_dev.shape[1] - count)/X_dev.shape[1]) )
count of words (features) in X_dev also in X: 12219
Fraction of words in dev data missing from training vocabulary: 0.248
'''

set1 = set(vectorizer_dev.get_feature_names())
set2 = set(vectorizer.get_feature_names())
print('Count of words (features) in X_dev also in X:', len(set1.intersection(set2)))
print('Fraction of words in dev data missing from training vocabulary: %.3f' %((X_dev.shape[1] - len(set1.intersection(set2)))/X_dev.shape[1]) )

X_dev.shape: (6735, 53996)
Vocabulary using train data: 104692
Vocabulary using dev data: 53996
X_dev_transformed shape: (6735, 104692)
Count of words (features) in X_dev also in X: 44685
Fraction of words in dev data missing from training vocabulary: 0.172


In [81]:
# BernoulliNB
print('BernoulliNB')
alpha = 1
clf = BernoulliNB(alpha=alpha)
clf.fit(X, train_labels)

print('accuracy: %3.2f' %clf.score(X_dev_transformed, dev_labels))

BernoulliNB
accuracy: 0.94


In [None]:
'''
# Fit a Multinomial Naive Bayes model and find the optimal value for alpha

cv_params = {'alpha': [1E-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
mnb = GridSearchCV(estimator=MultinomialNB(), param_grid=cv_params, scoring='f1_weighted', cv=10, n_jobs=-1)
mnb.fit(X, train_labels)  

print('\nMultinomial Naive Bayes best GridSearchCV results:')
print('Best params:', mnb.best_params_)
print('Best score: %.3f' %(mnb.best_score_))
#print('Best estimator: \n', mnb.best_estimator_)

mnb_dev_predicted_labels = mnb.predict(X_dev_transformed)  # "predict" and report accuracy using dev set
print('f1 score of dev predicted labels using Multinominal Naive Bayes: %.3f' %(metrics.f1_score(dev_labels, mnb_dev_predicted_labels, average='weighted')))
#print('classification report of dev predicted labels: \n', classification_report(dev_labels, mnb_dev_predicted_labels))
print()
'''

In [72]:
'''
Convert a collection of raw documents to a matrix of TF-IDF features.
Equivalent to CountVectorizer followed by TfidfTransformer.

In a large text corpus, some words will be very present (e.g. “the”, “a”, “is” in English) hence carrying very little 
meaningful information about the actual contents of the document. If we were to feed the direct count data directly to 
a classifier those very frequent terms would shadow the frequencies of rarer yet more interesting terms.

In order to re-weight the count features into floating point values suitable for usage by a classifier it is very 
common to use the tf–idf transform.

Tf means term-frequency while tf–idf means term-frequency times inverse document-frequency: 
\text{tf-idf(t,d)}=\text{tf(t,d)} \times \text{idf(t)}.
'''

t_vectorizer = TfidfVectorizer()
t_X = t_vectorizer.fit_transform(train_data)   
#print(t_X.shape)
t_X_dev = t_vectorizer.transform(dev_data)
#print(t_X_dev.shape)


# BernoulliNB
print('BernoulliNB with TfidfVectorizer')
alpha = 1
t_clf = BernoulliNB(alpha=alpha)
t_clf.fit(t_X, train_labels)

print('accuracy: %3.2f' %t_clf.score(t_X_dev, dev_labels))

t_dev_predicted_labels = t_clf.predict(t_X_dev)  # "predict" and report accuracy using dev set
#print(t_dev_predicted_labels.shape)

print('\nf1 score of dev predicted labels:', metrics.f1_score(dev_labels, t_dev_predicted_labels, average='weighted'))
print('classification report of dev predicted labels: \n', classification_report(dev_labels, t_dev_predicted_labels))
print()

BernoulliNB with TfidfVectorizer
accuracy: 0.94

f1 score of dev predicted labels: 0.9419881828992114
classification report of dev predicted labels: 
               precision    recall  f1-score   support

           0       0.97      0.92      0.94      3552
           1       0.92      0.96      0.94      3183

   micro avg       0.94      0.94      0.94      6735
   macro avg       0.94      0.94      0.94      6735
weighted avg       0.94      0.94      0.94      6735




NOTE: Same results (accuracy=0.94) for both the default CountVectorizer and TfidfVectorizer.  Using full text means all TRUE news contains the word "Reuters", which is an unfair advantage.  Will try to remove those and run again, expecting lower accuracy.  
Should also account for text starting with: "'The following statements\xa0were posted to the verified Twitter accounts of U.S. President Donald Trump, @realDonaldTrump and @POTUS.  The opinions expressed are his own.\xa0Reuters has not edited the statements or confirmed their accuracy."

### Repeat Naive Bayes on text field after removing first chunk of text, including "Reuters"

In [20]:
true_data.head()

Unnamed: 0,title,text,subject,date,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [25]:
true_data.iloc[0,1][22:]

' The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support education, scientific resear

In [32]:
true_data.iloc[:,1]

0        WASHINGTON (Reuters) - The head of a conservat...
1        WASHINGTON (Reuters) - Transgender people will...
2        WASHINGTON (Reuters) - The special counsel inv...
3        WASHINGTON (Reuters) - Trump campaign adviser ...
4        SEATTLE/WASHINGTON (Reuters) - President Donal...
5        WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...
6        WEST PALM BEACH, Fla (Reuters) - President Don...
7        The following statements were posted to the ve...
8        The following statements were posted to the ve...
9        WASHINGTON (Reuters) - Alabama Secretary of St...
10       (Reuters) - Alabama officials on Thursday cert...
11       NEW YORK/WASHINGTON (Reuters) - The new U.S. t...
12       The following statements were posted to the ve...
13       The following statements were posted to the ve...
14        (In Dec. 25 story, in second paragraph, corre...
15       (Reuters) - A lottery drawing to settle a tied...
16       WASHINGTON (Reuters) - A Georgian-American bus.

In [29]:
true_data2 = true_data.iloc[:,1][22:]

In [30]:
true_data2.head()

22    (Reuters) - A U.S. appeals court on Friday sai...
23    WASHINGTON (Reuters) - A federal appeals court...
24    LIMA (Reuters) - Peru’s President Pedro Pablo ...
25    WASHINGTON (Reuters) - U.S. President Donald T...
26    WASHINGTON (Reuters) - U.S. financial regulato...
Name: text, dtype: object

#### NOTE: Didn't do what was intended.  Will need to step through each line one at a time, look for "Reuters" and "The following statements..." and remove these...

In [None]:
#append the datasets and shuffle them
all_data2 = true_data2.append(fake_data, ignore_index=True)
all_data2 = all_data2.sample(frac=1).reset_index(drop=True)

all_data2.describe()

### Run Naive Bayes on the Title field

In [90]:
fake_data.title

0         Donald Trump Sends Out Embarrassing New Year’...
1         Drunk Bragging Trump Staffer Started Russian ...
2         Sheriff David Clarke Becomes An Internet Joke...
3         Trump Is So Obsessed He Even Has Obama’s Name...
4         Pope Francis Just Called Out Donald Trump Dur...
5         Racist Alabama Cops Brutalize Black Boy While...
6         Fresh Off The Golf Course, Trump Lashes Out A...
7         Trump Said Some INSANELY Racist Stuff Inside ...
8         Former CIA Director Slams Trump Over UN Bully...
9         WATCH: Brand-New Pro-Trump Ad Features So Muc...
10        Papa John’s Founder Retires, Figures Out Raci...
11        WATCH: Paul Ryan Just Told Us He Doesn’t Care...
12        Bad News For Trump — Mitch McConnell Says No ...
13        WATCH: Lindsey Graham Trashes Media For Portr...
14        Heiress To Disney Empire Knows GOP Scammed Us...
15        Tone Deaf Trump: Congrats Rep. Scalise On Los...
16        The Internet Brutally Mocks Disney’s New Trum.

In [82]:
train_data, train_labels = train_set.title.values, train_set.target.values
dev_data, dev_labels = dev_set.title.values, dev_set.target.values

train_labels = train_labels.astype(int)
dev_labels = dev_labels.astype(int)

#train_data.head()
print('train_data shape:', train_data.shape)
#print(train_data[0].shape)
print(train_data[:1])
print('\ntrain_labels shape:', train_labels.shape)
print(train_labels)


train_data shape: (31428,)
['Trump to nominate Raytheon lobbyist for Army secretary']

train_labels shape: (31428,)
[1 1 1 ... 1 0 0]


In [83]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_data)


print('X.shape:', X.shape) # (). There are x documents (rows) in the corpus, with y features (unique words = vocabulary)
print('Vocabulary size (number of features or columns):', X.shape[1])  # 
print('Non-zero elements in matrix (X.nnz):', X.nnz)   # This indicates that there are z non-zero elements in the matrix.
print('Average number of non-zero features per example (per document): %.3f' %(X.nnz/X.shape[0]))  # non-zero elements in matrix / documents = xxx
print('Fraction of non-zero elements in matrix: %.4f' %( X.nnz/(X.shape[0] * X.shape[1])) )   # Fraction of entries in the matrix that are non-zero = X.nnz/(rows*columns) = 0.xxx 


X.shape: (31428, 18850)
Vocabulary size (number of features or columns): 18850
Non-zero elements in matrix (X.nnz): 382587
Average number of non-zero features per example (per document): 12.173
Fraction of non-zero elements in matrix: 0.0006


In [84]:
# Using the standard CountVectorizer, what fraction of the words in the dev data are missing from the vocabulary? 
vectorizer_dev = CountVectorizer()
X_dev = vectorizer_dev.fit_transform(dev_data)  # Independently build a vocabulary using dev_data.
print('X_dev.shape:', X_dev.shape)
print('Vocabulary using train data:', X.shape[1])  # 
print('Vocabulary using dev data:', X_dev.shape[1])                # 

# Feed dev_data into the vectorizer fit using training data.
X_dev_transformed = vectorizer.transform(dev_data)
print('X_dev_transformed shape:', X_dev_transformed.shape)
#print('X_dev_transformed.shape', X_dev_transformed.shape)  # ()  EXPECT .shape[1] equal to original number of features
#print('non-zero indices in X_dev_transformed:', X_dev_transformed.nonzero())  # could also use this to check which features missing...

''' This is way too slow; use set intersection instead!!
# Look at each feature (vocabulary word) in X_dev and see if it is a feature in X.
count = 0
for i in range(X_dev.shape[1]):
    if vectorizer_dev.get_feature_names()[i] in vectorizer.get_feature_names():
        count += 1
print('Count of words (features) in X_dev also in X:', count)   
print('Fraction of words in dev data missing from training vocabulary: %.3f' %((X_dev.shape[1] - count)/X_dev.shape[1]) )
count of words (features) in X_dev also in X: 12219
Fraction of words in dev data missing from training vocabulary: 0.248
'''

set1 = set(vectorizer_dev.get_feature_names())
set2 = set(vectorizer.get_feature_names())
print('Count of words (features) in X_dev also in X:', len(set1.intersection(set2)))
print('Fraction of words in dev data missing from training vocabulary: %.3f' %((X_dev.shape[1] - len(set1.intersection(set2)))/X_dev.shape[1]) )

X_dev.shape: (6735, 10434)
Vocabulary using train data: 18850
Vocabulary using dev data: 10434
X_dev_transformed shape: (6735, 18850)
Count of words (features) in X_dev also in X: 9303
Fraction of words in dev data missing from training vocabulary: 0.108


In [85]:
# BernoulliNB
print('BernoulliNB')
alpha = 1
clf = BernoulliNB(alpha=alpha)
clf.fit(X, train_labels)

print('accuracy: %3.2f' %clf.score(X_dev_transformed, dev_labels))

BernoulliNB
accuracy: 0.96


In [86]:
#print('title, target label\n', train_set.title, train_set.target)
print('title, target label\n', train_set.title[4], train_set.target[4])

title, target label
  Trump’s Administration Was Just Caught Up In Prostitution Scandal; Here’s How It Went Down (DETAILS) 0


In [87]:
type(train_set.target)

pandas.core.series.Series

In [88]:
type(train_labels)

numpy.ndarray

### Sandbox

delete eveything below when notebook complete

In [76]:
#df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
#df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))



all_data.iloc[1]['text']



'NEW DELHI (Reuters) - Donald Trump sympathizes with India in its recent escalation of tensions with Pakistan and supports skilled immigration, an adviser said on Friday, portraying the U.S. presidential hopeful as a friend of India and Indian Americans. Trump, a real estate billionaire, has earned a reputation of hostility toward minorities with proposals such as “extreme vetting” of potential immigrants and building a wall along the Mexican border to stop illegal immigration.  The Republican nominee has proposed a ban on immigration from countries where vetting would be difficult, such as nations faced with Islamic militancy. Some Indian officials worry the United States could become more isolationist under Trump, leaving allies like New Delhi without the support it has enjoyed under President Barack Obama against China’s growing regional influence.      Shalabh Kumar, a Chicago-based businessman of Indian origin tasked by the Trump campaign with reaching out to Asian-Americans, said

In [52]:
#re.split(r'([^\w<>])', teststring)
list(itertools.chain(*[re.split(r'([^\w<>])', x) for x in test1]))

['Detected',
 ' ',
 'text',
 ' ',
 'input',
 ' ',
 'format',
 'My',
 '\t',
 'D',
 '\t',
 '0',
 '.',
 '9984',
 'name',
 '\t',
 'N',
 '\t',
 '0',
 '.',
 '9996',
 'is',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9973',
 'Abhishek',
 '\t',
 '',
 '^',
 '',
 '\t',
 '0',
 '.',
 '9628',
 '',
 '.',
 '',
 '\t',
 '',
 ',',
 '',
 '\t',
 '0',
 '.',
 '9975',
 'I',
 '\t',
 'O',
 '\t',
 '0',
 '.',
 '9980',
 'have',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9999',
 'no',
 '\t',
 'D',
 '\t',
 '0',
 '.',
 '9911',
 'clue',
 '\t',
 'N',
 '\t',
 '0',
 '.',
 '9998',
 '',
 '.',
 '',
 '\t',
 '',
 ',',
 '',
 '\t',
 '0',
 '.',
 '9985',
 'Learning',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9957',
 'the',
 '\t',
 'D',
 '\t',
 '0',
 '.',
 '9960',
 'back',
 '-',
 'portion',
 '\t',
 'N',
 '\t',
 '0',
 '.',
 '8394',
 'that',
 '\t',
 'P',
 '\t',
 '0',
 '.',
 '9530',
 'I',
 '\t',
 'O',
 '\t',
 '0',
 '.',
 '9989',
 'never',
 '\t',
 'R',
 '\t',
 '0',
 '.',
 '9922',
 'cared',
 '\t',
 'V',
 '\t',
 '0',
 '.',
 '9976',
 'for',
 '\t',
 'P',
 '\t',
 '

In [109]:
test1_list = list([re.split(r'([\t])',x) for x in test1])
test1_list = [[ item for item in word if item != '\t' ] for word in test1_list]


[['My', 'D', '0.9984'],
 ['name', 'N', '0.9996'],
 ['is', 'V', '0.9973'],
 ['Abhishek', '^', '0.9628'],
 ['.', ',', '0.9975'],
 ['I', 'O', '0.9980'],
 ['have', 'V', '0.9999'],
 ['no', 'D', '0.9911'],
 ['clue', 'N', '0.9998'],
 ['.', ',', '0.9985'],
 ['Learning', 'V', '0.9957'],
 ['the', 'D', '0.9960'],
 ['back-portion', 'N', '0.8394'],
 ['that', 'P', '0.9530'],
 ['I', 'O', '0.9989'],
 ['never', 'R', '0.9922'],
 ['cared', 'V', '0.9976'],
 ['for', 'P', '0.9806'],
 ['.', ',', '0.9916'],
 ["Obama's", 'Z', '0.8890'],
 ['nephew', 'N', '0.9582'],
 ['.', ',', '0.9976'],
 ['@random', '@', '0.9960']]

In [110]:
pdtest2 = pd.DataFrame(test1_list[1:-2], columns = ['word','tag','confidence'] )
pdtest2

Unnamed: 0,word,tag,confidence
0,My,D,0.9984
1,name,N,0.9996
2,is,V,0.9973
3,Abhishek,^,0.9628
4,.,",",0.9975
5,I,O,0.998
6,have,V,0.9999
7,no,D,0.9911
8,clue,N,0.9998
9,.,",",0.9985
