In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords

import scipy.optimize
import random
from collections import defaultdict
import string
from nltk.stem.porter import *
from sklearn import linear_model
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import requests

from bs4 import BeautifulSoup
import time
from random import randint

import json
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn import tree


In [2]:
data_full = pd.DataFrame.from_csv('processed_data.csv',sep=",")
data, data_test = train_test_split(data_full, test_size=0.2)
data_full.head()

Unnamed: 0,label,statement,score
1,half-true,When did the decline of coal start? It started...,0
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",1
3,false,Health care reform legislation is likely to ma...,-1
4,half-true,The economic turnaround started at the end of ...,0
5,true,The Chicago Bears have had more starting quart...,1


## Unigrams and Bigrams

In [6]:
stopwordlist = stopwords.words("english")
punctuation = set(string.punctuation)

# split training data to 3 classes
d_true = data[data.score==1]
d_false = data[data.score==-1]
d_amb = data[data.score==0]

unigrams = dict()
unigrams['true'] = defaultdict(int)
unigrams['false'] = defaultdict(int)
unigrams['amb'] = defaultdict(int)
unigrams['all'] = defaultdict(int)
bigrams = dict()
bigrams['true'] = defaultdict(int)
bigrams['false'] = defaultdict(int)
bigrams['amb'] = defaultdict(int)
bigrams['all'] = defaultdict(int)

# finding frequent unigrams and bigrams for the three classes
for d1,d2 in zip(data.score, data.statement):
    r = ''.join([c for c in d2.lower() if not c in punctuation])
    w1 = None       
    for w in r.split():
        if w not in stopwordlist :
            if d1 == 1 :
                unigrams['true'][w] += 1
                if w1 :
                    bigrams['true'][w1+' '+w] += 1
            if d1 == -1 :
                unigrams['false'][w] += 1
                if w1 :
                    bigrams['false'][w1+' '+w] += 1
            if d1 == 0 :
                unigrams['amb'][w] += 1
                if w1 :
                    bigrams['amb'][w1+' '+w] += 1    
            w1 = w
            
def freq_words(dict1, dict2, n=1000) : ## n = Number of unigrams and bigrams (hyperparameter)
    words = [(dict1[w], w) for w in dict1 if w.isdigit() == False]
    words.sort(reverse = True)
    words = [x[1] for x in words[:n]]
    bi = [(dict2[w], w) for w in dict2]
    bi.sort(reverse = True)
    bi = [x[1] for x in bi[:n]]
    comb = words + bi
    comb_ID = dict(zip(comb, range(len( comb))))
    return comb,comb_ID, words

# combination of bigrams and unigrams
comb_true, comb_true_id, words_true = freq_words( unigrams['true'], bigrams['true'] )
comb_false, comb_false_id, words_false = freq_words( unigrams['false'], bigrams['false'] )
comb_amb, comb_amb_id, words_amb = freq_words( unigrams['amb'], bigrams['amb'] )

# combination of bigrams and unigrams from all three classes
comb_full = list( set( comb_true + comb_false + comb_amb ) )
comb_full_id = dict(zip(comb_full, range(len( comb_full ))))

# combination of frequent unigrams from the three classes
words_full = list( set( words_true + words_false + words_amb ))
words_full_id = dict(zip(words_full, range(len( words_full ))))            

def feature(datum, comb, comb_id):
    feat = [0]*len(comb)
    temp = []
    r = ''.join([c for c in datum.lower() if not c in punctuation])
    w1 = None
    for w in r.split():
        temp.append(w)
        if w in comb:
            feat[comb_id[w]] += 1
        if w1 :
            bi = str( w1+' '+w )
            if bi in comb :
                feat[comb_id[bi]] += 1
        w1 = w
    feat.append(1)
    return feat

# features, using entire datasets (to get + and- samples)
# using unigrams and bigrams of true, false and amb cases respectively
X_ngram_true = np.array( [feature(d, comb_true, comb_true_id ) for d in data.statement ]  )
X_ngram_false = np.array( [feature(d, comb_false, comb_false_id ) for d in data.statement ] )
X_ngram_amb = np.array( [feature(d, comb_amb, comb_amb_id ) for d in data.statement ] )
X_ngram_full = np.array( [feature(d, comb_full, comb_full_id ) for d in data.statement ] )

X_ngram_true_test = np.array( [feature(d, comb_true, comb_true_id ) for d in data_test.statement ] )
X_ngram_false_test = np.array( [feature(d, comb_false, comb_false_id ) for d in data_test.statement ] )
X_ngram_amb_test = np.array( [feature(d, comb_amb, comb_amb_id ) for d in data_test.statement ] )

y_ngram_true = np.array([d==1 for d in data['score']]) 
y_ngram_false = np.array([d==-1 for d in data['score']])
y_ngram_amb = np.array([d==0 for d in data['score']])
y_ngram_full = np.array( [d for d in data['score']] )

In [4]:
def accu2(y_test,pred1) :
    accu = 100*np.sum(pred1 == y_test)/len(pred1)
    f_f = 100*np.sum( (pred1 == -1) & (y_test == -1))/np.sum( y_test == -1) # pred fake, label fake
    f_t = 100*np.sum( (pred1 == 1) & (y_test == -1))/np.sum( y_test == -1)
    t_t = 100*np.sum( (pred1 == 1) & (y_test == 1))/np.sum( y_test == 1)
    t_f = 100*np.sum( (pred1 == -1) & (y_test == 1))/np.sum( y_test == 1)
    ber = 1 - 0.5*( (t_t/(t_t+t_f)) + (f_f/(f_f+f_t)) )
    print('Accuracy for testing = ', accu,'%')
    print('BER = ', ber)
    return accu, ber

## Logistic regression using 500 unigram/bigrams alone

In [5]:
def log_reg(X_true, y_true, lam = 1.0) :
    clf_true = LogisticRegression()
    clf_true.fit(X_true, y_true)
    theta_true = clf_true.coef_
    pred_true = clf_true.predict(X_true)
    pred_true = np.array([x >0.5 for x in pred_true])
    accu = np.sum( (y_true==pred_true) )/len(data) 
    return clf_true, pred_true, accu 

clf_true, pred_true, accu = log_reg(X_ngram_true, y_ngram_true ) 
# print('accuracy on training set for true class = ', accu )
clf_false, pred_false, accu = log_reg(X_ngram_false, y_ngram_false ) 
# print('accuracy on training set for false class = ', accu )
clf_amb, pred_amb, accu = log_reg(X_ngram_amb, y_ngram_amb ) 
# print('accuracy on training set for amb class = ', accu )

# X_ngram_true_test = [feature(d, comb_true, comb_true_id ) for d in data_test.statement ]
# X_ngram_false_test = [feature(d, comb_false, comb_false_id ) for d in data_test.statement ]
# X_ngram_amb_test = [feature(d, comb_amb, comb_amb_id ) for d in data_test.statement ]

y_ngram_true_test = [d==1 for d in data_test['score']]
y_ngram_false_test = [d==-1 for d in data_test['score']]
y_ngram_amb_test = [d==0 for d in data_test['score']]

pred_true_test = list( clf_true.predict(X_ngram_true_test) )
pred_false_test = list( clf_false.predict(X_ngram_false_test) )
pred_amb_test = list( clf_amb.predict(X_ngram_amb_test) )

pred_tests = [pred_false_test,pred_amb_test, pred_true_test]
y_test = np.array(data_test['score'])
pred_test = np.argmax(pred_tests,axis=0)-1
def accu(y_test,pred1) :
    accu = 100*np.sum(pred1 == y_test)/len(pred1)
    f_f = 100*np.sum( (pred1 == -1) & (y_test == -1))/np.sum( y_test == -1) # pred fake, label fake
    f_t = 100*np.sum( (pred1 == 1) & (y_test == -1))/np.sum( y_test == -1)
    t_t = 100*np.sum( (pred1 == 1) & (y_test == 1))/np.sum( y_test == 1)
    t_f = 100*np.sum( (pred1 == -1) & (y_test == 1))/np.sum( y_test == 1)
    ber = 1 - 0.5*( (t_t/(t_t+t_f)) + (f_f/(f_f+f_t)) )
    print('Accuracy for testing = ', accu,'%')
    print('Fake news predicted as Fake = ', f_f,"%")
    print('Fake news mispredicted as True = ', f_t,"%")
    print('True news predicted as True = ', t_t,"%")
    print('True news mispredicted as Fake = ', t_f,"%")
    print('BER = ', ber)

accu(y_test,pred_test) 


NameError: name 'y_ngram_true' is not defined

In [161]:

pred_true_test = list( clf_true.predict(X_ngram_true_test) )
pred_false_test = list( clf_false.predict(X_ngram_false_test) )
pred_amb_test = list( clf_amb.predict(X_ngram_amb_test) )

pred_tests = [pred_false_test,pred_amb_test, pred_true_test]
y_test = np.array(data_test['score'])
pred_test = np.argmax(pred_tests,axis=0)-1
accu(y_test,pred_test) 

Accuracy for testing =  81.2208760485 %
Fake news predicted as Fake =  42.5688661115 %
Fake news mispredicted as True =  53.7796284433 %
True news predicted as True =  97.4550761653 %
True news mispredicted as Fake =  1.25877953115 %
BER =  0.285464996687


## TFIDF

In [226]:
n_components = 30
n_words = 30
n_features = 1000
d_true = data[data.score==1]
d_false = data[data.score==-1]
d_amb = data[data.score==0]

def TFIDF_vec(data, n_features):
# Use tf-idf features.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,max_features=n_features,stop_words='english')
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,max_features=n_features,stop_words='english') 
    tf = tf_vectorizer.fit_transform(data)
    tfidf = tfidf_vectorizer.fit_transform(data)
    tfidf = tfidf.toarray()
    tf = tf.toarray()
    idf = tfidf_vectorizer.idf_
    words = tfidf_vectorizer.get_feature_names()
    word_id = dict(zip(words, range(len( words ))))
    return tf, tfidf, words, idf, word_id

def TF(datum, words,wordID, idf) :
    tf = [0]*len(words)
    for i in words :
        s = datum.lower().split()
        tf[wordID[i]] = s.count(i)
    return tf

def TFIDF(datum, words,wordID, idf) :
    tf = [0]*len(words)
    for i in words :
        s = datum.lower().split()
        tf[wordID[i]] = s.count(i) * idf[wordID[i]]
    return tf

tf_true, tfidf_true,words_true, idf_true, words_true_id = TFIDF_vec(d_true['statement'], n_features)
tf_amb, tfidf_amb,words_amb, idf_amb, words_amb_id = TFIDF_vec(d_amb['statement'], n_features)
tf_false, tfidf_false,words_false, idf_false, words_false_id = TFIDF_vec(d_false['statement'], n_features)



In [245]:
def features_TFIDF(datum) :
    tf1 =  np.array( TFIDF(datum,words_true,words_true_id,idf_true)  ).reshape(1,len(words_true))
    tf2 =  np.array( TFIDF(datum,words_false,words_false_id,idf_false)  ).reshape(1,len(words_true))
    tf3 =  np.array( TFIDF(datum,words_amb,words_amb_id,idf_amb)  ).reshape(1,len(words_true))
    return np.column_stack((tf1,tf2,tf3))

X_TFIDF = [features_TFIDF(datum ) for datum in data.statement]
X_TFIDF = np.array( X_TFIDF ).reshape(len(data),len(words_true)*3)
X_TFIDF_test = [features_TFIDF(datum ) for datum in data_test.statement]
X_TFIDF_test = np.array( X_TFIDF_test ).reshape(len(data_test),len(words_true)*3)  

X_ngram = np.column_stack((X_ngram_true,X_ngram_false,X_ngram_amb))
X_ngram_test  = np.column_stack((X_ngram_true_test ,X_ngram_false_test ,X_ngram_amb_test))
X_ngram_val = X_ngram_test[:5000,:]
X_ngram_test = X_ngram_test[5000:,:]
X_TFIDF_val = X_TFIDF_test[:5000,:]
X_TFIDF_test = X_TFIDF_test[5000:,:]

## NMF and LDA Feature extraction

In [254]:
nmf_true = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5)
nmf_false = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5)
nmf_amb = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5)

nmf_amb.fit(tf_amb)
nmf_false.fit(tf_false)
nmf_true.fit(tf_true)

def NMF_features(datum ): 
    tf = TFIDF(datum,words_true,words_true_id,idf_true)
    x1 = nmf_true.transform( np.array(tf).reshape(1,-1) )
    tf = TFIDF(datum,words_false,words_false_id,idf_false)
    x2 = nmf_false.transform( np.array(tf).reshape(1,-1) )
    tf = TFIDF(datum,words_amb,words_amb_id,idf_amb)
    x3 = nmf_amb.transform( np.array(tf).reshape(1,-1) )
    return np.column_stack((x1,x2,x3))

X_NMF = [NMF_features(datum ) for datum in data.statement]
X_NMF = np.array( X_NMF ).reshape(len(data),n_components*3)
X_NMF_test = [NMF_features(datum ) for datum in data_test.statement]
X_NMF_test = np.array( X_NMF_test ).reshape(len(data_test),n_components*3)

lda_true = LatentDirichletAllocation(n_topics=n_components, max_iter=5, 
                                    learning_method='online', learning_offset=50.,random_state=0)
lda_false = LatentDirichletAllocation(n_topics=n_components, max_iter=5, 
                                    learning_method='online', learning_offset=50.,random_state=0)
lda_amb = LatentDirichletAllocation(n_topics=n_components, max_iter=5, 
                                    learning_method='online', learning_offset=50.,random_state=0)
lda_amb.fit(tf_amb)
lda_false.fit(tf_false)
lda_true.fit(tf_true)

def LDA_features(datum ): 
    tf = TF(datum,words_true,words_true_id,idf_true)
    x1 = lda_true.transform( np.array(tf).reshape(1,-1) )
    tf = TF(datum,words_false,words_false_id,idf_false)
    x2 = lda_false.transform( np.array(tf).reshape(1,-1) )
    tf = TF(datum,words_amb,words_amb_id,idf_amb)
    x3 = lda_amb.transform( np.array(tf).reshape(1,-1) )
    return np.column_stack((x1,x2,x3))

X_LDA = [LDA_features(datum ) for datum in data.statement]
X_LDA = np.array( X_LDA ).reshape(len(data),n_components*3)
X_LDA_test = [LDA_features(datum ) for datum in data_test.statement]
X_LDA_test = np.array( X_LDA_test ).reshape(len(data_test),n_components*3)

## Linear SVM 

In [257]:
X_ngram = np.column_stack((X_ngram_true,X_ngram_false,X_ngram_amb ))
X_ngram_test  = np.column_stack((X_ngram_true_test ,X_ngram_false_test ,X_ngram_amb_test ))

X = np.column_stack((X_LDA,X_NMF,X_ngram))
X_test = np.column_stack((X_LDA_test,X_NMF_test,X_ngram_test))
X.shape

y = np.array( [d for d in data['score']  ] )
y_test = np.array( [d for d in data_test['score']] )

X_ngram = np.column_stack((X_ngram_true,X_ngram_false,X_ngram_amb ))
X_ngram_test  = np.column_stack((X_ngram_true_test ,X_ngram_false_test ,X_ngram_amb_test ))
X = np.column_stack((X_LDA,X_NMF,X_ngram))
X_test = np.column_stack((X_LDA_test,X_NMF_test,X_ngram_test))

# X = np.column_stack((X_TFX_ngram))
# X_test = np.column_stack((X_TFIDF_test,X_ngram_test))

clf = LinearSVC(random_state=0)
clf.fit(X, y)
pred1 = clf.predict(X_test)
accu(y_test,pred1) 

(60085, 6183)

## Logisitc regression 

In [151]:
y = y1
y_test = y_test1

X = np.column_stack((X_LDA,X_NMF,X_ngram,X_TFIDF))
X_test = np.column_stack((X_LDA_test,X_NMF_test,X_ngram_test,X_TFIDF_test))

clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
pred1= clf.predict(X_test)
for i in range(len(pred1)) :
    if pred1[i]<-0.8:
        pred1[i] = -1
    else:
        pred1[i] = 1 
accu(y_test,pred1) 

Accuracy for testing =  82.1927839169 %
Fake news predicted as Fake =  61.1467008328 %
Fake news mispredicted as True =  38.8532991672 %
True news predicted as True =  95.2111648271 %
True news mispredicted as Fake =  4.78883517285 %
BER =  0.181219810364


## RandomForest 

In [342]:
y = np.array( [d for d in data['score']  ] )
y_test = np.array( [d for d in data_test['score']] )

X = np.column_stack((X_LDA,X_ngram))
X_test = np.column_stack((X_LDA_test,X_ngram_test))

clf_tree = tree.DecisionTreeClassifier()
clf_tree.fit(X, y)
predtree = clf_tree.predict(X_test)
accu(y_test,predtree) 


Accuracy for testing =  79.0906670217 %
Fake news predicted as Fake =  58.7271581601 %
Fake news mispredicted as True =  33.1442974165 %
True news predicted as True =  89.3900422561 %
True news mispredicted as Fake =  7.56016902444 %
BER =  0.219374044206


## MLP using ngrams, NMF and LDA

In [None]:
%%time

X = np.column_stack((X_LDA,X_NMF,X_ngram,X_TFIDF))
X_test = np.column_stack((X_LDA_test,X_NMF_test,X_ngram_test,X_TFIDF_test))

y = np.array( [d for d in data['score']  ] )
y_test = np.array( [d for d in data_test['score']] )

clf = MLPClassifier( hidden_layer_sizes=(50,10,3), random_state=1)
clf.fit(X, y)
pred1 = clf.predict(X_test)
accu(y_test,pred1) 

