## NLP Project
## Group-05

## Fake News Generation and Detection Using ANN and Naive Bayes

In [2]:
import numpy
import numpy as np
import pandas as pd
from os import system, name
from IPython.display import clear_output
import string
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
import re

In [3]:
# Data set: Kaggle "Fake News Detection"
# (https://www.kaggle.com/jruvika/fake-news-detection) [1].
# The first row of the file is skipped and explicit column names are supplied.
column = ['URLs', 'Headline', 'Body', 'Label']
dataset = pd.read_csv(
    "data.csv",
    names=column,
    header=None,
    skiprows=1,
)

# Overview of the data set: dimensions first, then the first rows as cell output.
print(dataset.shape)
dataset.head()

(4009, 4)


Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


## 1. Fake News Generation using ANN

### 1.1 Data preprocessing

In [4]:
# Read in only the two columns we need (Label: 0 for fake, 1 for true).
news = pd.read_csv('data.csv', usecols=['Body', 'Label'])
news['Body'] = news['Body'].astype(str)

# Remove noise: every run of characters other than ASCII letters, digits,
# spaces and periods is deleted. The pattern is compiled once and reused.
noise_pattern = re.compile('[^0-9a-zA-Z .]+')
final_words = [noise_pattern.sub('', body) for body in news['Body']]
news['Body'] = final_words

# Write two sample rows (index labels 7 and 12) to disk as the training corpus.
# NOTE(review): all selected columns are written, so each line of news.txt also
# carries that row's Label value after the body -- confirm this is intended.
file_name = "news.txt"
np.savetxt(file_name, news.loc[[7, 12], :].values, fmt='%s')

# Read the corpus back as one lower-cased string. The context manager makes
# sure the file handle is closed (the bare open(...).read() left it open).
with open(file_name) as corpus_file:
    news_text = corpus_file.read()
news_text = news_text.lower()

### 1.2 ANN initialization and training

In [8]:
# Character-level vocabulary: each distinct character of the corpus gets an
# integer id, assigned in sorted character order.
unique_chars = sorted(set(news_text))
c_to_i_map = {char: intgr for intgr, char in enumerate(unique_chars)}
total_chars = len(news_text)
total_vocab = len(unique_chars)

print("Total Characters: ", total_chars)
print("Total Vocabulary: ", total_vocab)

# Build integer-encoded (input, output) pairs by sliding a window of sq_len
# characters over the text one step at a time: the window is the input and
# the character immediately after it is the output.
sq_len = 100
window_starts = range(total_chars - sq_len)
X_temp = [
    [c_to_i_map[ch] for ch in news_text[start:start + sq_len]]
    for start in window_starts
]
Y_temp = [c_to_i_map[news_text[start + sq_len]] for start in window_starts]

total_patterns = len(X_temp)
print("Total Patterns: ", total_patterns)

# X becomes [samples, time steps, features] for the LSTM and is scaled by the
# vocabulary size; the targets are one-hot encoded.
X = np.reshape(X_temp, (total_patterns, sq_len, 1))
X = X / float(total_vocab)
Y = np_utils.to_categorical(Y_temp)

# Text generator: two stacked 256-unit LSTM layers, each followed by 20%
# dropout, and a softmax layer with one unit per one-hot target column.
textgen_model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(Y.shape[1], activation='softmax'),
])
textgen_model.compile(loss='categorical_crossentropy', optimizer='adam')

# Checkpoint callback: monitors the training loss (mode 'min') and keeps only
# the best weights in a single file.
filepath = "textgen_nn_weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=0,
)
callbacks_list = [checkpoint]

print('Please wait...')
textgen_model.fit(X, Y, epochs=100, batch_size=64, callbacks=callbacks_list)

print('ANN saved successfully in textgen_nn_weights.hdf5!')

Total Characters:  6008
Total Vocabulary:  38
Total Patterns:  5908
Please wait...
Epoch 1/1
ANN saved successfully in textgen_nn_weights.hdf5!


### 1.3 News text prediction and generation

In [18]:
# Generates 5 news texts with the trained character-level model.
print("Please wait...")

# Reload the network weights written by the training checkpoint.
filename = "textgen_nn_weights.hdf5"
textgen_model.load_weights(filename)
textgen_model.compile(loss='categorical_crossentropy', optimizer='adam')

text_generated = []
# Inverse of c_to_i_map: integer id -> character.
i_to_c_map = dict(enumerate(unique_chars))

for j in range(0, 5):
    # Pick a random seed pattern. randint's upper bound is exclusive, so
    # len(X_temp) (not len(X_temp) - 1) makes every pattern selectable.
    start_index = numpy.random.randint(0, len(X_temp))
    # Work on a copy: appending to X_temp[start_index] itself would
    # permanently lengthen that training pattern.
    pattern = list(X_temp[start_index])
    text_temp = ''.join(i_to_c_map[v] for v in pattern)

    # Generate 200 characters, one at a time: predict the next character from
    # the current window, then slide the window forward by one position.
    generated_chars = []
    for i in range(200):
        x = numpy.reshape(pattern, (1, len(pattern), 1))
        x = x / float(total_vocab)  # same scaling as the training inputs
        predict = textgen_model.predict(x, verbose=0)
        indx = int(numpy.argmax(predict))  # greedy choice: most probable class
        generated_chars.append(i_to_c_map[indx])
        pattern = pattern[1:] + [indx]
    new_text = ''.join(generated_chars)

    # Seed text and generated continuation, separated by a single space.
    text_generated.append(text_temp + ' ' + new_text)

# Placeholder class for each generated text (0 for fake, 1 for true).
yTest = [0, 1, 0, 1, 0]
print("\nGenerated News:\n")
print(numpy.array(text_generated))


Please wait...

Generated News:

['to learn in life and in baseball. youve had your 15 minutes of fame and brought a bunch of drama int o the game i love. now who is next because it is coming. the newbie opened up the flood gates.so now that its out there and instead of the news main focus being about the yankees blowing two games aga'
 ' regular season. maxwell said his decision had been coming for a long time and referenced what happe ned in charlottesville. hes had a while to sit and stew on that and just now decides to join the movement and take a knee right at the end of the season. i dont look at that and find that to be brave '
 'ed so hard to avoid in high school. i respect your opinion whatever it may be but i want to see you  play the game and hear about all of the news that comes form that..not from you taking a knee and causing americans everywhere to debate about whether it is right or wrong.that is why i stopped watchi'
 'or late last year.for the full mint interview see 

## 2. Fake News Detection

### 2.1 Fake News Detection using ANN

In [57]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Flatten
from keras.layers import Embedding 
from keras.layers import Input,LSTM,Bidirectional,Activation,Conv1D
from keras.layers import Embedding,GlobalMaxPooling1D, MaxPooling1D, Add
from sklearn.model_selection import train_test_split

### 2.1.1 Data preprocessing

In [58]:
# Load only the article body and its label from the CSV file.
print("Please wait...")
dataset = pd.read_csv('data.csv', usecols=['Body', 'Label'])
dataset['Body'] = dataset['Body'].astype('str')

# Train/test separation: 30% of the rows are held out, with a fixed seed.
Train_data = dataset['Body']
labels = dataset['Label']
X_train, X_test, y_train, y_test = train_test_split(
    Train_data,
    labels,
    test_size=0.30,
    random_state=0,
)

# Tokenization: the word index is built from the training texts only.
max_length = 100
tok = Tokenizer()
tok.fit_on_texts(X_train)
vocab_size = len(tok.word_index) + 1

# Integer-encode the training texts, then pad them at the end ('post') to
# max_length so that every sequence has the same length.
X_train_encoded = tok.texts_to_sequences(X_train)
x_train_padding = pad_sequences(
    X_train_encoded,
    maxlen=max_length,
    padding='post',
)
print('Shape of data tensor:', x_train_padding.shape)
print('Shape of label tensor:', y_train.shape)

# The test texts go through the same encoding and padding.
X_test_encoded = tok.texts_to_sequences(X_test)
x_test_padding = pad_sequences(
    X_test_encoded,
    maxlen=max_length,
    padding='post',
)

# Load the GloVe word embeddings into a dict: word -> float32 vector.
# Each line of the file is a word followed by its coefficients.
embeddings_index = dict()
# The context manager closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

# Embedding matrix of shape (vocab_size, 100), initialised with zeros.
embedding_matrix = zeros((vocab_size, 100))
# Copy the GloVe vector of every tokenizer word that has one into the row at
# that word's index; words without a vector keep an all-zero row.
for word, idx_word in tok.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx_word] = embedding_vector



Please wait...
Shape of data tensor: (2806, 100)
Shape of label tensor: (2806,)
Loaded 400000 word vectors.


### 2.1.2 ANN initialization and training

In [59]:
# Defines the NN model.
print("\nPlease wait...\n")

nn_model = Sequential()
# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False). This layer can only be used as the first layer in a model.
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
nn_model.add(e)
nn_model.add(Flatten())
# Single sigmoid unit for the binary label.
nn_model.add(Dense(1, activation='sigmoid'))
nn_model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (doing so emitted a stray "None" line).
nn_model.summary()

# Fits the model; the held-out test split is passed as validation data.
history = nn_model.fit(x_train_padding, y_train, epochs=5, verbose=0, validation_data=(x_test_padding, y_test))
# Saves network weights for future use.
nn_model.save_weights('nn_detector_weights.hdf5')

# Per-epoch training and validation accuracy.
acc = history.history['acc']
print ("Accuracy history: ",acc)
val_acc = history.history['val_acc']
print("\nValidation history: ",val_acc)

print('\nANN saved successfully in nn_detector_weights.hdf5!')


Please wait...

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          4748600   
_________________________________________________________________
flatten_5 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 10001     
Total params: 4,758,601
Trainable params: 10,001
Non-trainable params: 4,748,600
_________________________________________________________________
None
Accuracy history:  [0.7947255882380778, 0.9208838203848895, 0.9586600142551674, 0.9707769065012017, 0.9782608694377662]

Validation history:  [0.8894430587713855, 0.9201995015937094, 0.9368246051537822, 0.9376558603491272, 0.940980881130507]

ANN saved successfully in nn_detector_weights.hdf5!


### 2.1.3 Classification of newly generated news by the bot

In [81]:
# Reload the detector weights saved after training.
filename = "nn_detector_weights.hdf5"
nn_model.load_weights(filename)
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Classify every generated text directly from the model output.
# The earlier approach scored each text against an arbitrary label from yTest
# and flipped that label when the accuracy was below 0.5; that recovers the
# same class, but ties this cell to the length of yTest and overwrites the
# `news`, `X_test` and `y_test` variables produced by earlier cells.
if len(text_generated) > 0:
    # Encode and pad the texts the same way as the training data.
    generated_encoded = tok.texts_to_sequences(text_generated)
    generated_padded = pad_sequences(generated_encoded, padding='post', maxlen=max_length)
    # One sigmoid output per text; above 0.5 means class 1 (true), else 0 (fake).
    generated_probs = nn_model.predict(generated_padded, verbose=0)
    yTestNN = [int(prob > 0.5) for prob in numpy.ravel(generated_probs)]
else:
    yTestNN = []

print('Classifications predicted by NN:', yTestNN)

Classifications predicted by NN: [0, 0, 0, 0, 0]


### 2.2 Fake News Detection Using Naive Bayes

In [82]:
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB

### 2.2.1 Data preprocessing

In [93]:
print("Please wait...\n")
# Read the CSV file: the first row is skipped and explicit column names are
# supplied instead.
column = ['URLs', 'Headline', 'Body', 'Label']
dataset = pd.read_csv(
    "data.csv",
    names=column,
    header=None,
    skiprows=1,
)
dataset['Body'] = dataset['Body'].astype('str')

# Noise removal: delete every run of characters other than ASCII letters,
# digits, spaces and periods from each article body.
noise_pattern = re.compile('[^0-9a-zA-Z .]+')
final_words = [noise_pattern.sub('', body) for body in dataset['Body']]
dataset['Body'] = final_words

# Training and test set separation: 30% held out, with a fixed seed.
Train_data = dataset['Body']
labels = dataset['Label']
X_train, X_test, y_train, y_test = train_test_split(
    Train_data,
    labels,
    test_size=0.30,
    random_state=0,
)
# Tokenization
# A single stemmer instance is shared by all calls; building a new
# PorterStemmer for every document is loop-invariant work.
_STEMMER = PorterStemmer()


def myTokenizer(textData):
    """Split ``textData`` into word tokens and return their Porter stems.

    Args:
        textData: the text of one document, as a string.

    Returns:
        A list with the stem of every token produced by
        ``nltk.word_tokenize``, in the original order.
    """
    return [_STEMMER.stem(w) for w in nltk.word_tokenize(textData)]

# Only punctuation symbols are used as stop words.
punctuationList = list(string.punctuation)

# Count Vectorizer with the stemming tokenizer and binary term indicators.
# NOTE(review): min_df / max_df are given as integers (4 and 999); per the
# scikit-learn docs these should be document counts -- confirm intended.
vect = CountVectorizer(
    tokenizer=myTokenizer,
    stop_words=punctuationList,
    binary=True,
    min_df=4,
    max_df=999,
    max_features=None,
)
vect.fit(X_train)

# vocabulary_ is a mapping of terms to feature indices.
print ("Total vocabulary: ",len(vect.vocabulary_))

# Document-term matrix of the training texts.
XTrain = vect.transform(X_train)
print("XTrain shape:", XTrain.shape)
print("\nDone")

Please wait...

Total vocabulary:  11287
XTrain shape: (2806, 11287)

Done


### 2.2.2 Naive Bayes initialization and training

In [94]:
print("Please wait...\n")
# Multinomial Naive Bayes (alpha=1.0, i.e. Laplace smoothing) fitted on the
# training document-term matrix and the corresponding classes.
clf = MultinomialNB(alpha=1.0).fit(XTrain, y_train)
print("Done")

Please wait...

Done


### 2.2.3 Classification of newly generated news by the bot

In [98]:
print("Please wait...\n")

# Classify every generated text directly with the fitted vectorizer and
# classifier. The earlier approach scored each text against an arbitrary label
# from yTest and flipped that label when the score was below 0.5; that
# recovers the same class, but ties this cell to the length of yTest and
# overwrites the `news`, `X_test` and `y_test` variables of earlier cells.
if len(text_generated) > 0:
    generated_features = vect.transform(text_generated)
    # tolist() turns the predicted labels into a plain Python list.
    yTestNB = clf.predict(generated_features).tolist()
else:
    yTestNB = []

print('Classifications predicted by NB:', yTestNB)

Please wait...

Classifications predicted by NB: [0, 0, 0, 0, 0]


## 3. Results comparison

In [106]:
# Side-by-side comparison: each generated text with its placeholder class and
# the class predicted by the neural network and by Naive Bayes.
results_columns = ['News text', 'Random Classes', 'Classification by NN', 'Classification by NB']
results_data = {
    'News text': text_generated,
    'Random Classes': yTest,
    'Classification by NN': yTestNN,
    'Classification by NB': yTestNB,
}
pd.DataFrame(results_data, columns=results_columns)

Unnamed: 0,News text,Random Classes,Classification by NN,Classification by NB
0,to learn in life and in baseball. youve had yo...,0,0,0
1,regular season. maxwell said his decision had...,1,0,0
2,ed so hard to avoid in high school. i respect ...,0,0,0
3,or late last year.for the full mint interview ...,1,0,0
4,job is to focus on playing baseball right now....,0,0,0
