In [None]:
#To work with nd-arrays
import numpy as np

#To work with data structures
import pandas as pd

#To render plots inline in the notebook output (Jupyter Notebooks only)
%matplotlib inline

#To compute accuracy for models
from sklearn.metrics import accuracy_score

#To split dataset into training and validation
from sklearn.model_selection import train_test_split

#Importing Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

#Preprocessing Text documents(articles)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#To build Convolution Neural Network
from keras import layers
from keras.models import Sequential

#To plot graphs
import matplotlib.pyplot as plt

In [None]:
#Tunable settings gathered in one place
    #Location of the CSV file the articles are read from
TEXT_DATA = 'data/fake_or_real_news.csv'

    #Fraction of samples held out by train_test_split
TEST_SPLIT = 0.2
    #num_words handed to the Keras Tokenizer
MAX_NUM_WORDS = 25000
    #Length every sequence is padded/truncated to
MAX_SEQUENCE_LENGTH = 5000
    #Size of each word vector produced by the Embedding layer
EMBEDDING_DIM = 300

In [None]:
#Helper that scores a prediction function on the training and test splits
def evaluate_model(pred_func, X_train, y_train, X_test, y_test):
    """Return (train_accuracy, test_accuracy) for a prediction callable.

    pred_func -- callable that maps a feature set to predicted labels
    X_train, y_train -- training features and their true labels
    X_test, y_test -- held-out features and their true labels

    Both accuracies are computed with sklearn's accuracy_score.
    """
    def _accuracy_on(features, targets):
        #Predict first, then compare the predictions with the true labels
        return accuracy_score(targets, pred_func(features))

    #Tuple elements are evaluated left to right: training split, then test split
    return _accuracy_on(X_train, y_train), _accuracy_on(X_test, y_test)

In [None]:
#Loading dataset
df = pd.read_csv(TEXT_DATA)
#Dropping variables that are irrelevant to our study
    #drop() returns a new frame, so the in-place mutation is not needed
df = df.drop(labels=['id', 'title'], axis='columns')
#Filtering out articles with no text
    #notna() guards against missing values (len() of a NaN raises TypeError)
    #strip() also discards articles that contain nothing but whitespace
mask = df['text'].notna() & (df['text'].astype(str).str.strip().str.len() > 0)
df = df[mask]
df.head(5)

In [None]:
#Separating the article bodies from their labels
texts, labels = df['text'], df['label']

#Reporting how many of each were found (the two counts should match)
print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels))

In [None]:
#Finding word count of each article
    #split() without an argument splits on any run of whitespace, so newlines
    #and tabs separate words and repeated spaces do not create empty "words"
text_lengths = texts.apply(lambda x: len(x.split()))
print("\nMaximum number of words in an article")
print(text_lengths.max())

#Histogram plot for word count
    #Bin edges every 500 words from 0 to 5000; longer articles are not drawn
print("\nHistogram plot for articles upto 5000 words")
plt.hist(text_lengths, bins=list(range(0, 5001, 500)))
plt.ylabel("Article count")
plt.xlabel("No of words")
plt.show()

## Method I - Naive Bayes

In [None]:
# set up vector models for training and testing
from sklearn.feature_extraction.text import CountVectorizer

# data vectorizer
    #min_df = int->minimum number of documents a word must appear in
    #stop_words = english-> inbuilt stop words list for english is used
    #binary = True-> if a word occurs even once, assigns '1'
    #analyzer = features are taken as words
vectorizer = CountVectorizer(analyzer = "word",
                             binary = True,
                             min_df = 2,
                             stop_words='english')
#Vectorizer is fit for the dataset
    #NOTE: toarray() materialises a dense documents x vocabulary matrix
docarray = vectorizer.fit_transform(texts).toarray()
#Displaying output of Count Vectorization as a dataframe
    #vectorizer.vocabulary_ -> dict mapping each word to its column index
    #Iterating the dict directly does not follow column order, so the words are
    #sorted by their column index to label every column with the right word
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
docterm = pd.DataFrame(docarray, columns=feature_names)
print("\nAfter Count Vectorization\n")
print(docterm)

In [None]:
# create training and test data
    #Encoding the categorical labels as integers: 'FAKE' -> 0, anything else -> 1
    #random_state fixes the shuffle so the split is reproducible across runs
docterm_train, docterm_test, y_train, y_test = train_test_split(
    docterm,
    labels.apply(lambda x: 0 if x == 'FAKE' else 1),
    test_size=TEST_SPLIT,
    random_state=42)

In [None]:
#Fitting a Multinomial Naive Bayes classifier on the document-term matrix
    #Estimates how likely each word is under each class (FAKE/REAL)
    #Every word is treated as an independent feature
model = MultinomialNB().fit(docterm_train, y_train)

#Scoring the fitted classifier on the training and held-out splits
train_acc, test_acc = evaluate_model(pred_func=model.predict,
                                     X_train=docterm_train,
                                     y_train=y_train,
                                     X_test=docterm_test,
                                     y_test=y_test)
print("Training Accuracy: {:.2f}".format(100 * train_acc))
print("Testing Accuracy: {:.2f}".format(100 * test_acc))

## Method II - Convolutional DNN

In [None]:
#Tokenizer restricted to the most frequent words (num_words=MAX_NUM_WORDS)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
#Creating Vocabulary index based on word frequency; lower the index, higher the frequency
tokenizer.fit_on_texts(texts)
#Replacing words with corresponding word index taken from fit_on_texts
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
#Showing only a short preview: printing the whole dictionary floods the
#notebook output with every unique token in the corpus
preview_size = 20
most_frequent = sorted(word_index.items(), key=lambda item: item[1])[:preview_size]
print("\nThe {} most frequent words in the dataset (word, index); lower index = more frequent\n".format(preview_size))
print(most_frequent)
#Vocabulary size used for the Embedding layer (+1 because index 0 is the padding value)
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1

#Making every sequence exactly MAX_SEQUENCE_LENGTH values long:
    #padding='pre'    -> shorter sequences get 0's added in front
    #truncating='pre' -> longer sequences lose values from the front
data = pad_sequences(sequences,
                     maxlen=MAX_SEQUENCE_LENGTH,
                     padding='pre',
                     truncating='pre')

print('Found {} unique tokens.' .format(len(word_index)))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# create training and validation data
    #Encoding the categorical labels as integers: 'FAKE' -> 0, anything else -> 1
    #random_state fixes the shuffle so the split is reproducible across runs
x_train, x_val, y_train, y_val = train_test_split(data,
                                                  labels.apply(lambda x: 0 if x == 'FAKE' else 1),
                                                  test_size=TEST_SPLIT,
                                                  random_state=42)

In [None]:
#Assembling the CNN one layer at a time
model = Sequential()

#Embedding: maps every word index to a dense, trainable vector
model.add(layers.Embedding(input_dim=num_words,                 #Size of vocabulary
                           output_dim=EMBEDDING_DIM,            #Length of each word vector
                           input_length=MAX_SEQUENCE_LENGTH,    #Length of each sequence(article)
                           trainable=True))                     #Weights are updated during training

#Conv1D: 128 output filters, each sliding over a window of 5 consecutive words
model.add(layers.Conv1D(filters=128, kernel_size=5, activation='relu'))

#Global max pooling: keeps the strongest response of each filter over the whole sequence
model.add(layers.GlobalMaxPooling1D())

#Fully connected hidden layer
model.add(layers.Dense(units=128, activation='relu'))
#Fully connected output layer; a single sigmoid unit for the binary label
model.add(layers.Dense(units=1, activation='sigmoid'))

#Binary cross-entropy loss, RMSprop optimizer, accuracy tracked as the metric
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
# fit the CNN, validating on the held-out split after every epoch
    #the returned History object keeps the per-epoch metrics
history = model.fit(x=x_train,
                    y=y_train,
                    validation_data=(x_val, y_val),
                    epochs=5,
                    batch_size=512,
                    verbose=1)

In [None]:
# Plot training & validation accuracy values
    #Depending on the Keras version the metric is recorded as 'acc' or as
    #'accuracy'; use whichever key this History object actually contains
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# evaluate model
def predict_labels(x):
    """Return hard 0/1 class labels for x from the CNN's sigmoid outputs."""
    #model.predict yields one probability per sample from the sigmoid output
    #unit; accuracy_score needs discrete labels, so threshold at 0.5 and
    #flatten to a 1-D array
    return (model.predict(x) > 0.5).astype('int32').ravel()

#evaluate_model expects a callable, so the function itself is passed (not called)
train_acc, test_acc = evaluate_model(predict_labels,
                                     x_train,
                                     y_train,
                                     x_val,
                                     y_val)
print("Training Accuracy: {:.2f}".format(train_acc*100))
print("Testing Accuracy: {:.2f}".format(test_acc*100))