# Fake News Detection

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from bs4 import BeautifulSoup
import re, string
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from sklearn.metrics import confusion_matrix

In [None]:
# Load the two source CSV files; both paths are relative to the working directory.
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')

In [None]:
# Preview the first rows loaded from True.csv.
true_df.head()

In [None]:
# Preview the first rows loaded from Fake.csv.
fake_df.head()

In [None]:
# Attach the binary target column: 1 for rows of true_df, 0 for rows of fake_df.
true_df['category'] = 1
fake_df['category'] = 0

In [None]:
# Stack both labelled frames into a single dataset. ignore_index=True gives
# every row a unique 0..n-1 label; without it the two source indexes overlap,
# which makes label-based selection and index alignment on `df` ambiguous.
df = pd.concat([true_df, fake_df], ignore_index=True)
df.head()

### Visualization

In [None]:
# Number of rows per value of the 'category' label column.
df['category'].value_counts()

In [None]:
# Bar chart of the class balance. The column is selected by keyword: seaborn
# deprecated positional data vectors in 0.11 and, from 0.12 on, the first
# positional argument is `data`, so a bare Series no longer plots as `x`.
sns.countplot(x='category', data=df)

In [None]:
# Number of rows per value of the 'subject' column.
df['subject'].value_counts()

In [None]:
# Row count per subject, with one bar per 'category' value within each subject.
sns.countplot(x='subject', hue='category', data=df)

In [None]:
# Fold the headline into the body column: the result is "<text> <title>",
# overwriting the original 'text' column.
df['text'] = df['text'] + " " + df['title']

In [None]:
def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup using the ``html.parser`` backend.
    """
    return BeautifulSoup(text, "html.parser").get_text()

_STOPWORD_CACHE = None


def _get_stopword_set():
    """Return the English stopwords plus punctuation characters, built once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
    return _STOPWORD_CACHE


def remove_stopwords(text):
    """Drop English stopwords and stand-alone punctuation tokens from ``text``.

    The text is split on whitespace; a token is removed when its lower-cased
    form is an NLTK English stopword or a single ``string.punctuation``
    character. Surviving tokens keep their original casing.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The lookup set is cached because this function runs once per document;
    # rebuilding it each time re-reads the NLTK stopword list on every call.
    stop = _get_stopword_set()
    # str.split() already yields tokens without surrounding whitespace, so no
    # extra strip() is needed.
    return " ".join(token for token in text.split() if token.lower() not in stop)

def clean_text(text):
    """Normalise one raw document.

    Steps, in order: strip HTML markup, delete square-bracketed spans, delete
    tokens starting with ``http`` (URLs), then remove stopwords and
    punctuation tokens.

    Args:
        text: Raw document string.

    Returns:
        The cleaned string.
    """
    text = strip_html(text)
    # Raw string literal: in a plain literal the backslash-bracket sequences
    # are invalid escapes and trigger a SyntaxWarning on recent Python versions.
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r'http\S+', '', text)
    return remove_stopwords(text)

# Run clean_text on every row, overwriting the 'text' column with the result.
df['text'] = df['text'].apply(clean_text)

In [None]:
def show_wc(text):
    """Draw a word cloud built from ``text`` on a new 20x20 figure.

    The cloud is limited to 2000 words, rendered at 1600x800, and filters the
    ``wordcloud`` package's default ``STOPWORDS``.
    """
    cloud = WordCloud(
        max_words=2000,
        width=1600,
        height=800,
        stopwords=STOPWORDS,
    )
    plt.figure(figsize=(20, 20))
    plt.imshow(cloud.generate(text), interpolation='bilinear')

In [None]:
# Word cloud over all texts with category == 1 (the "True" class).
show_wc(" ".join(df[df.category == 1].text))
# Word cloud over all texts with category == 0 (the "Fake" class).
show_wc(" ".join(df[df.category == 0].text))

In [None]:
def get_top_text_ngrams(corpus, n, g):
    """Return the ``n`` most frequent ``g``-grams in ``corpus``.

    Args:
        corpus: Iterable of text documents.
        n: Number of entries to return.
        g: N-gram order (1 for unigrams, 2 for bigrams, ...).

    Returns:
        A list of ``(ngram, total_count)`` tuples sorted by count, highest
        first, truncated to ``n`` entries.
    """
    vec = CountVectorizer(ngram_range=(g, g))
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in a single pass instead of tokenising the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the corpus-wide count of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

def show_ngrams(n):
    """Plot a horizontal bar chart of the 10 most frequent ``n``-grams in ``df.text``."""
    plt.figure(figsize=(16, 9))
    top_terms = dict(get_top_text_ngrams(df.text, 10, n))
    counts = list(top_terms.values())
    labels = list(top_terms.keys())
    sns.barplot(x=counts, y=labels)

In [None]:
# Unigram
# Most frequent single words (unigrams).
show_ngrams(1)

In [None]:
# Bigram
# Most frequent two-word sequences (bigrams).
show_ngrams(2)

In [None]:
# Trigram
# Most frequent three-word sequences (trigrams).
show_ngrams(3)

### Preprocess

In [None]:
# Features are the 'text' column; the target is the 'category' label.
X, y = df['text'], df['category']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

In [None]:
# Tokenisation and padding settings.
vocab_size = 10000       # tokenizer keeps only the most frequent words (num_words)
embedding_dim = 100
max_length = 300         # every sequence is padded/truncated to this many tokens
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words
training_size = 40000    # not referenced in this cell

# Fit the vocabulary on the training texts only, so nothing from the test
# split leaks into the word index.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Pad and truncate the test sequences with the same settings as the training
# sequences. pad_sequences defaults to 'pre' for both, which would hand the
# model differently aligned inputs at evaluation time.
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# From here on X_train / X_test are the padded integer arrays.
X_train = training_padded
X_test = testing_padded

In [None]:
EMBEDDING_FILE = 'glove.twitter.27B.100d.txt'


def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Each line is split on spaces into a token followed by its vector components.
# The context manager closes the file, and the explicit encoding avoids
# depending on the platform's default text encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in emb_file)

# np.stack requires a real sequence; current NumPy rejects a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

nb_words = min(vocab_size, len(word_index))
# The matrix has vocab_size rows so that it matches both the tokenizer
# (num_words=vocab_size, so emitted ids are below vocab_size) and the guard in
# the loop below. Rows start as random draws matching the overall embedding
# mean/std and are overwritten wherever a pretrained vector exists.
embedding_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, embed_size))
for word, i in word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        

### Model Training + Evaluation

In [None]:
# Network: frozen embedding initialised from embedding_matrix, two stacked
# LSTMs (the first returns the full sequence so the second can consume it),
# then a small dense head ending in a single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    LSTM(128, return_sequences=True, recurrent_dropout=0.25, dropout=0.25),
    LSTM(64, recurrent_dropout=0.1, dropout=0.1),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# The held-out test split doubles as the validation set during training.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
)

In [None]:
# Keras records the accuracy metric as 'acc' or 'accuracy' depending on its
# version; accept whichever key is present instead of failing with KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Curve names must go through `label=`; a bare positional string after the
# format string is interpreted by plt.plot as another data series.
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title('Training and validation loss')
plt.legend()

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
# Sequential.predict_classes was removed in TensorFlow 2.6; this expression is
# the documented replacement for a single-unit sigmoid output.
y_pred = (model.predict(X_test) > 0.5).astype('int32')
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(14,10))
# Label 0 is the "Fake" class and label 1 the "Original" class, in that order
# on both axes.
sns.heatmap(cm, cmap='Blues', linecolor='black', linewidth=1, annot=True, fmt='', xticklabels=['Fake', 'Original'], yticklabels=['Fake', 'Original'])
plt.xlabel("Predicted")
plt.ylabel("Actual")

In [None]:
'''
Inspiration
1. https://www.kaggle.com/madz2000/nlp-using-glove-embeddings-99-87-accuracy
'''