Tasks:


1. Data Preprocessing of text
2. Exploratory Data Analysis
3. Visualization
4. Model Selection and Evaluation
5. Data Pipeline
6. Model Deployment
7. Consolidation and Discussion

### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
import re
import string
from wordcloud import WordCloud, STOPWORDS
from spacy import displacy
from pathlib import Path
from textblob import TextBlob
from tqdm import tqdm, tqdm_notebook
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import confusion_matrix, f1_score, classification_report

import io
import os
import shutil
import zipfile
import urllib
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, LSTM, Dropout, Bidirectional, Input, Conv1D, MaxPooling1D
from tensorflow.keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Raw "fake" half of the Kaggle fake-and-real-news dataset.
fake_csv_path = '../input/fake-and-real-news-dataset/Fake.csv'
fake_news = pd.read_csv(fake_csv_path)

fake_news.head()

In [3]:
# Summarise the columns of the raw fake-news frame before any filtering.
fake_news.info()

In [4]:
def operate_on_word(text):
    """Normalise one raw article/title string for bag-of-words style models.

    Steps, in order:
      1. lower-case;
      2. drop ``[...]`` bracketed spans;
      3. drop URLs (``http(s)://...`` and ``www....``);
      4. drop HTML-like ``<...>`` tags;
      5. turn every remaining non-word character (punctuation, newlines, ...)
         into a space;
      6. drop whatever ``string.punctuation`` characters are left (only ``_``,
         which ``\\W`` treats as a word character);
      7. drop every token that contains a digit.

    The structured patterns (2-4) must run *before* step 5: once ``:``, ``/``,
    ``.``, ``<`` and ``>`` have been replaced by spaces they can no longer
    match, which is why URLs and tags used to leak into the corpus as stray
    tokens such as ``https`` or ``com``.

    Args:
        text: the raw string to clean (must support ``.lower()``).

    Returns:
        The cleaned, lower-cased string. Removed spans leave their surrounding
        whitespace behind, so runs of spaces are expected.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here and no
    # separate newline-stripping pass is needed.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [5]:
# Clean every fake article and glue them into one long string for the word cloud.
cleaned_fake_texts = fake_news['text'].apply(operate_on_word)
fake_corpus = ' '.join(cleaned_fake_texts)

len(fake_corpus)

In [6]:
# Body of the article stored under index label 0 (kept for ad-hoc inspection).
first_fake_news = fake_news.loc[0, 'text']

In [7]:
# Word cloud of the cleaned fake-news corpus, with wordcloud's stock stop words removed.
stopwords = set(STOPWORDS)
cloud_settings = dict(
    width=800,
    height=800,
    stopwords=stopwords,
    background_color='white',
    min_font_size=10,
)
cloud_news = WordCloud(**cloud_settings).generate(fake_corpus)

fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(cloud_news, interpolation='bilinear')
ax.axis('off')
plt.show()
# Uncomment to persist the figure:
# fig.savefig('WordCloud.png')

In [8]:
# Load spaCy's small English pipeline; `nlp` is reused by the named-entity cell below.
nlp = spacy.load('en_core_web_sm')
title_doc1 = nlp('NLP for Fake News Detection')
# sentences_spans = list(doc.sents)

# Draw the dependency parse of the project title inline in the notebook.
displacy.render(title_doc1, style='dep', jupyter=True)

# To save the plot instead, the render call would have to return the SVG markup
# into a variable `svg` (not defined in this cell), then:
# output_path = Path("dependency_plot.svg")
# output_path.open("w", encoding="utf-8").write(svg)

In [9]:
# Raw "true" half of the Kaggle fake-and-real-news dataset.
genuine_csv_path = '../input/fake-and-real-news-dataset/True.csv'
genuine_news = pd.read_csv(genuine_csv_path)

genuine_news.head()

In [10]:
# Named Entity Visualization

# Candidate row labels; presumably selected with the (commented-out) length filter:
# genuine_news.text[(478 < genuine_news.text.apply(len)) & (genuine_news.text.apply(len) < 480)].index
indices = [484, 968, 1257, 3799, 7603, 7628, 9147, 12196, 12735,
           16536, 17528, 19804, 20393]

# Pick one candidate at random (unseeded, so it changes between runs) and
# highlight its named entities inline.
sample_index = np.random.choice(indices, 1)[0]
sample_doc = nlp(genuine_news.text[sample_index])
displacy.render(sample_doc, style='ent', jupyter=True)

# output_path = Path("named_entity_plot.svg")
# output_path.open("w", encoding="utf-8").write(svg)

In [11]:
# Label every row of this frame as fake (genuine == 0).
fake_news['genuine'] = 0

# Drop articles whose body is empty or whitespace-only. Comparing against a
# single ' ' alone would let '' or multi-space bodies through.
blank_text = fake_news['text'].astype(str).str.strip().eq('')
fake_news = fake_news.drop(fake_news[blank_text].index)

print(len(fake_news))

# Hold five articles out of training for the deployment demo. The fixed seed
# keeps the hold-out (and therefore the training set) identical across runs.
fake_deploy = fake_news.sample(n=5, random_state=0)

print(fake_deploy.head())

fake_news = fake_news.drop(fake_deploy.index)

# Raw text is kept here; cleaning with operate_on_word is applied later on a copy.

print(len(fake_news))

part_1 = fake_news[['title', 'text', 'genuine']]

In [12]:
# Label every row of this frame as genuine (genuine == 1).
genuine_news['genuine'] = 1

# Drop articles whose body is empty or whitespace-only. Comparing against a
# single ' ' alone would let '' or multi-space bodies through.
blank_text = genuine_news['text'].astype(str).str.strip().eq('')
genuine_news = genuine_news.drop(genuine_news[blank_text].index)

print(len(genuine_news))

# Hold five articles out of training for the deployment demo. The fixed seed
# keeps the hold-out (and therefore the training set) identical across runs.
genuine_deploy = genuine_news.sample(n=5, random_state=0)

genuine_news = genuine_news.drop(genuine_deploy.index)

# Raw text is kept here; cleaning with operate_on_word is applied later on a copy.

print(len(genuine_news))

part_2 = genuine_news[['title', 'text', 'genuine']]

In [13]:
# Stack the fake and genuine subsets row-wise under a fresh 0..n-1 index.
labelled_parts = (part_1, part_2)
all_parts = pd.concat(labelled_parts, axis=0, ignore_index=True)

all_parts.head()

In [14]:
# Class counts with readable labels. value_counts() orders rows by frequency,
# so the labels are mapped from the actual class value (0 = fake, 1 = genuine)
# rather than assigned by position, which would mislabel the rows whenever
# genuine articles outnumber fake ones.
counts_df = (
    all_parts['genuine']
    .value_counts()
    .rename(index={0: 'Fake', 1: 'Genuine'})
    .to_frame()
)
counts_df.columns = ['Count']

counts_df.head()

In [15]:
# Shuffle Dataset
# A fixed seed keeps the row order, and every split derived from it,
# reproducible across kernel restarts.
df = all_parts.sample(frac=1, random_state=0).reset_index(drop=True)

df.head()

In [16]:
# Class balance of the shuffled frame (0 = fake, 1 = genuine).
df.genuine.value_counts()

In [17]:
# Feature Engineering
# Sentiment polarity plus character/word counts for body and title, all
# computed on the string form of each cell.
text_as_str = df['text'].astype(str)
title_as_str = df['title'].astype(str)


def _count_words(value):
    """Number of whitespace-separated tokens in ``value``."""
    return len(value.split())


df['polarity'] = text_as_str.map(lambda doc: TextBlob(doc).sentiment.polarity)
df['text_len'] = text_as_str.map(len)
df['text_word_count'] = text_as_str.map(_count_words)
df['title_len'] = title_as_str.map(len)
df['title_word_count'] = title_as_str.map(_count_words)

In [18]:
# df2.to_csv('/content/drive/MyDrive/News_NLP/news_cleaned_with_feats.csv',index=False)

In [19]:
def _rank_terms(documents, n=None):
    """Return the ``n`` most frequent non-stop-word terms as (term, count) pairs."""
    vectorizer = CountVectorizer(stop_words='english')
    term_matrix = vectorizer.fit_transform(documents)
    totals = np.asarray(term_matrix.sum(axis=0)).ravel()
    ranked = sorted(
        ((term, totals[column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


def get_top_n_words(corpus, name='text', n=None, labels=None):
    """Plot the most frequent terms of fake vs. true news side by side.

    Args:
        corpus: pandas Series of documents (titles or bodies).
        name: suffix of the saved figure, ``most_freq_{name}.png``.
        n: number of terms per panel; ``None`` plots the whole vocabulary.
        labels: Series aligned with ``corpus`` holding 0 for fake and 1 for
            genuine rows. Defaults to the notebook-level ``df['genuine']``,
            which is what this function always read before the parameter
            existed.

    Returns:
        None. The figure is drawn and written to ``most_freq_{name}.png``.
    """
    if labels is None:
        labels = df['genuine']

    documents = corpus.astype(str)
    # genuine == 0 is the fake class and genuine == 1 the true class; the two
    # subsets used to be swapped, so each panel carried the other one's caption.
    fake_terms = pd.DataFrame(_rank_terms(documents[labels == 0], n),
                              columns=['text', 'count'])
    true_terms = pd.DataFrame(_rank_terms(documents[labels == 1], n),
                              columns=['text', 'count'])

    # Captions follow the requested n instead of always claiming "top 10".
    scope = f'top {n} ' if n is not None else ''

    fig, (ax_fake, ax_true) = plt.subplots(1, 2, figsize=(20, 8))
    for ax, terms, kind in ((ax_fake, fake_terms, 'fake'),
                            (ax_true, true_terms, 'true')):
        # A bar chart over string categories labels its own ticks, so no
        # separate set_xticklabels call is needed.
        ax.bar(terms['text'], terms['count'])
        ax.set(xlabel=f'{scope}most frequent terms for {kind} news', ylabel='count')
    plt.suptitle(f'Comparision between the {scope}most frequent terms (fake/true)')

    fig.savefig(f'most_freq_{name}.png')


In [20]:
# Work on an independent copy so `df` keeps the raw text, then clean both
# free-text columns with operate_on_word.
df2 = df.copy(deep=True)

for text_column in ('text', 'title'):
    df2[text_column] = df2[text_column].apply(operate_on_word)

# Fake-vs-true term comparison for the cleaned article bodies.
get_top_n_words(corpus=df2['text'], name='text', n=10)

In [21]:
# Same fake-vs-true term comparison for the cleaned titles (saved as most_freq_title.png).
get_top_n_words(corpus=df2['title'], name='title', n=10)

#### Based on the comparison between the top 10 most frequent words in the titles and in the news text, we can infer that both fake and true news are dominated by politics; more specifically, a heavy focus on American politics is shared by true and fake news. This would bias the model towards classifying only news about American politics, and probably only news from that time frame. To mitigate this bias, more recent and more diverse news data would be needed.

In [22]:
X = df['text']
y = df['genuine']

# Hold out 5% as a class-stratified test set, then carve 20% of the rest off
# for validation. random_state=0 matches the df2 split used for the neural
# model, so both model families are scored on the same rows and the split
# survives a kernel restart.
X_train_all, X_test, y_train_all, y_test = train_test_split(
    X, y, test_size=0.05, stratify=y, random_state=0)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_all, y_train_all, test_size=0.2, random_state=0)

print(X_train.shape[0], X_valid.shape[0], X_test.shape[0])

In [23]:
def transform_word(texts):
    """Clean a batch of documents for the scikit-learn pipelines.

    Applies ``operate_on_word`` to every element instead of carrying a second
    copy of the same regex chain, so the EDA cleaning and the pipeline cleaning
    cannot drift apart.

    Args:
        texts: iterable of raw strings (e.g. a pandas Series of articles).

    Returns:
        numpy array holding the cleaned strings in the same order.
    """
    return np.array([operate_on_word(text) for text in texts])

In [24]:
def load_classifier(clf, X_train, X_valid, y_train, y_valid):
    """Fit a clean -> TF-IDF -> ``clf`` pipeline and score it on ``X_valid``.

    Args:
        clf: unfitted scikit-learn classifier exposing ``predict_proba``.
        X_train: raw training texts.
        X_valid: raw texts to predict on.
        y_train: training labels.
        y_valid: labels for ``X_valid``; accepted for symmetry but not used here.

    Returns:
        Tuple ``(fitted_pipeline, predicted_labels, predicted_probabilities)``.
    """
    text_cleaner = FunctionTransformer(transform_word)
    # Unigrams and bigrams, capped at the 5000 highest-ranked features.
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

    pipeline = make_pipeline(text_cleaner, tfidf, clf)
    pipeline.fit(X_train, y_train)

    predicted_labels = pipeline.predict(X_valid)
    predicted_probas = pipeline.predict_proba(X_valid)
    return pipeline, predicted_labels, predicted_probas

In [25]:
# Candidate classifiers, all with library-default hyper-parameters.
classifiers = [MultinomialNB(), LogisticRegression(),
               RandomForestClassifier(), GradientBoostingClassifier()]

# Fit each candidate on the full training portion and predict the held-out test set.
fitted_results = [
    load_classifier(candidate, X_train_all, X_test, y_train_all, y_test)
    for candidate in classifiers
]

model_list = [fitted for fitted, _labels, _scores in fitted_results]
preds_list = [labels for _fitted, labels, _scores in fitted_results]
probas_list = [scores for _fitted, _labels, scores in fitted_results]
    


In [26]:
def plot_confusion_matrices(model_list, preds_list, y_valid):
    """Draw one confusion-matrix heatmap per fitted pipeline.

    The grid is two columns wide and grows with the number of models, so
    nothing is silently dropped when more than four are passed; panels left
    over on the last row are hidden.

    Args:
        model_list: fitted pipelines; the name of each last step is the panel title.
        preds_list: predicted labels, one array per entry of ``model_list``.
        y_valid: true labels the predictions are compared against
            (0 = fake, 1 = genuine).
    """
    n_models = min(len(model_list), len(preds_list))
    n_cols = 2
    n_rows = max(1, (n_models + n_cols - 1) // n_cols)

    # 4 inches per row reproduces the former (14, 8) canvas for four models.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(14, 4 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, y_pred, model in zip(axes, preds_list, model_list):
        # Fixing the label order guarantees a 2x2 matrix even if one class is
        # absent from both y_valid and y_pred.
        cm = confusion_matrix(y_valid, y_pred, labels=[0, 1])

        cm_matrix = pd.DataFrame(data=cm, columns=['Predict Fake', 'Predict True'],
                                 index=['Actual Fake', 'Actual True'])

        sns.heatmap(cm_matrix, annot=True, fmt='d', ax=ax, cmap='YlGnBu')
        ax.set_title(model.steps[-1][0])

    for unused_ax in axes[n_models:]:
        unused_ax.set_visible(False)

    fig.tight_layout()  # keep titles and tick labels of neighbouring panels apart
    plt.show()
        

In [27]:
# Confusion matrices of the four classical pipelines on the held-out test set.
plot_confusion_matrices(model_list, preds_list, y_test)

In [28]:
# Persist every fitted pipeline as "<last step name>_model.pkl".
# NOTE: only unpickle these files from a trusted location; loading a pickle
# executes arbitrary code.
model_file_list = []
for model in model_list:
    model_name = model.steps[-1][0]
    filename = f'{model_name}_model.pkl'
    # The context manager closes (and flushes) the file even if dump() raises.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    model_file_list.append(filename)

In [29]:
X2 = df2['text']
y2 = df2['genuine']

# Stratify on this frame's own labels (y2) rather than on `y`, which belongs
# to the raw-text split defined in a different cell.
X_train2_all, X_test2, y_train2_all, y_test2 = train_test_split(
    X2, y2, test_size=0.05, stratify=y2, random_state=0)

X_train2, X_valid2, y_train2, y_valid2 = train_test_split(
    X_train2_all, y_train2_all, test_size=0.2, random_state=0)

print(X_train2.shape[0], X_valid2.shape[0], X_test2.shape[0])

In [30]:

# Build the vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train2)

# Every document is padded/truncated to this many token ids.
max_seq = 400

X_train_tf = pad_sequences(tokenizer.texts_to_sequences(X_train2), maxlen=max_seq)
X_valid_tf = pad_sequences(tokenizer.texts_to_sequences(X_valid2), maxlen=max_seq)

# Abandoned alternative: a TextVectorization(max_tokens=10000) encoder adapted on X_train.
word_index = tokenizer.word_index

In [31]:
embedding_dim = 64
# One embedding row per known word, plus one extra row for index 0.
vocab_size = len(word_index) + 1

# Embedding -> Conv1D -> max-pool -> two stacked bidirectional LSTMs -> dense head.
layer_stack = [
    Input(shape=(max_seq,), dtype='int32'),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding"),
    Conv1D(32, kernel_size=3, padding='same'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # single probability output
]
model = Sequential(layer_stack)

In [32]:
# Show the Keras summary of the network defined above.
model.summary()

In [33]:
# Where the best weights are written, and the directory that holds them.
checkpoint_filepath = '/tmp/checkpoint'
checkpoint_dir = os.path.dirname(checkpoint_filepath)

keras_callbacks = tf.keras.callbacks

# Training curves go to ./logs for TensorBoard.
tensorboard_callback = keras_callbacks.TensorBoard(log_dir="logs")

# Stop once val_loss has not improved for 3 consecutive epochs.
early_stopping_monitor = keras_callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Keep only the weights of the epoch with the highest val_accuracy.
model_checkpoint_callback = keras_callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)


In [34]:
# Binary classification set-up: Adam optimiser, cross-entropy loss, accuracy metric.
compile_options = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.compile(**compile_options)


In [None]:
# Train for at most 10 epochs; early stopping may end the run sooner.
training_callbacks = [
    tensorboard_callback,
    early_stopping_monitor,
    model_checkpoint_callback,
]
validation_pair = (X_valid_tf, y_valid2)

history = model.fit(
    X_train_tf,
    y_train2,
    validation_data=validation_pair,
    epochs=10,
    callbacks=training_callbacks,
)

In [None]:
# List the directory that holds the checkpoint files (dirname of checkpoint_filepath).
os.listdir(checkpoint_dir)


In [None]:
# Restore the best weights written by the ModelCheckpoint callback. The path
# variable defined in this notebook is `checkpoint_filepath`.
model.load_weights(checkpoint_filepath)

# Re-evaluate the restored model on the held-out test split. The sequences are
# built here so the cell does not depend on a later cell having been run.
X_test_tf = pad_sequences(tokenizer.texts_to_sequences(X_test2), maxlen=max_seq)
loss, acc = model.evaluate(X_test_tf, y_test2, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

# Keep a second copy of the restored weights next to the notebook.
os.makedirs('./checkpoints', exist_ok=True)
model.save_weights('./checkpoints/my_checkpoint')

In [None]:
# Encode the test texts exactly like the training texts, then predict.
X_test_tf = pad_sequences(tokenizer.texts_to_sequences(X_test2), maxlen=max_seq)
pred_tf = model.predict(X_test_tf)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
pred_tf = (pred_tf.ravel() >= 0.5).astype(int)

# How many samples were assigned to each predicted class.
unique, counts = np.unique(pred_tf, return_counts=True)

print(np.column_stack((unique, counts)))

In [None]:
# Per-class precision/recall/F1 of the neural model on the test split.
class_rep_tf = classification_report(y_test2, pred_tf)
print(class_rep_tf)

# Confusion matrix with readable row/column captions (0 = fake, 1 = true).
cm_tf = confusion_matrix(y_test2, pred_tf)
cm_matrix_tf = pd.DataFrame(
    cm_tf,
    index=['Actual Fake', 'Actual True'],
    columns=['Predict Fake', 'Predict True'],
)

sns.heatmap(cm_matrix_tf, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
# Open TensorBoard inline on the "logs" directory the TensorBoard callback writes to.
%load_ext tensorboard
%tensorboard --logdir logs


In [None]:
# Learned embedding matrix of the layer named "embedding" (one row per token id).
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]



In [None]:
# Export the embeddings as two line-aligned TSV files (vectors.tsv,
# metadata.tsv), e.g. for the TensorFlow Embedding Projector.
#
# `word_index` maps word -> token id and ids start at 1 (row 0 is padding), so
# each word's own id selects its row. Enumerating the dict instead gives
# position = id - 1, which drops the first word and pairs every other word
# with its neighbour's vector.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for word, index in word_index.items():
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")


In [None]:
# Best-effort download of the exported embedding files when running in Google
# Colab. Elsewhere the import or the download fails; that is reported instead
# of being swallowed silently, and the notebook carries on.
try:
    from google.colab import files
    files.download('vectors.tsv')
    files.download('metadata.tsv')
except Exception as exc:
    print(f'Skipping Colab download: {exc!r}')
