In [0]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout,Embedding
from keras.layers.recurrent import LSTM
from keras.callbacks import EarlyStopping

In [0]:
# Mount Google Drive into this Colab runtime; the dataset CSVs are read from
# "/content/drive/My Drive/..." in the cells below.
from google.colab import drive
drive.mount('/content/drive/')


In [0]:
# List the contents of the dataset folder on the mounted drive.
!ls "/content/drive/My Drive/fake news detection/"

`df`, `df1`, `df2` and `df3` are the DataFrames that contain the data for fake news prediction.

In [0]:
# All four source datasets are read from the same Drive folder.
DATA_DIR = "/content/drive/My Drive/fake news detection"

df = pd.read_csv(f"{DATA_DIR}/data.csv")
df1 = pd.read_csv(f"{DATA_DIR}/fake.csv")
df2 = pd.read_csv(f"{DATA_DIR}/fake_or_real_news.csv")
df3 = pd.read_csv(f"{DATA_DIR}/train.csv")

Bring all the DataFrames into a common format so that they all have the same columns: keep the label and the title and text of the news (joined into a single `matter` column), and remove every other column.

In [0]:
# Normalise data.csv to the schema shared by all four frames:
# a string `Label` ('REAL' / 'FAKE') plus a single free-text `matter` column.
df.drop(columns=['URLs'], inplace=True)

# NOTE(review): this assumes 0 means real and 1 means fake in data.csv -- confirm
# against the dataset's own documentation.
# `replace` recodes both values in one pass and avoids assigning strings through
# `.loc` into what is presumably a numeric column; other values are left untouched.
df['Label'] = df['Label'].replace({0: 'REAL', 1: 'FAKE'})

# Join headline and body. Missing values become empty strings (instead of the
# literal text "nan" or a NaN result), and the space keeps the last headline
# word from fusing with the first body word.
df["matter"] = (
    df["Headline"].fillna("").astype(str) + " " + df["Body"].fillna("").astype(str)
).str.strip()
df.drop(columns=["Headline", "Body"], inplace=True)

In [0]:
# Keep only the rows that fake.csv marks as 'fake'. `.copy()` detaches the subset
# from the frame it was sliced from, so the in-place edits below act on an
# independent frame instead of triggering SettingWithCopyWarning.
df1 = df1.loc[df1['type'] == 'fake'].copy()
# Every remaining row has type 'fake', so the whole column can be recoded at once.
df1['type'] = 'FAKE'

df1.drop(columns=['uuid', 'ord_in_thread', 'author', 'published',
                  'language', 'crawled', 'site_url', 'country', 'domain_rank',
                  'thread_title', 'spam_score', 'main_img_url', 'replies_count',
                  'participants_count', 'likes', 'comments', 'shares'], inplace=True)

df1 = df1.rename(columns={'type': 'Label'})

# Join title and text. Missing values become empty strings (instead of the literal
# text "nan" or a NaN result), and the space keeps the last title word from fusing
# with the first word of the text.
df1["matter"] = (
    df1["title"].fillna("").astype(str) + " " + df1["text"].fillna("").astype(str)
).str.strip()
df1.drop(columns=["title", "text"], inplace=True)
df1.head(2)

In [0]:
# Normalise fake_or_real_news.csv: drop the leftover index column and rename the
# label column to the shared `Label` name.
df2.drop(columns=["Unnamed: 0"], inplace=True)

df2 = df2.rename(columns={"label": "Label"})

# Join title and text. Missing values become empty strings (instead of the literal
# text "nan" or a NaN result), and the space keeps the last title word from fusing
# with the first word of the text.
df2["matter"] = (
    df2["title"].fillna("").astype(str) + " " + df2["text"].fillna("").astype(str)
).str.strip()
df2.drop(columns=["title", "text"], inplace=True)
df2.head(2)

In [0]:
# Normalise train.csv: drop the columns that are not used and rename the label
# column to the shared `Label` name.
df3.drop(columns=["id", "author"], inplace=True)

df3 = df3.rename(columns={"label": "Label"})

# Join title and text. Missing values become empty strings (instead of the literal
# text "nan" or a NaN result), and the space keeps the last title word from fusing
# with the first word of the text.
df3["matter"] = (
    df3["title"].fillna("").astype(str) + " " + df3["text"].fillna("").astype(str)
).str.strip()
df3.drop(columns=["title", "text"], inplace=True)

# Recode 0 -> 'REAL' and 1 -> 'FAKE' in one pass. `replace` avoids assigning strings
# through `.loc` into what is presumably a numeric column; other values are left
# untouched.
df3['Label'] = df3['Label'].replace({0: 'REAL', 1: 'FAKE'})

In [0]:
# Stack the four normalised frames into one corpus; `ignore_index=True` gives the
# result a fresh 0..n-1 index.
source_frames = [df, df1, df2, df3]
dff = pd.concat(source_frames, ignore_index=True)
dff.head(2)

In [0]:
# Drop rows that are exact duplicates across all columns (pandas keeps the first
# occurrence by default). The surviving rows keep their original index labels.
dff = dff.drop_duplicates()

Some text preprocessing

In [0]:
from pandas.api.types import is_string_dtype
# Sanity check: does the `matter` column have a string-like dtype?
is_string_dtype(dff["matter"])

Lowercasing the text and removing punctuation from the data content (`matter` column)

In [0]:
# Lowercase every article in place, then display the first rows as a spot check.
dff["matter"] = dff["matter"].str.lower()
dff.head()

In [0]:
import string
punctuation_to_be_removed = string.punctuation
# Deletion table built once instead of on every call, because the function is
# applied to each row of the corpus.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation_to_be_removed)


def remove_punctuation(text):
  """Return `text` with every character of `punctuation_to_be_removed` deleted.

  Only the characters listed in `string.punctuation` are stripped; every other
  character passes through unchanged.
  """
  return text.translate(_PUNCTUATION_TABLE)
# Missing articles are replaced by "" first: `astype(str)` alone would turn NaN into
# the literal word "nan", which would then be processed like real text.
dff["matter_without_punctuation"] = dff["matter"].fillna("").astype(str).apply(remove_punctuation)
dff.head()

In [0]:
def remove_digits(text):
  """Return `text` with every character for which `str.isdigit()` is true removed."""
  kept_chars = [ch for ch in text if not ch.isdigit()]
  return ''.join(kept_chars)
# Strip digits from the punctuation-free text, one article at a time.
digit_free_text = dff["matter_without_punctuation"].apply(remove_digits)
dff["matter_without_digits"] = digit_free_text
dff.head()

Removing stopwords from the data content

In [0]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
# A set (rather than the list NLTK returns) so the membership test in
# remove_stopwords is constant-time on average.
# NOTE(review): punctuation was already stripped from the text in an earlier cell, so
# stop words that contain an apostrophe (e.g. "don't") can no longer match their
# "dont" form -- consider removing stop words before punctuation.
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
  """Drop every whitespace-separated token of `text` that appears in `stop_words`.

  The surviving tokens are re-joined with single spaces.
  """
  kept_tokens = []
  for token in str(text).split():
    if token not in stop_words:
      kept_tokens.append(token)
  return " ".join(kept_tokens)
dff["matter_without_stop_words"] = dff["matter_without_digits"].apply(remove_stopwords)
dff.head(2)

Removing frequent and rare words from the data content

In [0]:
import collections
from collections import Counter
# Corpus-wide frequency of every token that survived stop-word removal.
count = Counter()
for document in dff["matter_without_stop_words"].values:
    count.update(document.split())
# Not the last expression of the cell, so this value is computed but not displayed.
count.most_common(5)

#removing 7 most frequent words

# The 7 highest-count tokens in the corpus.
frequent_words = {token for token, _ in count.most_common(7)}


def remove_frequentwords(text):
    """Return `text` without the tokens listed in `frequent_words`, re-joined by single spaces."""
    kept = filter(lambda token: token not in frequent_words, str(text).split())
    return " ".join(kept)


dff["matter1"] = dff["matter_without_stop_words"].apply(remove_frequentwords)

#removing 7 most rare words

no_of_rare_words = 7
# `most_common()` with no argument returns every word ordered from most to least
# frequent, so the reversed tail slice yields the `no_of_rare_words` least frequent
# words. (Limiting the call to the top 10 would pick frequent words instead.)
rare_words = {word for word, _ in count.most_common()[:-no_of_rare_words - 1:-1]}


def remove_rarewords(text):
    """Return `text` without the tokens listed in `rare_words`, re-joined by single spaces."""
    return " ".join(word for word in str(text).split() if word not in rare_words)


dff["matter2"] = dff["matter1"].apply(remove_rarewords)
dff.head(2)

In [0]:
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)


dff["final_news_data"] = dff["matter2"].apply(lemmatize_words)

In [0]:
# Inspect the frame while it still carries every intermediate preprocessing column.
dff

In [0]:
# Discard the raw and intermediate text columns now that `final_news_data` has been
# derived from them.
intermediate_columns = [
    "matter",
    "matter_without_punctuation",
    "matter_without_digits",
    "matter_without_stop_words",
    "matter1",
    "matter2",
]
dff.drop(columns=intermediate_columns, inplace=True)

In [0]:
# Inspect the frame after the intermediate columns were dropped.
dff

In [0]:
# Sizes shared by the tokenizer, the padding step and the embedding layer.
maximum_nb_words = 500
maximum_sequence_length= 60
embedding_dimension = 128

# `num_words` limits the indices produced by `texts_to_sequences` to the most
# frequent words; `word_index` itself still lists every token seen while fitting.
tokenizer = Tokenizer(
    num_words=maximum_nb_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
corpus_texts = dff["final_news_data"].values
tokenizer.fit_on_texts(corpus_texts)
word_index = tokenizer.word_index
print('no of unique token %s' % len(word_index))

In [0]:
# Encode each article as a list of word indices, then pad/truncate every list to
# `maximum_sequence_length` entries so the result is a rectangular array.
token_sequences = tokenizer.texts_to_sequences(dff["final_news_data"].values)
X = tf.keras.preprocessing.sequence.pad_sequences(
    token_sequences,
    maxlen=maximum_sequence_length,
)
print('Shape of X is:', X.shape)

In [0]:
# Peek at one encoded, padded article (row 3 of X).
X[3]

In [0]:
# Recode the string labels as integers in place: REAL -> 0, FAKE -> 1.
for label_name, label_code in (("REAL", 0), ("FAKE", 1)):
    dff.loc[dff['Label'] == label_name, 'Label'] = label_code

In [0]:
# `Label` started out as a column of strings, so after the 0/1 recode it typically still
# has object dtype. Convert it to a numeric dtype so Keras can turn the targets into a
# tensor. `to_numeric` raises if any label was not recoded to a number, which surfaces
# unexpected label values here rather than inside `model.fit`.
Y = pd.to_numeric(dff["Label"])

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing. The fixed `random_state` makes the split
# itself reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [0]:
# Disabled experiment: PCA on the padded sequences. The code is wrapped in a string
# literal, so none of it runs; the cell merely evaluates to that string.
"""from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X_train_new = pca.fit_transform(X_train)
X_test_new = pca.transform(X_test)"""

In [0]:
# Shape of the training split produced by train_test_split.
X_train.shape

In [0]:
# Display the training matrix of padded word-index sequences.
X_train

In [0]:
# Number of columns of X, i.e. the `maxlen` that was passed to pad_sequences.
X.shape[1]

Using a neural network for classification

In [0]:
# Import the layers from the public `keras.layers` namespace: the per-layer submodules
# (`keras.layers.convolutional`, `keras.layers.embeddings`) are not a stable import
# path across Keras releases.
from keras.layers import Conv1D, Embedding, MaxPooling1D

model = Sequential()
# Word indices -> dense vectors of size `embedding_dimension`.
model.add(Embedding(maximum_nb_words,embedding_dimension, input_length=X_train.shape[1]))
model.add(Dropout(0.6))
# Convolution over windows of 5 consecutive positions.
model.add(Conv1D(filters=128, kernel_size=5, padding='valid', activation='sigmoid',strides = 1))
# Dense on a 3-D input acts on the last axis, i.e. independently at each position.
model.add(Dense(256,activation = "sigmoid"))
model.add(MaxPooling1D(pool_size=4))
model.add(Dropout(0.6))
# The LSTM reduces the pooled sequence to a single 100-dimensional vector.
model.add(LSTM(100))
# One sigmoid unit: a single probability for the binary label.
model.add(Dense(1, activation='sigmoid'))

# `epochs` is also read by the plotting cell further down.
epochs = 10
batch_size = 500
model.compile(optimizer = "adam",loss = "binary_crossentropy",metrics = ["accuracy"])
# 30% of the training split is held back by Keras as validation data.
history = model.fit(X_train,Y_train, epochs = epochs,
                    validation_split = 0.3,batch_size = batch_size,verbose = 1)

In [0]:
def plot_learning_curves(history,epochs):
  """Plot training vs. validation accuracy, then training vs. validation loss.

  Args:
    history: object returned by `model.fit`; only its `.history` dict is read.
    epochs: number of epochs to show. If fewer epochs were recorded, only the
      recorded ones are plotted.
  """
  logs = history.history
  # Depending on the Keras version, the accuracy metric is recorded under
  # "acc" or "accuracy" (and "val_acc" / "val_accuracy").
  acc_key = "acc" if "acc" in logs else "accuracy"
  # Keep x and y the same length even if training recorded fewer epochs than requested.
  n_points = min(epochs, len(logs["loss"]))
  epoch_range = range(1, n_points + 1)

  panels = (
      (acc_key, "val_" + acc_key, "ACCURACY(model)", "accuracy"),
      ("loss", "val_loss", "LOSS(model)", "loss"),
  )
  for train_key, val_key, title, ylabel in panels:
    plt.plot(epoch_range, logs[train_key][:n_points])
    plt.plot(epoch_range, logs[val_key][:n_points])
    plt.title(title)
    plt.xlabel("epochs")
    plt.ylabel(ylabel)
    plt.legend(["train","val"],loc = "upper left")
    plt.show()

In [0]:
# Draw the accuracy and loss curves for the training run above.
plot_learning_curves(history,epochs)

In [0]:
# Evaluate on the held-out test split. With the model compiled with
# metrics=["accuracy"], evaluate returns the loss followed by the accuracy.
test_loss,test_acc = model.evaluate(X_test,Y_test)