# Fake news Detection

### Importing required library
Here I am importing some of the required library, if extra library is required to install It will be install later on.

In [100]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D,LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Read all the available datasets

In [77]:
df_true = pd.read_csv("datasets/FakeNews/True.csv")
df_fake = pd.read_csv("datasets/FakeNews/Fake.csv")
df_api =pd.read_csv("datasets/FakeNews/ApiNews.csv")

Inserting a column called "label" for fake and real news dataset to categories fake and true news. 

In [78]:
df_fake["label"] = 0
df_true["label"] = 1

In [79]:
df_true.head(5)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [80]:
df_fake.head(5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [81]:
df_api.head(5)

Unnamed: 0,title,text,subject,date,label
0,Bigg Boss 16 Winner: MC Stan lifts the trophy ...,The Bigg Boss 16 Finale turned out to be a gra...,general,2023-02-12,1
1,"US Shuts Airspace Over Lake Michigan, Cites ""N...",The airspace over Lake Michigan has been tempo...,general,2023-02-12,1
2,Google search chief warns against `hallucinati...,Google search chief warns against hallucinatin...,general,2023-02-12,1
3,Ayodhya to triple talaq verdict: All about ex-...,Former Supreme Court judge S Abdul Nazeer was ...,general,2023-02-12,1
4,"""Was Made An Offer In Jail Which..."": Maharash...",Nationalist Congress Party leader and former M...,general,2023-02-12,1


In [82]:
df_true.shape, df_fake.shape, df_api.shape

((21417, 5), (23481, 5), (7379, 5))

#### Merging All the Datasets

In [83]:
df_merge = pd.concat([df_true, df_fake, df_api], axis =0 )

In [84]:
print(df_merge.columns)
df_merge.shape

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


(52277, 5)

### Data Preprocessing

##### Drop rows with null values

In [85]:
df_merge['text'].isna().sum()

0

In [86]:
df_merge = df_merge.dropna(subset=['text'],axis=0)
df_merge.shape

(52277, 5)

##### Drop duplicates row

In [87]:
df_merge.duplicated(['text']).sum()

6252

In [88]:
df_merge.drop_duplicates(['text'],inplace=True)
df_merge.shape

(46025, 5)

#### Randomly shuffling the dataframe 

In [89]:
df_merge = df_merge.sample(frac = 1)
df_merge.head()

Unnamed: 0,title,text,subject,date,label
10640,AUSTRIA’S NOT PLAYING GAMES: Bans Face-Conceal...,Will Austria s new mandates help to save their...,politics,"Jun 12, 2017",0
4025,Trump’s Problems With Women Get Worse As Anot...,Donald Trump keeps saying that all the women w...,News,"October 27, 2016",0
6894,California prepares to fight Trump on immigrat...,"SACRAMENTO, Calif. (Reuters) - Lawmakers in De...",politicsNews,"December 6, 2016",1
19031,Iraqi soldiers join Turkish exercises near sha...,"HABUR, Turkey (Reuters) - Iraqi soldiers joine...",worldnews,"September 26, 2017",1
18450,U.S. calls on Russia to release Crimean dissid...,WASHINGTON (Reuters) - The United States was ...,worldnews,"October 3, 2017",1


In [90]:
df_merge.reset_index(inplace = True)
df_merge.drop(["index"], axis = 1, inplace = True)
df_merge.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [91]:
df_merge.head()

Unnamed: 0,title,text,subject,date,label
0,AUSTRIA’S NOT PLAYING GAMES: Bans Face-Conceal...,Will Austria s new mandates help to save their...,politics,"Jun 12, 2017",0
1,Trump’s Problems With Women Get Worse As Anot...,Donald Trump keeps saying that all the women w...,News,"October 27, 2016",0
2,California prepares to fight Trump on immigrat...,"SACRAMENTO, Calif. (Reuters) - Lawmakers in De...",politicsNews,"December 6, 2016",1
3,Iraqi soldiers join Turkish exercises near sha...,"HABUR, Turkey (Reuters) - Iraqi soldiers joine...",worldnews,"September 26, 2017",1
4,U.S. calls on Russia to release Crimean dissid...,WASHINGTON (Reuters) - The United States was ...,worldnews,"October 3, 2017",1


#### Exporting 2000 news from the dataframe for manual testing
##### These news will not be used in training the model it will be dropped from the dataframe on exported

In [92]:
#Adding the first 2000 data to the testData.json file for manual testing
testData = df_merge.head(2000)
#Removing the first 5000 data from the datasets
df_merge = df_merge.iloc[2000:,]
#Exporting data as a json file
testData.reset_index(drop=True,inplace=True)
testData.sort_index(inplace=True)
testData.to_json("client/src/api/testData.json",orient ='records')
print("testData.json exported to client/src/api folder")
print(df_merge.shape)

testData.json exported to client/src/api folder
(44025, 5)


#### "title",  "subject" and "date" columns is not required for detecting the fake news, so I am going to drop the columns.
## Final Dataset is: df

In [93]:

df = df_merge.drop(["title", "subject","date"], axis = 1)
df.columns

Index(['text', 'label'], dtype='object')

In [94]:
df.head()

Unnamed: 0,text,label
2000,"WASHINGTON (Reuters) - As a candidate, U.S. Pr...",1
2001,<ol><li>'This is too much': B.C. mom records p...,1
2002,(Reuters) - The Senate Homeland Security Commi...,1
2003,"Meanwhile back in the good ole USA, liberals ...",0
2004,MOSCOW (Reuters) - Russia s FSB security servi...,1


#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [95]:
import re
import string
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text
df["text"] = df["text"].apply(wordopt)


#### Lemmatization
##### Lemmatization is the process of reducing words to their base or root form, which can help to group together words with similar meanings and reduce the number of unique words in a dataset. 

In [96]:
import nltk
from nltk.stem import WordNetLemmatizer
# Download necessary resources for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()
# Define a function to lemmatize a list of words
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    return ' '.join(lemmatized_words)
# Apply the lemmatization function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lemmatize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [97]:
# Defining dependent and independent variable as x and y
X = df["text"]
Y = df["label"]

In [98]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

### Feature Extraction

#### Tokenization
##### It is the process of dividing a text into smaller units (each word will be an index in an array)

In [101]:
# defining tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
# Converting text to sequence
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
max_len = 500
# padding
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)
# Exporting Tokenizer
import joblib
joblib.dump(tokenizer,"models/fakeNews/tokenizer")

['models/fakeNews/tokenizer']

### Convolutional Neural Networks (CNNs)
#### CNNs are commonly used for text classification tasks such as fake news detection. They can learn to detect patterns and features in the text by using convolutional layers and pooling layers.

In [102]:
# Define the Model
CNN = Sequential()
CNN.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=max_len))
CNN.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
CNN.add(GlobalMaxPooling1D())
CNN.add(Dense(units=64, activation='relu'))
CNN.add(Dropout(rate=0.2))
CNN.add(Dense(units=1, activation='sigmoid'))
# Compile the model
CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [103]:
# Train the model
CNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2781232f088>

In [104]:
# Print Accuracy and Confusion Matrix
y_pred = CNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
CNN.save('models/fakeNews/CNN.h5')

Accuracy: 98.83%
Confusion Matrix:  [[3303   61]
 [  42 5399]]


### Recurrent Neural Networks (RNNs)
#### RNNs are another popular choice for text classification tasks. They can process sequential data by using feedback loops, allowing them to capture the context and meaning of the text.

In [105]:
# Define the model
RNN = Sequential()
RNN.add(Embedding(5000, 128, input_length=X_train.shape[1]))
RNN.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
RNN.add(Dense(1, activation='sigmoid'))
# Compile the model
RNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [106]:
# Train the model
RNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x278123c1648>

In [107]:
# Print Accuracy and Confusion Matrix
y_pred = RNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
RNN.save('models/fakeNews/RNN.h5')

Accuracy: 89.72%
Confusion Matrix:  [[2851  513]
 [ 392 5049]]


In [108]:
def manual_testing(news):
    new_article=news
    new_article = wordopt(new_article)
    new_article = lemmatize_text(new_article)
    new_article = tokenizer.texts_to_sequences([new_article])
    padded = pad_sequences(new_article, padding='post', maxlen=500)
    pred_CNN = CNN.predict(padded)
    pred_RNN = RNN.predict(padded)
    return print("\n\nCNN Prediction: {} \nRNN Prediction: {}".format(pred_CNN,pred_RNN))

In [109]:
news = str(input())
manual_testing(news)



CNN Prediction: [[0.60015076]] 
RNN Prediction: [[0.8758223]]
