In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D,LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Loading Datasets for Training

In [2]:
df_true = pd.read_csv("datasets/FakeNews/True.csv")
df_fake = pd.read_csv("datasets/FakeNews/Fake.csv")
df_combined = pd.read_csv("datasets/FakeNews/Combined.csv")
df_api =pd.read_csv("datasets/FakeNews/ApiNews.csv")

In [3]:
#Set the labels as 0 and 1 to fake and real news datasets
df_fake["label"] = 0
df_true["label"] = 1

In [4]:
df_true.head(5)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [5]:
df_fake.head(5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [6]:
df_combined.head(5)

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# Merge the title and text of the api news beacuse it has small texts
df_api['text']=df_api['title'].fillna('') +" " + df_api['text'].fillna('')
df_api.head(5)

Unnamed: 0,title,text,subject,date,label
0,"Averse to certain foods? Beware, you could be ...","Averse to certain foods? Beware, you could be ...",general,2023-02-12,1
1,Interplanetary space station “Luna-25” is read...,Interplanetary space station “Luna-25” is read...,general,2023-02-12,1
2,Bigg Boss 16 Winner: MC Stan lifts the trophy ...,Bigg Boss 16 Winner: MC Stan lifts the trophy ...,general,2023-02-12,1
3,"US Shuts Airspace Over Lake Michigan, Cites ""N...","US Shuts Airspace Over Lake Michigan, Cites ""N...",general,2023-02-12,1
4,Google search chief warns against `hallucinati...,Google search chief warns against `hallucinati...,general,2023-02-12,1


In [8]:
df_true.shape, df_fake.shape, df_combined.shape, df_api.shape

((21417, 5), (23481, 5), (20133, 3), (8009, 5))

#### Merging All the Datasets

In [9]:
df_marge = pd.concat([df_true, df_fake, df_combined,df_api], axis =0 )
df_marge.shape

(73040, 5)

In [10]:
# dropping unwanted columns
df = df_marge.drop(["title", "subject","date"], axis = 1)
df.columns

Index(['text', 'label'], dtype='object')

In [11]:
df.shape

(73040, 2)

### Data Preprocessing

##### Drop duplicates row

In [12]:
df.duplicated().sum()

6528

In [13]:
df.drop_duplicates(inplace=True)
df.shape

(66512, 2)

##### Drop rows with null values

In [14]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [15]:
df = df.dropna(axis=0)
df.shape

(66512, 2)

#### Final Datasets is df

In [16]:
df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [17]:
# Randomly shuffling the dataframe 
df = df.sample(frac = 1)
df.head()

Unnamed: 0,text,label
4223,BEIJING (Reuters) - Chinese Foreign Minister W...,1
5324,"CNN s supposed law enforcement expert, Harry H...",0
13381,SAN FRANCISCO (AP) — Apple penalized CEO Ti...,0
14575,Islamic terrorism is the #1 issue with most Am...,0
5986,"The cops should leave this woman, and all of h...",0


In [18]:
# Removing index column which was generated after reshuffling the dataframe
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)
df.columns

Index(['text', 'label'], dtype='object')

In [19]:
df.head()

Unnamed: 0,text,label
0,BEIJING (Reuters) - Chinese Foreign Minister W...,1
1,"CNN s supposed law enforcement expert, Harry H...",0
2,SAN FRANCISCO (AP) — Apple penalized CEO Ti...,0
3,Islamic terrorism is the #1 issue with most Am...,0
4,"The cops should leave this woman, and all of h...",0


#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [20]:
import re
import string
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text
df["text"] = df["text"].apply(wordopt)


#### Lemmatization
##### Lemmatization is the process of reducing words to their base or root form, which can help to group together words with similar meanings and reduce the number of unique words in a dataset. 

In [21]:
import nltk
from nltk.stem import WordNetLemmatizer
# Download necessary resources for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()
# Define a function to lemmatize a list of words
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    return ' '.join(lemmatized_words)
# Apply the lemmatization function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lemmatize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Defining dependent and independent variable as x and y
X = df["text"]
Y = df["label"]

In [23]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

### Feature Extraction

#### Tokenization
##### It is the process of dividing a text into smaller units (each word will be an index in an array)

In [24]:
# defining tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
# Converting text to sequence
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
max_len = 500
# padding
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)
# Exporting Tokenizer
import joblib
joblib.dump(tokenizer,"models/fakeNews/tokenizer")

['models/fakeNews/tokenizer']

### Convolutional Neural Networks (CNNs)
#### CNNs are commonly used for text classification tasks such as fake news detection. They can learn to detect patterns and features in the text by using convolutional layers and pooling layers.

In [25]:
# Define the Model
CNN = Sequential()
CNN.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=max_len))
CNN.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
CNN.add(GlobalMaxPooling1D())
CNN.add(Dense(units=64, activation='relu'))
CNN.add(Dropout(rate=0.2))
CNN.add(Dense(units=1, activation='sigmoid'))
# Compile the model
CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
# Train the model
CNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1df870a9b48>

In [27]:
# Print Accuracy and Confusion Matrix
y_pred = CNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
CNN.save('models/fakeNews/CNN.h5')

Accuracy: 95.23%
Confusion Matrix:  [[5186  322]
 [ 313 7482]]


### Recurrent Neural Networks (RNNs)
#### RNNs are another popular choice for text classification tasks. They can process sequential data by using feedback loops, allowing them to capture the context and meaning of the text.

In [28]:
# Define the model
RNN = Sequential()
RNN.add(Embedding(5000, 128, input_length=X_train.shape[1]))
RNN.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
RNN.add(Dense(1, activation='sigmoid'))
# Compile the model
RNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
# Train the model
RNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1df86e60088>

In [30]:
# Print Accuracy and Confusion Matrix
y_pred = RNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
RNN.save('models/fakeNews/RNN.h5')

Accuracy: 94.44%
Confusion Matrix:  [[5259  249]
 [ 491 7304]]


In [31]:
def manual_testing(news):
    new_article=news
    new_article = wordopt(new_article)
    new_article = lemmatize_text(new_article)
    new_article = tokenizer.texts_to_sequences([new_article])
    padded = pad_sequences(new_article, padding='post', maxlen=500)
    pred_CNN = CNN.predict(padded)
    pred_RNN = RNN.predict(padded)
    return print("\n\nCNN Prediction: {} \nRNN Prediction: {}".format(pred_CNN,pred_RNN))

In [32]:
news = str(input())
manual_testing(news)



CNN Prediction: [[0.7412483]] 
RNN Prediction: [[0.9241299]]
