<a href="https://colab.research.google.com/github/SumeetsRoorkee/ML_Code/blob/main/Fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# Download the "real-and-fake-news" dataset through kagglehub and keep the
# value it returns (presumably the local dataset directory, going by the
# variable name -- confirm against the kagglehub docs).
import kagglehub
razanaqvi14_real_and_fake_news_path = kagglehub.dataset_download('razanaqvi14/real-and-fake-news')

# Confirmation message once the download call has returned.
print('Data source import complete.')


In [None]:
import nltk
nltk.download('punkt')

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# import keras
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)

In [None]:
# Fetch the NLTK resources used by this notebook: the stop-word corpus and the
# tokenizer models behind nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases load the tokenizer models from
# 'punkt_tab' rather than 'punkt'; downloading both keeps word_tokenize
# working regardless of the installed version.
nltk.download('punkt_tab')

# Stored as a set: preprocess() tests every token for membership, which is
# O(1) on a set but a linear scan on the list NLTK returns. WordCloud accepts
# a set for its `stopwords` argument as well.
stop_words = set(stopwords.words('english'))


In [None]:
import os

# Read both CSVs from the directory kagglehub actually downloaded the dataset
# to, instead of assuming the Kaggle-only "/kaggle/input/..." mount point.
news_true = pd.read_csv(os.path.join(razanaqvi14_real_and_fake_news_path, "True.csv"))
news_fake = pd.read_csv(os.path.join(razanaqvi14_real_and_fake_news_path, "Fake.csv"))

In [None]:
# Preview the first rows of the real-news frame.
news_true.head()

In [None]:
# Preview the first rows of the fake-news frame.
news_fake.head()

In [None]:
# Label column: 0 marks an article from the real-news file.
news_true['isfake']=0
news_true.head()

In [None]:
# Label column: 1 marks an article from the fake-news file.
news_fake['isfake']=1
news_fake.head()

In [None]:
# Combine headline and body into a single text column, separated by a space.
news_true['original_text'] = news_true['title'] + " " + news_true['text']

In [None]:
news_true.head()

In [None]:
# Same headline + body combination for the fake-news frame.
news_fake['original_text'] = news_fake['title'] + " " + news_fake['text']

In [None]:
news_fake.head()

In [None]:
# (rows, columns) of the real-news frame.
news_true.shape


In [None]:
# (rows, columns) of the fake-news frame.
news_fake.shape

In [None]:
df = pd.concat([news_true, news_fake], ignore_index=True)
df

In [None]:
df_news = df.drop(['title','text','date', 'subject'], axis=1)

In [None]:
df_news.head()

In [None]:
def text_proper(text):
    """Normalize raw article text for tokenization.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw text. Non-string values (for example the float NaN that
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The normalized string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Strip punctuation first: removing it can leave two spaces side by side
    # ("a - b" -> "a  b"), so whitespace must be collapsed afterwards.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
df_news['text'] = df['original_text'].apply(text_proper)

In [None]:
df_news

In [None]:
def preprocess(text):
    """Tokenize ``text`` and return the tokens that survive filtering.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is kept
    only when it is longer than three characters and appears in neither
    gensim's ``STOPWORDS`` nor the module-level ``stop_words`` collection.
    """
    gensim_stops = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in gensim_stops and word not in stop_words
    ]

In [None]:
df_news['tokens'] = df_news['text'].apply(preprocess)

In [None]:
print(df_news['tokens'][0])

In [None]:
# Obtain the total words present in the dataset
vocab = []
for i in df_news['tokens']:
    for j in i:
        vocab.append(j)

In [None]:
vocab

In [None]:
total_words = len(list(set(vocab)))
total_words

In [None]:
# Re-join each token list into one space-separated string.
df_news['clean_joined'] = df_news['tokens'].map(" ".join)
df_news.head()

In [None]:
# Word cloud built from the cleaned text of every article labelled fake
# (isfake == 1).
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=2000,
    width=1600,
    height=800,
    stopwords=stop_words,
).generate(" ".join(df_news.loc[df_news['isfake'] == 1, 'clean_joined']))
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Word cloud built from the cleaned text of every article labelled real
# (isfake == 0).
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=2000,
    width=1600,
    height=800,
    stopwords=stop_words,
).generate(" ".join(df_news.loc[df_news['isfake'] == 0, 'clean_joined']))
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Length, in tokens, of the longest document; used below as the padded
# sequence length. It is taken directly from the token lists that
# 'clean_joined' was built from: re-tokenizing the joined text with NLTK is
# slow and can split a token further (e.g. "gonna" -> "gon", "na"), which
# would make the count disagree with the actual token lists.
# default=-1 keeps the previous result for an empty frame.
maxlen = max(map(len, df_news['tokens']), default=-1)
print(maxlen)

In [None]:
# NOTE(review): TfidfVectorizer is imported here but not used in this cell.
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a tokenizer to tokenize the words and create sequences of tokenized words
# NOTE(review): per the Keras docs, num_words keeps only the most frequent
# (num_words - 1) words -- confirm that dropping the rarest word is intended.
# The value must stay in sync with the Embedding layer's input size.
tokenizer = Tokenizer(num_words=total_words)
tokenizer.fit_on_texts(df_news['clean_joined'])
# One list of integer word indices per article.
sequences = tokenizer.texts_to_sequences(df_news['clean_joined'])


# Bring every sequence to the same length (maxlen) so they form one 2-D array.
X = pad_sequences(sequences, maxlen=maxlen)
# Labels are read from df; df_news was derived from df by dropping columns, so
# the two frames share the same row order.
y = df['isfake'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles for evaluation.
# random_state pins the shuffle so reported metrics are reproducible between
# runs; stratify=y keeps the real/fake ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Binary classifier assembled as a single Sequential stack:
#   1. Embedding layer mapping word indices to 64-dimensional vectors
#      (an output_dim of 240 was also tried earlier).
#   2. Bidirectional LSTM with 128 units per direction.
#   3. Dense ReLU layer followed by a single sigmoid output unit.
model = Sequential([
    Embedding(total_words, output_dim=64),
    Bidirectional(LSTM(128)),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Adam optimizer with binary cross-entropy; accuracy is tracked as 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

In [None]:
# train the model
# Two epochs, batches of 64.
# NOTE(review): X_test/y_test serve as validation data here and are also the
# set the final accuracy is computed on below -- consider a separate
# validation split if the validation metrics ever drive model selection.
model.fit(X_train, y_train, epochs=2, batch_size=64, validation_data=(X_test, y_test))

In [None]:
# Model outputs for the held-out articles (thresholded in the next cell).
pred = model.predict(X_test)

In [None]:
# Threshold the sigmoid outputs: a value above 0.5 is classified as fake
# (label 1, matching isfake == 1), anything else as real (label 0).
# ravel() flattens the per-sample outputs to one value per article, and
# tolist() yields plain Python ints as before.
prediction = (pred.ravel() > 0.5).astype(int).tolist()

In [None]:
# Fraction of held-out articles whose thresholded prediction equals the label.
from sklearn.metrics import accuracy_score

true_labels = list(y_test)
accuracy = accuracy_score(true_labels, prediction)

print("Model Accuracy : ", accuracy)

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out set
# (rows are true labels, columns are predicted labels; 0 = real, 1 = fake).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(list(y_test), prediction)
plt.figure(figsize = (25, 25))
# fmt='d' prints the cell counts as whole numbers; the default annotation
# format would show large counts in scientific notation.
ax = sns.heatmap(cm, annot = True, fmt = 'd')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')