In [1]:
import pandas as pd
import numpy as np

In [2]:
fake_news_dataset = pd.read_csv('/kaggle/input/fake-news-detection/fake.csv')
true_news_dataset = pd.read_csv('/kaggle/input/fake-news-detection/true.csv')

In [3]:
fake_news_dataset.info(),true_news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


(None, None)

We have a well balanced dataset 

In [4]:
fake_news_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [5]:
true_news_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [6]:
fake_news_dataset.head(3)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


In [7]:
fake_news_dataset['subject'].value_counts()

subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: count, dtype: int64

In [8]:
true_news_dataset['subject'].value_counts()

subject
politicsNews    11272
worldnews       10145
Name: count, dtype: int64

Since the date column won't help us much and there is great imbalance among the subject in fake news , we will be dropping the subject and date column

In [9]:
fake_news_dataset.drop(['subject','date'],axis=1,inplace=True)
true_news_dataset.drop(['subject','date'],axis=1,inplace=True)

In [10]:
fake_news_dataset['value'] = 0
true_news_dataset['value'] = 1

In [11]:
news_dataset = pd.concat([fake_news_dataset,true_news_dataset ], ignore_index=True)

In [12]:
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   44898 non-null  object
 1   text    44898 non-null  object
 2   value   44898 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [13]:
news_dataset.sample(5)

Unnamed: 0,title,text,value
5627,Karma In All Its Glory: Republican Chairwoman...,"A little over a month ago, Linda Sorenson, the...",0
7372,You’re Not Going To Believe What Trump Just S...,Republican front runner Donald Trump has stopp...,0
20077,WATCH: HARRY REID Caught Calling Benghazi Moth...,Harry Reid s disrespectful comments are just a...,0
26160,Trump to Republican senators: Don't leave town...,WASHINGTON (Reuters) - U.S. President Donald T...,1
19268,HERE’S WHAT Feminists Left Behind After Their ...,,0


In [14]:
news_dataset['Combined_news'] = news_dataset['title'] + news_dataset['text']
news_dataset.drop(['title','text'], inplace = True, axis = 1)

In [15]:


import re
import spacy
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm")

def preprocessing(data):
    text = []
    stop_words = set(stopwords.words('english'))
    for i in data:
        i = i.lower()
        # get rid of urls
        i = re.sub('https?://\S+|www\.\S+', '', i)
        # get rid of non words and extra spaces
        i = re.sub('\\W', ' ', i)
        i = re.sub('\n', '', i)
        i = re.sub(' +', ' ', i)
        i = re.sub('^ ', '', i)
        i = re.sub(' $', '', i)
        
        # Lemmatization using spaCy
        doc = nlp(i)
        tokens = [token.lemma_ for token in doc if token.text.lower() not in stop_words]
        
        i = ' '.join(tokens)
        
        text.append(i)
        
    return text

news_dataset['preprocessed_news'] = preprocessing(news_dataset['Combined_news'])


In [16]:
news_dataset

Unnamed: 0,value,Combined_news,preprocessed_news
0,0,Donald Trump Sends Out Embarrassing New Year’...,donald trump send embarrass new year eve messa...
1,0,Drunk Bragging Trump Staffer Started Russian ...,drunk bragging trump staffer start russian col...
2,0,Sheriff David Clarke Becomes An Internet Joke...,sheriff david clarke become internet joke thre...
3,0,Trump Is So Obsessed He Even Has Obama’s Name...,trump obsessed even obama name code website im...
4,0,Pope Francis Just Called Out Donald Trump Dur...,pope francis call donald trump christmas speec...
...,...,...,...
44893,1,'Fully committed' NATO backs new U.S. approach...,fully commit nato back new u approach afghanis...
44894,1,LexisNexis withdrew two products from Chinese ...,lexisnexis withdraw two product chinese market...
44895,1,Minsk cultural hub becomes haven from authorit...,minsk cultural hub become authoritiesminsk reu...
44896,1,Vatican upbeat on possibility of Pope Francis ...,vatican upbeat possibility pope francis visit ...


In [17]:
feature = news_dataset['preprocessed_news']
target = news_dataset['value']



In [18]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(feature, target, test_size=0.20, random_state=32)

In [19]:
X_train = [str(text) for text in X_train]
X_test = [str(text) for text in X_test]

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=7000)

tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

2024-03-29 10:16:07.788942: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 10:16:07.789067: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 10:16:07.920941: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [20]:
max_len = max([len(i) for i in X_train ])
max_len

4487

In [21]:
import tensorflow as tf
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='post', maxlen=512)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding='post', maxlen=512)

In [22]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(7000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(512,1))
model.summary()


In [23]:
history = model.fit(X_train, Y_train, epochs=20,validation_split=0.1, batch_size=30, shuffle=True)

Epoch 1/20
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 67ms/step - accuracy: 0.9479 - loss: 0.1006 - val_accuracy: 0.9914 - val_loss: 0.0327
Epoch 2/20
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 67ms/step - accuracy: 0.9950 - loss: 0.0160 - val_accuracy: 0.9958 - val_loss: 0.0110
Epoch 3/20
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 67ms/step - accuracy: 0.9977 - loss: 0.0102 - val_accuracy: 0.9969 - val_loss: 0.0075
Epoch 4/20
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 67ms/step - accuracy: 0.9987 - loss: 0.0044 - val_accuracy: 0.9950 - val_loss: 0.0165
Epoch 5/20
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 67ms/step - accuracy: 0.9995 - loss: 0.0017 - val_accuracy: 0.9972 - val_loss: 0.0096
Epoch 6/20
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 69ms/step - accuracy: 0.9994 - loss: 0.0021 - val_accuracy: 0.9967 - val_loss: 0.0182
Epoc