Fake News Classifier Using LSTM and Bidirectional RNN

Dataset: https://www.kaggle.com/c/fake-news/data#

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Fetch the NLTK "stopwords" corpus that `stopwords.words('english')` reads
# later in this notebook. The call's return value (True in the recorded run)
# is what the cell displays as its output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive at /content/drive so the training CSV stored under
# MyDrive can be read by path (optional: only needed when the data lives on Drive).
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook anywhere else.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Location of the Kaggle training CSV on the mounted Drive (edit to match your setup)
data_path = '/content/drive/MyDrive/train.csv'

# Load the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Quick sanity checks: dimensions, per-column null counts, and a preview of the rows
print(df.shape)
print(df.isnull().sum())
print(df.head())

# Rows with missing values are not dropped here: the optional
# `df = df.dropna()` step is left disabled (consider imputation if needed).

# The target is the `label` column; the features are every other column
y = df['label']
X = df.drop(columns='label')

(20800, 5)
id           0
title      558
author    1957
text        39
label        0
dtype: int64
   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [None]:

# Report the dimensions of the feature frame and of the label vector
for _caption, _data in (("X shape:", X), ("y shape:", y)):
    print(_caption, _data.shape)


X shape: (20800, 4)
y shape: (20800,)


In [None]:
# Text-preprocessing resources: a Porter stemmer and NLTK's English stop-word list
ps = PorterStemmer()
stop_words = stopwords.words('english')


In [None]:
import re

# Compiled once: matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Set copy of the stop-word list so each per-token membership test is a hash
# lookup instead of a linear scan over the list.
_stop_word_set = frozenset(stop_words)


def preprocess_text(text):
    """Normalise one title before tokenisation.

    Steps, in order: strip punctuation, lowercase, split on whitespace, drop
    English stop words, Porter-stem the remaining tokens, and re-join them
    with single spaces.

    Args:
        text: The raw title. Any non-string value (e.g. the NaN pandas uses
            for a missing title) is treated as "no text".

    Returns:
        The cleaned, space-separated string; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""  # Missing / non-text values become an empty document

    # NOTE(review): punctuation is removed before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" and will not match any
    # apostrophe-bearing stop-word entry — confirm whether that is intended.
    tokens = _PUNCTUATION_RE.sub('', text).lower().split()
    return ' '.join(ps.stem(tok) for tok in tokens if tok not in _stop_word_set)


# Always derive the cleaned titles from the untouched `df` rather than from
# X['title'] itself, so re-running this cell does not stem already-stemmed text.
X['title'] = df['title'].apply(preprocess_text)

In [None]:
# Build the vocabulary from the cleaned titles, capped at 5000 words
# (adjust the cap as needed)
tokenizer = Tokenizer(num_words=5000)
_titles = X['title']
tokenizer.fit_on_texts(_titles)

# Encode every title as a list of integer word indices
sequences = tokenizer.texts_to_sequences(_titles)


In [None]:
# Bring every encoded title to the same fixed length (`sent_length`),
# padding at the front ('pre')
sent_length = 20
embedded_docs = pad_sequences(
    sequences,
    padding='pre',
    maxlen=sent_length,
)


In [None]:
# Define the LSTM model architecture

# Seed Python, NumPy and TensorFlow so that weight initialisation (and hence
# training) is repeatable under Restart & Run All, as far as the backend allows.
tf.keras.utils.set_random_seed(42)

embedding_vector_features = 40  # length of each learned word vector

# Size the embedding table from the tokenizer instead of repeating the literal
# 5000, so changing the tokenizer's vocabulary cap cannot leave the two out of
# sync. Without a cap, word indices run 1..len(word_index), hence the +1.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=sent_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary label
])

# Compile the model: binary cross-entropy loss, Adam with default settings,
# accuracy reported during training
model.compile(loss=BinaryCrossentropy(), optimizer=Adam(), metrics=['accuracy'])


In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the LSTM model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256501 (1001.96 KB)
Trainable params: 256501 (1001.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:

# Hold out 33% of the padded titles for evaluation; the fixed random_state
# keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    embedded_docs,
    y,
    random_state=42,
    test_size=0.33,
)


In [None]:

# Fit the LSTM for 10 epochs in mini-batches of 64, scoring the held-out
# split after every epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78ece025a770>

In [None]:
# Predict on the held-out titles and turn the sigmoid outputs into 0/1 labels
# using a 0.5 cut-off
y_pred = (model.predict(X_test) > 0.5).astype(int)




In [None]:
# Fraction of held-out titles the LSTM classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.9265734265734266


In [None]:
# Detailed breakdown: the confusion matrix first, then the per-class
# precision / recall / F1 report
for _report in (confusion_matrix, classification_report):
    print(_report(y_test, y_pred))

[[3203  246]
 [ 258 3157]]
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      3449
           1       0.93      0.92      0.93      3415

    accuracy                           0.93      6864
   macro avg       0.93      0.93      0.93      6864
weighted avg       0.93      0.93      0.93      6864



# Define the Bidirectional LSTM model architecture


In [None]:

# Define the Bidirectional LSTM model architecture

# Seed Python, NumPy and TensorFlow so that weight initialisation (and hence
# training) is repeatable under Restart & Run All, as far as the backend allows.
tf.keras.utils.set_random_seed(42)

embedding_vector_features = 40  # length of each learned word vector

# Size the embedding table from the tokenizer instead of repeating the literal
# 5000, so changing the tokenizer's vocabulary cap cannot leave the two out of
# sync. Without a cap, word indices run 1..len(word_index), hence the +1.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

model1 = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=sent_length),
    Bidirectional(LSTM(100)),  # Bidirectional LSTM layer
    Dropout(0.3),  # Dropout for regularization
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary label
])

# Compile the model: binary cross-entropy loss, Adam with default settings,
# accuracy reported during training
model1.compile(loss=BinaryCrossentropy(), optimizer=Adam(), metrics=['accuracy'])

# Print model summary
model1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 40)            200000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 200)               112800    
 onal)                                                           
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 201       
                                                                 
Total params: 313001 (1.19 MB)
Trainable params: 313001 (1.19 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:

# Recreate the 67/33 train/test split; the test_size and random_state are the
# same as in the split made for the plain LSTM
X_train, X_test, y_train, y_test = train_test_split(
    embedded_docs,
    y,
    random_state=42,
    test_size=0.33,
)


In [None]:
# Fit the Bidirectional LSTM for 10 epochs in mini-batches of 64, scoring the
# held-out split after every epoch
model1.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78ecd2527c70>

In [None]:
# Predict on the held-out titles with the Bidirectional LSTM and turn the
# sigmoid outputs into 0/1 labels using a 0.5 cut-off
y_pred1 = (model1.predict(X_test) > 0.5).astype(int)




In [None]:
# Fraction of held-out titles the Bidirectional LSTM classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred1)
print('Accuracy (Bidirectional LSTM):', accuracy)

# Detailed breakdown: the confusion matrix first, then the per-class
# precision / recall / F1 report
for _report in (confusion_matrix, classification_report):
    print(_report(y_test, y_pred1))

Accuracy (Bidirectional LSTM): 0.9243881118881119
[[3124  325]
 [ 194 3221]]
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      3449
           1       0.91      0.94      0.93      3415

    accuracy                           0.92      6864
   macro avg       0.92      0.92      0.92      6864
weighted avg       0.93      0.92      0.92      6864

