<a href="https://colab.research.google.com/github/Shreenidhi-Kovai-Sivabalan/Fake-News-Detection/blob/main/TwoBestModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Coursework - Two Best Models (SVM and BiLSTM)

In [None]:
# Mount Google Drive at /content/drive.
# `google.colab` only exists inside a Colab runtime, so this cell fails elsewhere.
# NOTE(review): no cell visible in this notebook reads from or writes to
# /content/drive - confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets
from datasets import load_dataset
import pandas as pd

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Fetch the fake-news corpus from the Hugging Face Hub
data = load_dataset('ErfanMoosaviMonazzah/fake-news-detection-dataset-English')

# The dataset object already exposes 'train', 'validation' and 'test' splits;
# materialise each of them as its own pandas DataFrame
data_train, data_val, data_test = (
    pd.DataFrame(data[split_name])
    for split_name in ('train', 'validation', 'test')
)

In [None]:
import re
import nltk

# Download the NLTK resources used by the preprocessing step:
# the Punkt tokenizer data ('punkt' and 'punkt_tab') and the stop-word corpus
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
  nltk.download(nltk_resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# English stop words from NLTK, stored as a set so that the per-token
# membership test in preprocessing_text is a hash lookup rather than a list scan
stop_words = set(stopwords.words('english'))

def preprocessing_text(text):
  """Normalise one raw document into a cleaned, space-separated string.

  Steps, in order: lower-case the text, delete every character that is not
  a-z or whitespace, tokenise with NLTK's ``word_tokenize``, drop tokens that
  appear in the module-level ``stop_words`` set, and join what is left with
  single spaces.

  Args:
    text: Raw document text. Any non-string value (for example ``None`` or the
      float NaN that pandas uses for a missing cell) is treated as an empty
      document.

  Returns:
    The cleaned text; ``''`` when the input is missing or nothing survives
    the filtering.
  """
  if not isinstance(text, str):
    # Series.apply hands missing cells over as None/NaN, which have no
    # .lower(); return an empty document instead of raising AttributeError.
    return ''

  text = text.lower() # converting the text into lower case
  # Removing unwanted characters - punctuation, numbers, special characters.
  # They are deleted (not replaced by a space), so "well-known" -> "wellknown".
  text = re.sub(r'[^a-z\s]', '', text)
  tokens = word_tokenize(text) # tokenisation
  filtered_tokens = [token for token in tokens if token not in stop_words] # removing stop words
  return ' '.join(filtered_tokens)

In [None]:
# Add a 'cleaned_text' column to every split by running the raw 'text'
# column through preprocessing_text (train first, then validation, then test)
for split_frame in (data_train, data_val, data_test):
  split_frame['cleaned_text'] = split_frame['text'].apply(preprocessing_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams, bigrams and trigrams (ngram_range=(1,3)),
# capped at 10,000 features
tfidf_ngram = TfidfVectorizer(ngram_range=(1,3), max_features=10000)

# The vectoriser is fitted on the training split only; the validation and
# test splits are transformed with that same fitted vocabulary
X_train_ngram = tfidf_ngram.fit_transform(data_train['cleaned_text'])
X_val_ngram, X_test_ngram = (
    tfidf_ngram.transform(split_frame['cleaned_text'])
    for split_frame in (data_val, data_test)
)

# Target labels of each split
y_train, y_val, y_test = (
    split_frame['label']
    for split_frame in (data_train, data_val, data_test)
)

**SVM**

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Linear SVM with scikit-learn's default hyper-parameters,
# fitted on the TF-IDF n-gram features of the training split
svm_model = LinearSVC()
svm_model.fit(X=X_train_ngram, y=y_train)

In [None]:
# Score the fitted SVM on the validation split
y_pred_val_svm = svm_model.predict(X_val_ngram)
val_accuracy_svm = accuracy_score(y_val, y_pred_val_svm)
val_report_svm = classification_report(y_val, y_pred_val_svm)

# Report overall accuracy followed by the per-class precision/recall/F1 table
print('Validation Data Evaluation (SVM):\n')
print(f'Accuracy: {val_accuracy_svm}')
print('\nClassification Report:\n')
print(val_report_svm)

Validation Data Evaluation (SVM):

Accuracy: 0.9911666666666666

Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3089
           1       0.99      0.99      0.99      2911

    accuracy                           0.99      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       0.99      0.99      0.99      6000



In [None]:
# Score the fitted SVM on the held-out test split
y_pred_test_svm = svm_model.predict(X_test_ngram)
test_accuracy_svm = accuracy_score(y_test, y_pred_test_svm)
test_report_svm = classification_report(y_test, y_pred_test_svm)

# Report overall accuracy followed by the per-class precision/recall/F1 table
print('Test Data Evaluation (SVM):\n')
print(f'Accuracy: {test_accuracy_svm}')
print('\nClassification Report:\n')
print(test_report_svm)

Test Data Evaluation (SVM):

Accuracy: 0.992500302407161

Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4284
           1       0.99      0.99      0.99      3983

    accuracy                           0.99      8267
   macro avg       0.99      0.99      0.99      8267
weighted avg       0.99      0.99      0.99      8267



**BiLSTM (GloVe - Trainable)**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer as num_words
VOCAB_SIZE = 10000
# Length every input sequence is padded or truncated to
MAX_SEQ_LEN = 300

# Initialise the tokenizer and fit it on the training text only;
# "<OOV>" is the placeholder token for out-of-vocabulary words
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(data_train['cleaned_text'])

In [None]:
# Encode the cleaned text of every split as a list of integer token ids
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_frame['cleaned_text'])
    for split_frame in (data_train, data_val, data_test)
)

# Bring every sequence to exactly MAX_SEQ_LEN ids: shorter ones are padded
# at the end, longer ones are cut at the end
pad_options = dict(maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_val_pad = pad_sequences(X_val_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

In [None]:
# Downloading GloVe 100D
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2025-05-03 09:57:54--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-03 09:57:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-03 09:57:54--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Read the GloVe text file into a {word: float32 vector} dictionary.
# Each line holds a word followed by its vector components, whitespace-separated.
embedding_idx = {}
with open('glove.6B.100d.txt', encoding='utf8') as glove_file:
  for raw_line in glove_file:
    fields = raw_line.split()
    embedding_idx[fields[0]] = np.asarray(fields[1:], dtype='float32')

print(f'Loaded {len(embedding_idx)} word vectors from GloVe')

Loaded 400000 word vectors from GloVe


In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i

# Width of each embedding row
EMBEDDING_DIM = 100
word_index = tokenizer.word_index
# Number of rows: the vocabulary cap, or the fitted vocabulary size + 1 if that is smaller
num_words = min(VOCAB_SIZE, len(word_index) + 1)

# Start from all zeros; rows that never get a GloVe vector stay zero
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for token, row in word_index.items():
  # Indices at or beyond num_words have no row in the matrix
  if row >= num_words:
    continue
  glove_vector = embedding_idx.get(token)
  # Words missing from GloVe keep their zero row
  if glove_vector is None:
    continue
  embedding_matrix[row] = glove_vector

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and backend random generators before any layer is
# created, so a fresh Restart & Run All starts training from the same state.
# (GPU kernels may still introduce small run-to-run differences.)
set_random_seed(42)

# Bidirectional LSTM binary classifier on top of GloVe-initialised embeddings
# that are fine-tuned during training (trainable = True).
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# the argument, ignores it and emits a warning.
bilstm_glove_trainable = Sequential([
    Embedding(
        input_dim = num_words,
        output_dim = EMBEDDING_DIM,
        weights = [embedding_matrix],  # initial weights: the GloVe matrix
        trainable = True),
    Bidirectional(LSTM(128, return_sequences=False)),  # only the final state of each direction
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # single sigmoid unit for the binary label

])

# Compile with binary cross-entropy to match the sigmoid output
bilstm_glove_trainable.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])



In [None]:
# Fit the BiLSTM for 5 epochs in mini-batches of 128, scoring the
# validation split at the end of every epoch
bilstm_fit_options = dict(
    epochs = 5,
    batch_size = 128,
    validation_data = (X_val_pad, y_val),
)
history_bilstm_glove = bilstm_glove_trainable.fit(X_train_pad, y_train, **bilstm_fit_options)

Epoch 1/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 46ms/step - accuracy: 0.8919 - loss: 0.2601 - val_accuracy: 0.9693 - val_loss: 0.0878
Epoch 2/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 44ms/step - accuracy: 0.9765 - loss: 0.0721 - val_accuracy: 0.9805 - val_loss: 0.0685
Epoch 3/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 47ms/step - accuracy: 0.9909 - loss: 0.0308 - val_accuracy: 0.9810 - val_loss: 0.0578
Epoch 4/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 47ms/step - accuracy: 0.9944 - loss: 0.0173 - val_accuracy: 0.9622 - val_loss: 0.1403
Epoch 5/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 47ms/step - accuracy: 0.9938 - loss: 0.0187 - val_accuracy: 0.9815 - val_loss: 0.0650


In [None]:
# Score the trained BiLSTM on the test split; the result is unpacked as
# (loss, accuracy), matching the loss and the single metric given to compile()
bilstm_test_metrics = bilstm_glove_trainable.evaluate(X_test_pad, y_test, verbose = 2)
test_loss_bilstm_glove, test_acc_bilstm_glove = bilstm_test_metrics

# Summarise both numbers to four decimal places
print("BiLSTM + GloVe (Trainable) Performance: ")
print(f'Test Accuracy: {test_acc_bilstm_glove:.4f}')
print(f'Test Loss: {test_loss_bilstm_glove:.4f}')

259/259 - 3s - 10ms/step - accuracy: 0.9823 - loss: 0.0608
BiLSTM + GloVe (Trainable) Performance: 
Test Accuracy: 0.9823
Test Loss: 0.0608
