<a href="https://colab.research.google.com/github/Shreenidhi-Kovai-Sivabalan/Fake-News-Detection/blob/main/MainCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing Coursework

In [None]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install datasets
from datasets import load_dataset
import pandas as pd

In [None]:
# Loading the dataset (it ships with predefined train / validation / test splits)
data = load_dataset('ErfanMoosaviMonazzah/fake-news-detection-dataset-English')

# Converting each predefined split to a pandas DataFrame.
# Dataset.to_pandas() is the library's own conversion, so the frame is not
# assembled by iterating the split one row at a time as pd.DataFrame(split) does.
data_train = data['train'].to_pandas()
data_val = data['validation'].to_pandas()
data_test = data['test'].to_pandas()

In [None]:
# Size of each set
print(f'Train data shape: {data_train.shape}')
print(f'Validation data shape: {data_val.shape}')
print(f'Test data shape: {data_test.shape}')

In [None]:
print(f"Train size: {len(data_train)}")
print(f"Validation size: {len(data_val)}")
print(f"Test size: {len(data_test)}")

In [None]:
data_train.head()

Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Checking missing values
print(f"Missing values in train data:\n{data_train.isnull().sum()}")

In [None]:
# Class Distribution in Train Data
# Bar chart of how many training articles carry each value of the 'label' column.
# NOTE(review): the tick labels assume label 0 = Real and label 1 = Fake; nothing
# in this notebook verifies that encoding - confirm against the dataset card.
plt.figure(figsize=(8, 6))
sns.countplot(x='label', data=data_train)
plt.title('Class Distribution in Train Data')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Real', 'Fake'], rotation=45)
plt.show()

In [None]:
class_counts = data_train['label'].value_counts()
class_counts

# The dataset is not imbalanced, so class-balancing techniques such as
# SMOTE, BorderlineSMOTE or ADASYN do not have to be applied.

In [None]:
# Analyse text length - number of whitespace-separated words in the raw article text
data_train['text_length'] = data_train['text'].apply(lambda x: len(x.split()))

# Histogram (30 bins) with a KDE overlay of the per-article word counts
plt.figure(figsize=(8,6))
sns.histplot(data_train['text_length'], bins=30, kde=True)
plt.title('Distribution of Text Length in Train Data')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Sample text for real and fake news articles
print('Sample real news article:\n')
print(data_train[data_train['label'] == 0]['text'].iloc[0])
print('\nSample fake news article:\n')
print(data_train[data_train['label'] == 1]['text'].iloc[0])

Preprocessing

In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
def preprocessing_text(text):
  """Normalise one article into a space-joined string of content tokens.

  Steps: lower-case the text, replace every character that is not a-z or
  whitespace with a space, tokenise with NLTK's `word_tokenize`, and drop the
  tokens contained in the module-level `stop_words` set.

  Args:
    text: Raw article text. Any non-string value (for example None or NaN
      from a missing field) is treated as an empty article.

  Returns:
    The cleaned text as a single string; '' when no token survives.
  """
  if not isinstance(text, str):
    return ''  # a missing value would otherwise crash on .lower()

  text = text.lower() # converting the text into lower case
  # Removing unwanted characters - punctuation, numbers, special characters.
  # They are replaced by a space rather than deleted: deleting them glued the
  # pieces together ("don't" -> "dont", "anti-trump" -> "antitrump"), which
  # created artificial tokens and let contractions slip past the stop-word filter.
  text = re.sub(r'[^a-z\s]', ' ', text)
  tokens = word_tokenize(text) # tokenisation
  # removing stop words and re-joining into one string
  return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
# Cleaning the raw 'text' column of every split with the same function;
# the result is stored next to it as 'cleaned_text'.
data_train['cleaned_text'] = data_train['text'].map(preprocessing_text)
data_val['cleaned_text'] = data_val['text'].map(preprocessing_text)
data_test['cleaned_text'] = data_test['text'].map(preprocessing_text)

In [None]:
# Sample of Preprocessed Real News Article
print('Sample preprocessed real news article:\n')
data_train[data_train['label'] == 0].iloc[0]['cleaned_text']

In [None]:
# Sample of Preprocessed Fake News Article
print('\nSample preprocessed fake news article:\n')
data_train[data_train['label'] == 1].iloc[0]['cleaned_text']

In [None]:
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.util import ngrams

In [None]:
# Visualising the word cloud
all_words = ' '.join(data_train['cleaned_text'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud - All Training Articles")
plt.show()

In [None]:
# Frequently occurring words
# `all_words` is every cleaned training article joined into one string, so
# `tokens` is a single flat token list; the bigram and trigram cells reuse it.
# NOTE(review): because the articles are concatenated, n-grams built from
# `tokens` can span the boundary between two articles.
tokens = nltk.word_tokenize(all_words)
unigram_freq = Counter(tokens)
common_unigrams = unigram_freq.most_common(20)  # 20 most frequent (word, count) pairs
unigrams_df = pd.DataFrame(common_unigrams, columns=['Unigram', 'Frequency'])

# Horizontal bar chart: one bar per word, length = frequency
plt.figure(figsize=(10,5))
sns.barplot(x='Frequency', y='Unigram', data=unigrams_df)
plt.title("Top 20 Unigrams")
plt.show()

In [None]:
# Top 20 bigrams over the flat training token list
bigram_freq = Counter(ngrams(tokens, 2))
common_bigrams = bigram_freq.most_common(20)
# Each bigram tuple is turned into a readable "word1 word2" label while the frame is built
bigrams_df = pd.DataFrame(
    [(' '.join(gram), count) for gram, count in common_bigrams],
    columns=['Bigram', 'Frequency'],
)

plt.figure(figsize=(10,5))
sns.barplot(data=bigrams_df, x='Frequency', y='Bigram')
plt.title("Top 20 Bigrams")
plt.show()

In [None]:
# Top 20 trigrams over the flat training token list
trigram_freq = Counter(ngrams(tokens, 3))
common_trigrams = trigram_freq.most_common(20)
# Each trigram tuple is turned into a readable "w1 w2 w3" label while the frame is built
trigrams_df = pd.DataFrame(
    [(' '.join(gram), count) for gram, count in common_trigrams],
    columns=['Trigram', 'Frequency'],
)

plt.figure(figsize=(10,5))
sns.barplot(data=trigrams_df, x='Frequency', y='Trigram')
plt.title("Top 20 Trigrams")
plt.show()

In [None]:
# Word Cloud for Real News
real_news_text = ' '.join(data_train[data_train['label'] == 0]['cleaned_text'])

real_news_wordcloud = WordCloud(width = 800, height = 400, background_color = 'white').generate(real_news_text)
plt.figure(figsize = (12, 6))
plt.imshow(real_news_wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Word Cloud for Real News Articles')
plt.show()

In [None]:
# Word Cloud for Fake News
fake_news_text = ' '.join(data_train[data_train['label'] == 1]['cleaned_text'])

fake_news_wordcloud = WordCloud(width = 800, height = 400, background_color = 'white').generate(fake_news_text)
plt.figure(figsize = (12, 6))
plt.imshow(fake_news_wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Word Cloud for Fake News Articles')
plt.show()

In [None]:
# Text length distributions by class (word counts of the cleaned text)
# Stored in its own column so the raw-text 'text_length' computed earlier is
# not silently replaced when this cell runs.
data_train['cleaned_text_length'] = data_train['cleaned_text'].str.split().str.len()

# Map the numeric labels to names BEFORE plotting and let seaborn build the
# legend. Overriding the legend afterwards with plt.legend(labels=[...]) pairs
# the names with the drawn artists by position, which does not follow the hue
# order and can attach the wrong class name to a distribution.
# NOTE(review): 0 = Real / 1 = Fake follows the convention used throughout this
# notebook - confirm against the dataset card.
class_names = data_train['label'].map({0: 'Real', 1: 'Fake'}).rename('Label')

plt.figure(figsize = (10, 5))
sns.histplot(data = data_train, x = 'cleaned_text_length', hue = class_names,
             hue_order = ['Real', 'Fake'], kde = True)
plt.title('Distribution of Text Length by Class')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.show()

TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorisation
tfidf = TfidfVectorizer(max_features=5000)

In [None]:
X_train_tfidf = tfidf.fit_transform(data_train['cleaned_text'])
X_val_tfidf = tfidf.transform(data_val['cleaned_text'])
X_test_tfidf = tfidf.transform(data_test['cleaned_text'])

In [None]:
y_train = data_train['label']
y_val = data_val['label']
y_test = data_test['label']

Logistic Regression (for baseline) on TF-IDF

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Training Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)

In [None]:
# Predicting on Validation set
y_pred_val_tfidf = log_reg.predict(X_val_tfidf)

In [None]:
# Evaluation
print('Validation Data Evaluation (Logistic Regression):\n')
print(f'Accuracy: {accuracy_score(y_val, y_pred_val_tfidf)}')
print('\nClassification Report:\n')
print(classification_report(y_val, y_pred_val_tfidf))

In [None]:
# Confusion Matrix
cmat = confusion_matrix(y_val, y_pred_val_tfidf)
cmat

In [None]:
plt.figure(figsize=(6,5))
sns.heatmap(cmat, annot=True, fmt='d', cmap='Blues', xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix on Validation Data (Log Reg)')
plt.show()

Adding Bigrams and Trigrams in TF-IDF

In [None]:
# TF-IDF with Bigrams and Trigrams
tfidf_ngram = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
# ngram_range=(1,3) means unigrams, bigrams and trigrams

X_train_ngram = tfidf_ngram.fit_transform(data_train['cleaned_text'])
X_val_ngram = tfidf_ngram.transform(data_val['cleaned_text'])
X_test_ngram = tfidf_ngram.transform(data_test['cleaned_text'])

Logistic Regression on TF-IDF with Bigram and Trigram

In [None]:
# Training Logistic Regression on TF-IDF N-grams data
# NOTE: this refits the same `log_reg` object that was trained on the unigram
# TF-IDF features, so from here on `log_reg` expects the n-gram feature matrices
# (X_*_ngram) and the unigram baseline fit is no longer available.
log_reg.fit(X_train_ngram, y_train)

In [None]:
# Predicting on Validation Set
y_pred_val_ngram = log_reg.predict(X_val_ngram)

In [None]:
# Evaluate the model
print('Validation Data Evaluation (Log Reg with Bigrams/Trigrams):\n')
print(f'Accuracy: {accuracy_score(y_val, y_pred_val_ngram)}')
print('\nClassification Report:\n')
print(classification_report(y_val, y_pred_val_ngram))

In [None]:
# Confusion Matrix
cmat_ngram = confusion_matrix(y_val, y_pred_val_ngram)
cmat_ngram

In [None]:
plt.figure(figsize=(6,5))
sns.heatmap(cmat_ngram, annot=True, fmt='d', cmap='Blues', xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix on Validation Set (n-grams)')
plt.show()

In [None]:
# Running on test data

y_test_pred_ngram = log_reg.predict(X_test_ngram)

print('Test Set Evaluation (Log Reg w/ n-grams):\n')
print(f'Accuracy: {accuracy_score(y_test, y_test_pred_ngram)}')
print("\nTest Set Performance (with Bigrams/Trigrams):")
print(classification_report(y_test, y_test_pred_ngram))
print()

In [None]:
#Confusion Matrix for test data
cmat_test = confusion_matrix(y_test, y_test_pred_ngram)
cmat_test

In [None]:
plt.figure(figsize=(6,5))
sns.heatmap(cmat_test, annot=True, fmt='d', cmap='Purples', xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix on Test Set (n-grams)')
plt.show()

SVM

In [None]:
from sklearn.svm import LinearSVC

# Training SVM model
svm_model = LinearSVC()
svm_model.fit(X_train_ngram, y_train)

In [None]:
# Predicting on val set
y_pred_val_svm = svm_model.predict(X_val_ngram)

In [None]:
print('Validation Data Evaluation (SVM):\n')
print(f'Accuracy: {accuracy_score(y_val, y_pred_val_svm)}')
print('\nClassification Report:\n')
print(classification_report(y_val, y_pred_val_svm))

In [None]:
# Confusion matrix for validation set
cmat_svm = confusion_matrix(y_val, y_pred_val_svm)
cmat_svm

In [None]:
plt.figure(figsize=(6,5))
sns.heatmap(cmat_svm, annot=True, fmt='d', cmap='Greens', xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - SVM (Validation Set)')
plt.show()

In [None]:
# Predicting on test set
y_pred_test_svm = svm_model.predict(X_test_ngram)

In [None]:
print('Test Data Evaluation (SVM):\n')
print(f'Accuracy: {accuracy_score(y_test, y_pred_test_svm)}')
print('\nClassification Report:\n')
print(classification_report(y_test, y_pred_test_svm))

In [None]:
# Confusion matrix for test set
cmat_svm_test = confusion_matrix(y_test, y_pred_test_svm)
cmat_svm_test

In [None]:
plt.figure(figsize=(6,5))
sns.heatmap(cmat_svm_test, annot=True, fmt='d', cmap='Greens', xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - SVM (Test Set)')
plt.show()

Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Training the Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_ngram, y_train)

In [None]:
# Predicting on Validation set
y_pred_val_nb = nb_model.predict(X_val_ngram)

In [None]:
print('Validation Data Evaluation (Naive Bayes):\n')
print(f'Accuracy: {accuracy_score(y_val, y_pred_val_nb)}')
print('\nClassification Report:\n')
print(classification_report(y_val, y_pred_val_nb))

In [None]:
# Confusion Matrix
cmat_nb = confusion_matrix(y_val, y_pred_val_nb)
cmat_nb

In [None]:
# Heatmap of the Naive Bayes confusion matrix on the validation set
# (rows = true class, columns = predicted class).
plt.figure(figsize=(6,5))
sns.heatmap(cmat_nb, annot=True, fmt='d', cmap='Oranges', xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
plt.xlabel('Predicted')
plt.ylabel('True')
# Title previously contained mis-encoded characters; spelled in plain ASCII to
# match the "Naive Bayes" wording printed by the evaluation cell.
plt.title('Confusion Matrix - Naive Bayes (Validation Set)')
plt.show()

Preparing Data for LSTM

In [None]:
#!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Maximum vocabulary size (number of unique words to consider)
VOCAB_SIZE = 10000
# How long each input will be pad/cut
MAX_SEQ_LEN = 300

In [None]:
# Initialising the tokeniser: limited to VOCAB_SIZE words, with "<OOV>" as the
# placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
# The vocabulary is built from the training split only.
tokenizer.fit_on_texts(data_train['cleaned_text'])

In [None]:
# Texts to sequences
X_train_seq = tokenizer.texts_to_sequences(data_train['cleaned_text'])
X_val_seq = tokenizer.texts_to_sequences(data_val['cleaned_text'])
X_test_seq = tokenizer.texts_to_sequences(data_test['cleaned_text'])

In [None]:
# Pad sequences to same length
# padding='post' appends zeros to articles shorter than MAX_SEQ_LEN;
# truncating='post' cuts longer articles at the end, keeping their first MAX_SEQ_LEN tokens.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

In [None]:
y_train = data_train['label']
y_val = data_val['label']
y_test = data_test['label']

In [None]:
# Sizes of the pad sequence datasets
print(f"Training set shape: {X_train_pad.shape}")
print(f"Validation set shape: {X_val_pad.shape}")
print(f"Test set shape: {X_test_pad.shape}")

Building and Training the LSTM Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# Defining the LSTM model:
# embedding learned from scratch (128-d) -> LSTM(128) -> dropout -> dense head.
# NOTE(review): the Embedding is created without mask_zero, while the inputs are
# zero-padded at the end (padding='post'), so the LSTM processes the trailing
# padding as ordinary timesteps before emitting its final state - consider
# mask_zero=True and re-check the scores.
# NOTE(review): `input_length` is reported as deprecated in Keras 3 - confirm
# against the installed version.
lstm_model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_SEQ_LEN),
    LSTM(128, return_sequences=False),  # only the final hidden state is passed on
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # single sigmoid output for the binary label
])

# Compile the model
lstm_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])


In [None]:
# Model summary
lstm_model.summary()

In [None]:
# Training the model
history = lstm_model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_val_pad, y_val)
)

In [None]:
# Evaluation of LSTM on Test Set
test_loss, test_accuracy = lstm_model.evaluate(X_test_pad, y_test, verbose=2)

print("LSTM Performance: ")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

Bidirectional LSTM

In [None]:
from tensorflow.keras.layers import Bidirectional

# Defining the model
bilstm_model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_SEQ_LEN),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

#Compile
bilstm_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [None]:
# Model summary
bilstm_model.summary()

In [None]:
# Training the model
history_bilstm = bilstm_model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_val_pad, y_val)
)

In [None]:
# Evaluating Bidirectional LSTM on test set
test_loss_bilstm, test_accuracy_bilstm = bilstm_model.evaluate(X_test_pad, y_test, verbose=2)

print("Bidirectional LSTM Performance: ")
print(f"Test Accuracy: {test_accuracy_bilstm:.4f}")
print(f"Test Loss: {test_loss_bilstm:.4f}")


GloVe with LSTM

In [None]:
# Downloading GloVe 100D
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# Reading the GloVe text file into a {word: vector} lookup.
# Every line holds a word followed by its whitespace-separated coefficients.
embedding_idx = {}
with open('glove.6B.100d.txt', encoding='utf8') as glove_file:
  for raw_line in glove_file:
    token, *coefficients = raw_line.split()
    embedding_idx[token] = np.asarray(coefficients, dtype='float32')

print(f'Loaded {len(embedding_idx)} word vectors from GloVe')

In [None]:
# Creating Embedding Matrix

# Width of the vectors loaded from the GloVe file
EMBEDDING_DIM = 100
word_index = tokenizer.word_index
# Rows needed: capped at VOCAB_SIZE, +1 because tokenizer indices start at 1
num_words = min(VOCAB_SIZE, len(word_index) + 1)

# Start from all zeros; rows without a GloVe match simply stay zero
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

# Copy the GloVe vector of every in-range vocabulary word into its row
for token, row in word_index.items():
  if row >= num_words:
    continue  # index beyond the rows kept in the matrix
  glove_vector = embedding_idx.get(token)
  if glove_vector is None:
    continue  # word has no pretrained vector
  embedding_matrix[row] = glove_vector

In [None]:
# Building LSTM with GloVe - Frozen

lstm_glove_frozen = Sequential([
    Embedding(input_dim = num_words,
              output_dim = EMBEDDING_DIM,
              weights = [embedding_matrix],
              input_length = MAX_SEQ_LEN,
              trainable = False),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# Compiling the model
lstm_glove_frozen.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Training the model
history_glove = lstm_glove_frozen.fit(
    X_train_pad, y_train,
    epochs = 5,
    batch_size = 128,
    validation_data = (X_val_pad, y_val)
)

In [None]:
# Evaluating on test set
test_loss_glove, test_acc_glove = lstm_glove_frozen.evaluate(X_test_pad, y_test, verbose = 2)

print("GloVe LSTM (Frozen) Performance: ")
print(f'Test Accuracy: {test_acc_glove:.4f}')
print(f'Test Loss: {test_loss_glove:.4f}')

In [None]:
# Building LSTM with GloVe - Trainable

lstm_glove_trainable = Sequential([
    Embedding(input_dim = num_words,
              output_dim = EMBEDDING_DIM,
              weights = [embedding_matrix],
              input_length = MAX_SEQ_LEN,
              trainable = True),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# Compiling the model
lstm_glove_trainable.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Training the model
history_glove1 = lstm_glove_trainable.fit(
    X_train_pad, y_train,
    epochs = 5,
    batch_size = 128,
    validation_data = (X_val_pad, y_val)
)

In [None]:
# Evaluating on test set
# The scores get their own *_trainable names so the frozen-embedding results in
# test_loss_glove / test_acc_glove are not overwritten and the two runs stay comparable.
test_loss_glove_trainable, test_acc_glove_trainable = lstm_glove_trainable.evaluate(X_test_pad, y_test, verbose = 2)

print("GloVe LSTM (Trainable) Performance: ")
print(f'Test Accuracy: {test_acc_glove_trainable:.4f}')
print(f'Test Loss: {test_loss_glove_trainable:.4f}')

GloVe + BiLSTM

In [None]:
# Defining Bidirectional LSTM with GloVe - Frozen
bilstm_glove_frozen = Sequential([
    Embedding(
        input_dim = num_words,
        output_dim = EMBEDDING_DIM,
        weights = [embedding_matrix],
        input_length = MAX_SEQ_LEN,
        trainable = False),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')

])

In [None]:
# Compile
bilstm_glove_frozen.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Training the model
history_bilstm_glove = bilstm_glove_frozen.fit(
    X_train_pad, y_train,
    epochs = 5,
    batch_size = 128,
    validation_data = (X_val_pad, y_val)
)

In [None]:
# Evaluate the model on test set
test_loss_bilstm_glove, test_acc_bilstm_glove = bilstm_glove_frozen.evaluate(X_test_pad, y_test, verbose = 2)

print("BiLSTM + GloVe (Frozen) Performance: ")
print(f'Test Accuracy: {test_acc_bilstm_glove:.4f}')
print(f'Test Loss: {test_loss_bilstm_glove:.4f}')


In [None]:
# Defining Bidirectional LSTM with GloVe - Trainable
bilstm_glove_trainable = Sequential([
    Embedding(
        input_dim = num_words,
        output_dim = EMBEDDING_DIM,
        weights = [embedding_matrix],
        input_length = MAX_SEQ_LEN,
        trainable = True),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')

])

In [None]:
# Compile
bilstm_glove_trainable.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Training the model
# The history gets its own name so the frozen run's `history_bilstm_glove`
# is not overwritten by this fit.
history_bilstm_glove_trainable = bilstm_glove_trainable.fit(
    X_train_pad, y_train,
    epochs = 5,
    batch_size = 128,
    validation_data = (X_val_pad, y_val)
)

In [None]:
# Evaluate the model on test set
# The scores get their own *_trainable names so the frozen BiLSTM results in
# test_loss_bilstm_glove / test_acc_bilstm_glove are not overwritten.
test_loss_bilstm_glove_trainable, test_acc_bilstm_glove_trainable = bilstm_glove_trainable.evaluate(X_test_pad, y_test, verbose = 2)

print("BiLSTM + GloVe (Trainable) Performance: ")
print(f'Test Accuracy: {test_acc_bilstm_glove_trainable:.4f}')
print(f'Test Loss: {test_loss_bilstm_glove_trainable:.4f}')


**Error Analysis**

In [None]:
# Function for plotting a confusion matrix
def plot_conf_matrix(y_true, y_pred, title, cmap='Blues', class_names=('Real', 'Fake')):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Args:
        y_true: True integer class labels (0 .. len(class_names) - 1).
        y_pred: Predicted integer class labels, same length as `y_true`.
        title: Title shown above the heatmap.
        cmap: Matplotlib colormap name for the heatmap (default 'Blues').
        class_names: Display name for each class, ordered by label value.

    Returns:
        None. The figure is shown with plt.show().
    """
    class_names = list(class_names)
    # Fix the label set explicitly: without it the matrix only contains the
    # classes that actually occur, so a model that never predicts one class
    # would yield a smaller matrix that no longer lines up with the tick labels.
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()


In [None]:
# Predict with SVM
y_pred_svm = svm_model.predict(X_test_ngram)

# Plot confusion matrix
plot_conf_matrix(y_test, y_pred_svm, title="Confusion Matrix - SVM (TF-IDF with n-gram)")


In [None]:
# Predict with BiLSTM
# The model ends in a single sigmoid unit, so the raw predictions are compared
# with 0.5 to obtain 0/1 labels and flattened into a 1-D array.
y_pred_bilstm_glove = (bilstm_glove_trainable.predict(X_test_pad) > 0.5).astype("int32").flatten()

# Plot confusion matrix
plot_conf_matrix(y_test, y_pred_bilstm_glove, title="Confusion Matrix - GloVe + BiLSTM (Trainable)")


In [None]:
def print_misclassified_examples(y_true, y_pred, texts, max_examples=25, model_name="Model"):
    """Print the first misclassified examples together with their labels.

    All three inputs are read by position, so they may be pandas Series (with
    any index), NumPy arrays or plain lists, as long as they are aligned
    row-for-row.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, same length as `y_true`.
        texts: Article texts, same length as `y_true`.
        max_examples: Maximum number of misclassified rows to print.
        model_name: Name shown in the header line.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """
    # Convert to plain arrays so every lookup below is positional. Indexing a
    # Series with y_true[i] is label-based and raised (or picked another row)
    # whenever the index was not the default 0..n-1 range.
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    text_arr = np.asarray(texts, dtype=object).ravel()
    if not (len(true_arr) == len(pred_arr) == len(text_arr)):
        raise ValueError("y_true, y_pred and texts must have the same length")

    misclassified_indices = np.flatnonzero(true_arr != pred_arr)
    print(f"=== Misclassified Examples ({model_name}) ===")
    for i in misclassified_indices[:max_examples]:
        print(f"\n--- Example #{i} ---")
        print(f"Actual Label: {true_arr[i]} | Predicted Label: {pred_arr[i]}")
        print(f"Text:\n{text_arr[i][:200]}...")  # Shortened output for readability


In [None]:
print_misclassified_examples(y_test, y_pred_svm, data_test['text'], model_name="SVM (TF-IDF with N-Grams)")

In [None]:
print_misclassified_examples(y_test, y_pred_bilstm_glove, data_test['text'], model_name="GloVe + BiLSTM (Trainable)")

Saving the models

In [None]:
import os

import joblib

# Output folder on the mounted Drive for the artefacts saved by this cell.
MODEL_DIR = '/content/drive/MyDrive/NLP_Coursework'
# joblib.dump does not create missing folders, so make sure the target exists
# instead of failing on a fresh Drive.
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the fitted n-gram TF-IDF vectoriser together with the SVM trained on
# its features; both are needed to classify new text.
joblib.dump(tfidf_ngram, os.path.join(MODEL_DIR, 'tfidf_ngram.pkl'))
joblib.dump(svm_model, os.path.join(MODEL_DIR, 'svm_model.pkl'))

In [None]:
# Persist the GloVe + BiLSTM (trainable embeddings) model to the mounted Drive.
# NOTE(review): the .h5 suffix selects Keras' legacy HDF5 format - confirm
# whether the native .keras format is preferred for the installed Keras version.
bilstm_glove_trainable.save('/content/drive/MyDrive/NLP_Coursework/bilstm_glove_trainable.h5')

In [None]:
import pickle

with open('/content/drive/MyDrive/NLP_Coursework/tokenizer.pkl', 'wb') as f:
  pickle.dump(tokenizer, f)

In [None]:
with open('/content/drive/MyDrive/NLP_Coursework/max_len.txt', 'w') as f:
  f.write(str(MAX_SEQ_LEN))