In [21]:
# Import Libraries
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import random

In [22]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [30]:
# Load the preprocessed data from CSV files
train_data = pd.read_csv("../train.csv")
test_data = pd.read_csv("../WELFake_Dataset.csv")

# A missing text cell is read by pandas as float NaN, which has no `.split()`.
# Both splits therefore replace missing texts with empty strings so the
# vocabulary-building and tokenization cells can treat every row as a string.
X_train = train_data['text'].fillna("")  # stays a Series
y_train = train_data['label']

X_test = test_data['text'].fillna("").tolist()  # plain list of strings
y_test = test_data['label']

In [24]:
# Read the pre-trained Word2Vec vectors from the gzipped binary word2vec file
word2vec = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)


In [25]:
# Create a vocabulary
embedding_dim = 300  # must equal the length of the Word2Vec vectors appended to embedding_matrix
vocab = {"<PAD>": 0, "<UNK>": 1}  # Special tokens

# A dedicated, seeded generator makes the <UNK> vector identical on every
# kernel restart without touching NumPy's global random state.
SEED = 42
_unk_rng = np.random.default_rng(SEED)

# Rows are kept in a list (not an array) because later cells append one vector per word.
embedding_matrix = [
    np.zeros(embedding_dim),                       # <PAD>: all zeros
    _unk_rng.uniform(-0.01, 0.01, embedding_dim),  # <UNK>: small random values
]

In [26]:
# Build vocabulary from Word2Vec
for text in X_train:
    # Rows with missing text arrive as float NaN (no `.split()`); they contribute no words.
    if not isinstance(text, str):
        continue
    for word in text.split():
        # Only words that have a pre-trained vector get their own id;
        # everything else is mapped to <UNK> at tokenization time.
        if word not in vocab and word in word2vec:
            vocab[word] = len(vocab)
            embedding_matrix.append(word2vec[word])

# The model casts the matrix to float32 anyway; building it as float32 here
# avoids NumPy promoting the whole array to float64 (twice the memory).
embedding_matrix = np.array(embedding_matrix, dtype=np.float32)
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 76131


In [31]:
# Tokenize and convert text to sequences
def text_to_sequence(text, vocab, max_len=1000):
    """Turn whitespace-separated text into a fixed-length list of token ids.

    Args:
        text: Input string; it is split on whitespace.
        vocab: Mapping from word to integer id. Must contain "<UNK>" (used for
            words missing from the mapping) and "<PAD>" (used to fill short
            sequences).
        max_len: Length of the returned list.

    Returns:
        A list of ids: right-padded with the "<PAD>" id when the text has
        fewer than ``max_len`` words, cut off after ``max_len`` words otherwise.
    """
    token_ids = []
    for token in text.split():
        token_ids.append(vocab.get(token, vocab["<UNK>"]))

    shortfall = max_len - len(token_ids)
    if shortfall > 0:
        token_ids += [vocab["<PAD>"]] * shortfall

    return token_ids[:max_len]

# Convert every test document into a fixed-length list of token ids
max_len = 1000
X_test_seq = []
for text in X_test:
    X_test_seq.append(text_to_sequence(text, vocab, max_len))

In [32]:
def augment_text(text):
    """Return a copy of ``text`` with words randomly removed.

    The text is split on whitespace and one random draw is made per word; a
    word is kept only when its draw is greater than 0.2. The kept words are
    joined with single spaces. If no word is kept, the input is returned
    unchanged.
    """
    kept = []
    for word in text.split():
        if random.random() > 0.2:
            kept.append(word)

    # Nothing survived (or the text had no words): hand back the original string
    return ' '.join(kept) if kept else text

# Custom Dataset Class
class TextDataset(Dataset):
    """Dataset of padded token-id sequences with one float label per sequence.

    In training mode every access returns a freshly augmented copy of the
    sequence in which each non-padding token is dropped at random; in
    evaluation mode the stored sequence is returned unchanged.

    Args:
        texts: Equal-length sequences of token ids (anything
            ``torch.tensor`` accepts as a 2-D integer input).
        labels: One label per sequence. A pandas Series, or any list/array
            that ``torch.tensor`` accepts.
        is_training: Enable random token dropping in ``__getitem__``.
        pad_idx: Id of the padding token. The default 0 matches the
            ``"<PAD>"`` entry of the notebook vocabulary.
        drop_prob: Threshold for the per-token random draw; a token is kept
            only when its draw is greater than this value.
    """

    def __init__(self, texts, labels, is_training=False, pad_idx=0, drop_prob=0.2):
        self.texts = torch.tensor(texts, dtype=torch.long)
        # A Series is unwrapped to its underlying array; other inputs go to torch as-is.
        label_values = labels.values if isinstance(labels, pd.Series) else labels
        self.labels = torch.tensor(label_values, dtype=torch.float32)
        self.is_training = is_training
        # Stored on the instance so __getitem__ does not depend on notebook globals.
        self.pad_idx = pad_idx
        self.drop_prob = drop_prob

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx``.

        ``token_ids`` always has the same length as the stored sequence.
        """
        if not self.is_training:
            return self.texts[idx], self.labels[idx]

        row = self.texts[idx]
        width = row.size(0)  # output is re-padded to the stored sequence length

        # Strip padding so only real tokens take part in the augmentation.
        tokens = [tok for tok in row.tolist() if tok != self.pad_idx]
        kept = [tok for tok in tokens if random.random() > self.drop_prob]
        # Like augment_text: if every token was dropped, keep the original
        # tokens rather than emit an all-padding sample with a real label.
        if tokens and not kept:
            kept = tokens

        # Dropping tokens can only shorten the sequence, so padding restores the width.
        kept.extend([self.pad_idx] * (width - len(kept)))
        return torch.tensor(kept, dtype=torch.long), self.labels[idx]

In [33]:
# Create Dataset and DataLoader
batch_size = 32
test_dataset = TextDataset(X_test_seq, y_test, is_training=False)

# Evaluation only: keep the stored sample order and take the batch size from
# the single `batch_size` setting instead of repeating the literal.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [34]:
class MLPModel(nn.Module):
    """MLP classifier over the flattened, frozen embeddings of a fixed-length sequence.

    Each token id is looked up in a frozen embedding table, the per-token
    vectors are concatenated into one long vector, and that vector is passed
    through a stack of Linear -> LayerNorm -> ReLU -> Dropout blocks followed
    by a final Linear layer that produces raw scores (no sigmoid is applied).

    Args:
        embedding_matrix: 2-D array of shape (vocab_size, embedding_dim) with
            the pre-trained vectors; row 0 is treated as the padding row.
        hidden_dims: Width of each hidden block, in order. May be empty, in
            which case the model is a single Linear layer.
        output_dim: Number of output units.
        seq_len: Number of tokens per input sequence. When omitted, the
            notebook-level ``max_len`` is used.
    """

    def __init__(self, embedding_matrix, hidden_dims=(512, 256, 128), output_dim=1, seq_len=None):
        super().__init__()

        # Embedding Layer with frozen weights
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,
            padding_idx=0,
        )

        if seq_len is None:
            seq_len = max_len  # fall back to the notebook-level sequence length
        self.seq_len = seq_len

        # The first Linear layer sees all token embeddings concatenated.
        in_features = embedding_matrix.shape[1] * seq_len

        # Layers are created in a fixed order so their indices inside the
        # Sequential container (and therefore the state_dict keys) stay stable.
        layers = []
        for depth, width in enumerate(hidden_dims):
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.LayerNorm(width))
            layers.append(nn.ReLU())
            # Heavier dropout right after the very wide input projection.
            layers.append(nn.Dropout(0.5 if depth == 0 else 0.2))
            in_features = width

        # Output layer
        layers.append(nn.Linear(in_features, output_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) raw scores."""
        embedded = self.embedding(x)                     # (batch, seq_len, embedding_dim)
        flattened = embedded.view(embedded.size(0), -1)  # (batch, seq_len * embedding_dim)
        return self.model(flattened)

In [35]:
# Load the saved model
# The architecture must match the one used when the checkpoint was written,
# otherwise load_state_dict reports missing/unexpected keys or shape mismatches.
model = MLPModel(
    embedding_matrix=embedding_matrix,
    hidden_dims=[256, 128, 64],
    output_dim=1,
).to(device)

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers
# instead of executing arbitrary pickled objects from the file.
state_dict = torch.load('best_mlp_model.pth', map_location=device, weights_only=True)
model.load_state_dict(state_dict)


  model.load_state_dict(torch.load('best_mlp_model.pth'))


<All keys matched successfully>

## Test the model

In [36]:
# Switch to inference behaviour (dropout off) and score the whole test set
model.eval()
test_preds, test_labels = [], []

with torch.no_grad():
    for batch_texts, batch_labels in test_loader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device)

        # Drop the singleton output dimension so predictions line up with the labels
        logits = model(batch_texts).squeeze(1)
        # Sigmoid turns raw scores into probabilities; rounding gives hard 0/1 predictions
        batch_preds = logits.sigmoid().round().cpu().numpy()

        test_preds.extend(batch_preds)
        test_labels.extend(batch_labels.cpu().numpy())

test_acc = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {test_acc}")

Test Accuracy: 0.8686611029472925
