In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import random

from MLPModel import MLPModel

In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Load the preprocessed data from CSV files
new_data = pd.read_csv("../WELFake_Dataset.csv")

# split the new data into train and test
new_train_data, new_test_data = train_test_split(new_data, test_size=0.2)

X_train = new_train_data['text']
X_train = pd.Series(X_train).fillna("").tolist()
y_train = new_train_data['label']

X_test = new_test_data['text']
X_test = pd.Series(X_test).fillna("").tolist()
y_test = new_test_data['label']

In [4]:
# Load pre-trained Word2Vec model
word2vec = KeyedVectors.load_word2vec_format("../GoogleNews-vectors-negative300.bin.gz", binary=True)


In [5]:
# Create a vocabulary
# Load the training vocabulary and embedding matrix
trained_state = torch.load('fine_tuned_model.pth')
original_embedding_weights = trained_state['embedding.weight']
vocab_size = original_embedding_weights.shape[0]

# Create vocabulary with same words as original model
embedding_dim = 300
vocab = {"<PAD>": 0, "<UNK>": 1}  # Special tokens
embedding_matrix = [np.zeros(embedding_dim), np.random.uniform(-0.01, 0.01, embedding_dim)]

# First add all words from original embedding matrix
for i in range(2, vocab_size):  # Skip PAD and UNK tokens
    vocab[f"word_{i}"] = len(vocab)
    embedding_matrix.append(original_embedding_weights[i].numpy())

embedding_matrix = np.array(embedding_matrix)

# Build vocabulary from Word2Vec
for text in X_train:
    for word in text.split():
        if word not in vocab and word in word2vec:
            vocab[word] = len(vocab)
            embedding_matrix.append(word2vec[word])

embedding_matrix = np.array(embedding_matrix)
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}")

  trained_state = torch.load('fine_tuned_model.pth')


MemoryError: Unable to allocate 465. MiB for an array with shape (203269, 300) and data type float64

In [None]:
# Tokenize and convert text to sequences
def text_to_sequence(text, vocab, max_len=1000):
    sequence = [vocab.get(word, vocab["<UNK>"]) for word in text.split()]
    if len(sequence) < max_len:
        sequence.extend([vocab["<PAD>"]] * (max_len - len(sequence)))
    return sequence[:max_len]

# Apply tokenization
max_len = 1000
X_test_seq = [text_to_sequence(text, vocab, max_len) for text in X_test]

In [None]:
def augment_text(text):
    words = text.split()
    # Randomly drop some words (with 20% probability)
    words = [w for w in words if random.random() > 0.2]
    # If we dropped all words (unlikely but possible), return original text
    if not words:
        return text
    return ' '.join(words)

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, is_training=False):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)
        self.is_training = is_training
        
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, idx):
        if self.is_training:
            # Get the original sequence
            sequence = self.texts[idx].tolist()  # Convert tensor to list
            # Remove padding tokens for augmentation
            sequence = [x for x in sequence if x != vocab["<PAD>"]]
            # Augment
            if len(sequence) > 0:  # Only augment if we have tokens
                sequence = [x for x in sequence if random.random() > 0.2]
            # Re-pad the sequence
            if len(sequence) < max_len:
                sequence.extend([vocab["<PAD>"]] * (max_len - len(sequence)))
            sequence = sequence[:max_len]
            # Convert back to tensor
            text = torch.tensor(sequence, dtype=torch.long)
        else:
            text = self.texts[idx]
            
        return text, self.labels[idx]

In [None]:
# Create Dataset and DataLoader
batch_size = 32
test_dataset = TextDataset(X_test_seq, y_test, is_training=False)

test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Create model with the same embedding matrix as training
model = MLPModel(
    embedding_matrix=embedding_matrix,
    hidden_dims=[256, 128, 64],
    output_dim=1
).to(device)

# Load the trained weights
model.load_state_dict(torch.load('fine_tuned_model.pth'))

## Test the model

In [None]:
X_test_seq = [text_to_sequence(text, vocab, max_len) for text in X_test]
test_dataset = TextDataset(X_test_seq, y_test, is_training=False)
test_loader = DataLoader(test_dataset, batch_size=32)

model.eval()
with torch.no_grad():
    test_preds = []
    test_labels = []
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts).squeeze(1)  # Ensure outputs have the same shape as labels
        preds = torch.round(torch.sigmoid(outputs)).cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(labels.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)
    print(f"Test Accuracy: {test_acc}")