## Step 1: Load the Required Libraries
In this step, we import the necessary libraries, including PyTorch for building the model,
and other utilities for data preprocessing, loading, and splitting.

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors 

In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


## Step 2: Load and Preprocess the Dataset
Here, we load the dataset from the CSV files and preprocess it for training the model.


In [3]:
# Load the preprocessed data from CSV files
train_data = pd.read_csv("train.csv")
val_data = pd.read_csv("val.csv")
test_data = pd.read_csv("test.csv")

X_train = train_data['text']
y_train = train_data['label']

X_val = val_data['text']
y_val = val_data['label']

X_test = test_data['text']
y_test = test_data['label']

## Step 3: Load Pre-trained Word2Vec Embeddings
We use pre-trained Word2Vec embeddings to represent words as dense vectors.
These embeddings improve the performance of the model by leveraging semantic relationships between words.

In [4]:
# Load pre-trained Word2Vec model
word2vec = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [5]:
# Create a vocabulary
embedding_dim = 300
vocab = {"<PAD>": 0, "<UNK>": 1}  # Special tokens
embedding_matrix = [np.zeros(embedding_dim), np.random.uniform(-0.01, 0.01, embedding_dim)]  # Initialize <PAD> and <UNK>

In [6]:
# Build vocabulary from Word2Vec
for text in X_train:
    for word in text.split():
        if word not in vocab and word in word2vec:
            vocab[word] = len(vocab)
            embedding_matrix.append(word2vec[word])

embedding_matrix = np.array(embedding_matrix)
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 76131


#$ Step 4: Tokenize and Pad Sequences
Convert the text into sequences of integers based on the vocabulary. We also pad sequences to ensure they all have the same length for batch processing.

In [7]:
# Tokenize and convert text to sequences
def text_to_sequence(text, vocab, max_len=200):
    sequence = [vocab.get(word, vocab["<UNK>"]) for word in text.split()]
    if len(sequence) < max_len:
        sequence.extend([vocab["<PAD>"]] * (max_len - len(sequence)))
    return sequence[:max_len]

# Apply tokenization
max_len = 200
X_train_seq = [text_to_sequence(text, vocab, max_len) for text in X_train]
X_val_seq = [text_to_sequence(text, vocab, max_len) for text in X_val]
X_test_seq = [text_to_sequence(text, vocab, max_len) for text in X_test]

## Step 5: Create a Dataset and DataLoader
We define a custom Dataset class to handle our data and create DataLoader objects
to efficiently load data during training and validation.

In [8]:
# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [9]:
# Create Dataset and DataLoader
batch_size = 32
train_dataset = TextDataset(X_train_seq, y_train)
val_dataset = TextDataset(X_val_seq, y_val)
test_dataset = TextDataset(X_test_seq, y_test)

## Step 6: Define the MLP Model
The model consists of an embedding layer initialized with Word2Vec embeddings,
followed by an layers, and a fully connected output layer.

In [10]:
class MLPModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_dims=[512, 256, 128], output_dim=1):
        super(MLPModel, self).__init__()
        
        # Embedding Layer with frozen weights
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,
            padding_idx=0
        )
        
        # Calculate input dimension
        input_dim = embedding_matrix.shape[1] * max_len
        
        # Create list to hold all layers
        layers = []
        
        # Input layer
        layers.append(nn.Linear(input_dim, hidden_dims[0]))
        layers.append(nn.LayerNorm(hidden_dims[0]))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(0.2))
        
        # Hidden layers
        for i in range(len(hidden_dims)-1):
            layers.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            layers.append(nn.LayerNorm(hidden_dims[i+1]))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
        
        # Output layer
        layers.append(nn.Linear(hidden_dims[-1], output_dim))
        
        # Combine all layers
        self.model = nn.Sequential(*layers)
        
    def forward(self, x):
        # Get embeddings and flatten
        embedded = self.embedding(x)
        flattened = embedded.view(embedded.size(0), -1)
        
        # Forward pass through all layers
        return self.model(flattened)

## Step 7: Train the Model
Train the model for multiple epochs and validate its performance on the validation set.

In [11]:
def train_model(model, train_loader, val_loader, epochs=15, learning_rate=1e-4):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=0.01
    )
    
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.5,
        patience=2,
        verbose=True
    )
    
    best_val_acc = 0
    patience = 4
    patience_counter = 0
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for i, (texts, labels) in enumerate(train_loader):
            texts = texts.to(device)
            # Ensure labels are float and proper shape
            labels = labels.float().to(device)
            
            optimizer.zero_grad()
            
            # Forward pass and ensure output shape matches labels
            outputs = model(texts).squeeze(-1)  # Change is here
            
            # Ensure shapes match
            if len(outputs.shape) == 0:
                outputs = outputs.unsqueeze(0)
            if len(labels.shape) == 0:
                labels = labels.unsqueeze(0)
                
            loss = criterion(outputs, labels)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            predicted = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            total_loss += loss.item()
            
            if i % 100 == 0:
                print(f'Epoch: {epoch}, Batch: {i}, Loss: {loss.item():.4f}')
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
        
        # Validation phase
        val_acc = evaluate_model(model, val_loader)
        
        print(f'Epoch: {epoch}')
        print(f'Average Loss: {avg_loss:.4f}')
        print(f'Training Accuracy: {train_acc:.4f}')
        print(f'Validation Accuracy: {val_acc:.4f}')
        
        scheduler.step(val_acc)
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_mlp_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print('Early stopping triggered')
                break

def evaluate_model(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for texts, labels in data_loader:
            texts = texts.to(device)
            labels = labels.float().to(device)
            
            outputs = model(texts).squeeze(-1)
            
            if len(outputs.shape) == 0:
                outputs = outputs.unsqueeze(0)
            if len(labels.shape) == 0:
                labels = labels.unsqueeze(0)
                
            predicted = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    return correct / total

In [12]:
# Initialize model
model = MLPModel(
    embedding_matrix=embedding_matrix,
    hidden_dims=[512, 256, 128],
    output_dim=1
).to(device)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Train model
train_model(model, train_loader, val_loader)



Epoch: 0, Batch: 0, Loss: 0.7503
Epoch: 0, Batch: 100, Loss: 0.3085
Epoch: 0, Batch: 200, Loss: 0.1219
Epoch: 0, Batch: 300, Loss: 0.1323
Epoch: 0, Batch: 400, Loss: 0.1635
Epoch: 0, Batch: 500, Loss: 0.1526
Epoch: 0, Batch: 600, Loss: 0.0565
Epoch: 0, Batch: 700, Loss: 0.0176
Epoch: 0
Average Loss: 0.1566
Training Accuracy: 0.9429
Validation Accuracy: 0.9648
Epoch: 1, Batch: 0, Loss: 0.0140
Epoch: 1, Batch: 100, Loss: 0.0100
Epoch: 1, Batch: 200, Loss: 0.0152
Epoch: 1, Batch: 300, Loss: 0.0065
Epoch: 1, Batch: 400, Loss: 0.0247
Epoch: 1, Batch: 500, Loss: 0.0055
Epoch: 1, Batch: 600, Loss: 0.0098
Epoch: 1, Batch: 700, Loss: 0.0048
Epoch: 1
Average Loss: 0.0370
Training Accuracy: 0.9897
Validation Accuracy: 0.9642
Epoch: 2, Batch: 0, Loss: 0.0046
Epoch: 2, Batch: 100, Loss: 0.0376
Epoch: 2, Batch: 200, Loss: 0.0024
Epoch: 2, Batch: 300, Loss: 0.0024
Epoch: 2, Batch: 400, Loss: 0.0024
Epoch: 2, Batch: 500, Loss: 0.0025
Epoch: 2, Batch: 600, Loss: 0.0022
Epoch: 2, Batch: 700, Loss: 0.006

## Step 8: Test the Model

In [13]:
model.eval()
with torch.no_grad():
    test_preds = []
    test_labels = []
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts).squeeze(1)  # Ensure outputs have the same shape as labels
        preds = torch.round(torch.sigmoid(outputs)).cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(labels.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)
    print(f"Test Accuracy: {test_acc}")

Test Accuracy: 0.9671067352875363


In [14]:
def predict_text(model, text, vocab, max_len=200, device='cuda' if torch.cuda.is_available() else 'cpu'):
    # Set model to evaluation mode
    model.eval()
    
    # Tokenize and convert text to sequence
    sequence = [vocab.get(word, vocab["<UNK>"]) for word in text.split()]
    if len(sequence) < max_len:
        sequence.extend([vocab["<PAD>"]] * (max_len - len(sequence)))
    sequence = sequence[:max_len]
    
    # Convert to tensor
    sequence_tensor = torch.tensor([sequence], dtype=torch.long).to(device)
    
    # Get prediction
    with torch.no_grad():
        output = model(sequence_tensor)
        prediction = torch.sigmoid(output.squeeze())
        probability = prediction.item()
        binary_prediction = 1 if probability >= 0.5 else 0
    
    return binary_prediction, probability



In [15]:
# Load the saved model
model = MLPModel(embedding_matrix=embedding_matrix).to(device)
model.load_state_dict(torch.load('best_mlp_model.pth'))


  model.load_state_dict(torch.load('best_mlp_model.pth'))


<All keys matched successfully>

In [18]:
# Example usage
new_text1 = "The president has proposed a bew change"
new_text2 = "Donald trTrunp has proposed a new change"
prediction, probability = predict_text(model, new_text1, vocab)
print(f"Prediction: {'Fake' if prediction == 1 else 'Real'}")
print(f"Confidence: {probability:.2%}")

prediction, probability = predict_text(model, new_text2, vocab)
print(f"Prediction: {'Fake' if prediction == 1 else 'Real'}")
print(f"Confidence: {probability:.2%}")


Prediction: Real
Confidence: 0.99%
Prediction: Fake
Confidence: 99.22%


In [17]:
def analyze_training_bias(train_data, word_of_interest):
    # Filter articles containing the word
    contains_word = train_data[train_data['text'].str.contains(word_of_interest, case=False)]
    
    # Calculate statistics
    total_articles = len(contains_word)
    fake_articles = contains_word[contains_word['label'] == 1].shape[0]
    real_articles = contains_word[contains_word['label'] == 0].shape[0]
    
    print(f"Statistics for articles containing '{word_of_interest}':")
    print(f"Total articles: {total_articles}")
    print(f"Fake news: {fake_articles} ({fake_articles/total_articles*100:.2f}%)")
    print(f"Real news: {real_articles} ({real_articles/total_articles*100:.2f}%)")

# Usage
analyze_training_bias(train_data, "Trump")
analyze_training_bias(train_data, "president")

Statistics for articles containing 'Trump':
Total articles: 12425
Fake news: 7068 (56.89%)
Real news: 5357 (43.11%)
Statistics for articles containing 'president':
Total articles: 14852
Fake news: 6934 (46.69%)
Real news: 7918 (53.31%)
