In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from tqdm import tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cpu


In [4]:
fake_df = pd.read_csv('archive2/Fake.csv')
true_df = pd.read_csv('archive2/True.csv')

fake_df['label'] = 0
true_df['label'] = 1

df = pd.concat([fake_df, true_df], ignore_index=True)

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [5]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

max_vocab_size = 10000
max_len = 500

def tokenize(text):
    return word_tokenize(text.lower())

# Build vocab
counter = Counter()
for text in X_train:
    counter.update(tokenize(text))

most_common = counter.most_common(max_vocab_size - 2)
vocab = {"<PAD>": 0, "<UNK>": 1}
for i, (word, _) in enumerate(most_common, start=2):
    vocab[word] = i

def encode(text):
    return [vocab.get(token, vocab["<UNK>"]) for token in tokenize(text)]

# Encode and pad
def process_data(texts):
    sequences = [torch.tensor(encode(t))[:max_len] for t in texts]
    return pad_sequence(sequences, batch_first=True, padding_value=0)

X_train_pad = process_data(X_train)
X_test_pad = process_data(X_test)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\azizd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    def __len__(self):
        return len(self.y)

train_ds = NewsDataset(X_train_pad, y_train_tensor)
test_ds = NewsDataset(X_test_pad, y_test_tensor)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=128)


In [9]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        out = self.dropout(hidden[-1])
        return torch.sigmoid(self.fc(out)).squeeze(1)


In [10]:
model = LSTMModel(len(vocab)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.01)

train_loss_list = []
val_loss_list = []

for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.float().to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    train_loss_list.append(total_loss / len(train_loader))

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.float().to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
    val_loss_list.append(val_loss / len(test_loader))
    print(f"Epoch {epoch+1}, Train Loss: {train_loss_list[-1]:.4f}, Val Loss: {val_loss_list[-1]:.4f}")


Epoch 1, Train Loss: 0.6909, Val Loss: 0.6903
Epoch 2, Train Loss: 0.6911, Val Loss: 0.6909
Epoch 3, Train Loss: 0.6917, Val Loss: 0.6915
Epoch 4, Train Loss: 0.6920, Val Loss: 0.6916
Epoch 5, Train Loss: 0.6922, Val Loss: 0.6917


In [11]:
model.eval()
all_preds = []
all_probs = []
with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).cpu()
        all_probs.extend(outputs)
        preds = (outputs > 0.5).int()
        all_preds.extend(preds)

accuracy = accuracy_score(y_test, all_preds)
precision = precision_score(y_test, all_preds)
recall = recall_score(y_test, all_preds)
f1 = f1_score(y_test, all_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.5271
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
