In [1]:
# =====================================================
# BASELINE: LSTM (Bi-directional) - WELFake
# Framework: PyTorch
# =====================================================

import os, re, time, pickle, psutil
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. SETUP & CONFIG
# Mount Google Drive only when the mount point does not exist yet.
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError as mount_error:
        # Best effort: keep the run going, but report the failure instead of
        # hiding it. Without a mounted Drive, OUTPUT_DIR below is still
        # created on whatever filesystem backs that path (presumably the
        # local runtime disk -- verify), so saved artifacts may not persist.
        print(f"WARNING: could not mount Google Drive: {mount_error}")

# Directory that receives the saved model weights and the pickled vocabulary.
OUTPUT_DIR = "/content/drive/MyDrive/WELFake_LSTM_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Hyperparameters
MAX_VOCAB_SIZE = 20000  # Keep only the 20k most frequent words
MAX_SEQ_LEN = 300       # Maximum length of each text in tokens (truncate or pad)
EMBEDDING_DIM = 100     # Size of each word vector
HIDDEN_DIM = 128        # Number of units in the LSTM hidden layer
BATCH_SIZE = 64
EPOCHS = 5
LEARNING_RATE = 0.001
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Device: {DEVICE}")

# 2. LOAD & CLEAN DATA
print("\n‚è≥ ƒêang t·∫£i dataset WELFake...")
# Load the "davanstrien/WELFake" dataset and turn its "train" split into a
# pandas DataFrame; the columns used later are 'title', 'text' and 'label'.
dataset = load_dataset("davanstrien/WELFake")
df = pd.DataFrame(dataset["train"])

def clean_text_dl(s):
    """Normalize raw text before whitespace tokenization.

    Non-string input yields "". Strings are lowercased, then stripped of
    http(s) URLs, of `<...>` spans, and of every character that is not
    a-z, 0-9 or whitespace. Finally whitespace runs collapse to a single
    space and both ends are trimmed.
    """
    if not isinstance(s, str):
        return ""

    cleaned = s.lower()
    # Applied in order: URLs and tags must be removed while their
    # punctuation is still present.
    substitutions = (
        (r'https?://\S+', ''),
        (r'<.*?>', ''),
        (r'[^a-z0-9\s]', ''),  # keep only lowercase letters and digits
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

print("üßπ Pre-processing...")
# Join title and text (missing values become ''), then clean the result.
df['content'] = (df['title'].fillna('') + " " + df['text'].fillna('')).apply(clean_text_dl)
# Keep only rows whose cleaned content is longer than 50 characters.
df = df[df['content'].str.len() > 50]

# 3. SPLIT DATA
# 80/20 train/test split, stratified on the label, with a fixed random seed.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['content'].values, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']
)

# 4. TOKENIZER & VOCABULARY BUILDING
print("\n‚öôÔ∏è ƒêang x√¢y d·ª±ng b·ªô t·ª´ v·ª±ng (Vocabulary)...")

# Count whitespace-separated tokens over the training texts only.
word_counts = Counter(
    token for document in X_train_text for token in document.split()
)

# Two ids are reserved for <PAD> and <UNK>, so only the MAX_VOCAB_SIZE - 2
# most frequent words receive an id; word ids start at 2.
common_words = word_counts.most_common(MAX_VOCAB_SIZE - 2)
vocab = {word: index for index, (word, _) in enumerate(common_words, start=2)}
vocab.update({'<PAD>': 0, '<UNK>': 1})

print(f"K√≠ch th∆∞·ªõc b·ªô t·ª´ v·ª±ng: {len(vocab)}")

# Encode a text as a list of integer token ids
def encode_text(text, vocab, max_len):
    """Convert `text` to vocabulary ids, truncated or right-padded to `max_len`.

    Args:
        text: String that is split on whitespace into tokens.
        vocab: Mapping from token to id; must contain '<UNK>' and '<PAD>'.
        max_len: Target length of the returned list.

    Returns:
        A list of ids: the first `max_len` ids when the text is longer,
        otherwise the ids followed by '<PAD>' ids up to `max_len`.
    """
    # Tokens missing from the vocabulary fall back to the <UNK> id.
    ids = [vocab.get(word, vocab['<UNK>']) for word in text.split()]

    if len(ids) > max_len:
        # Too long: keep only the leading part.
        return ids[:max_len]

    # Short or exact-length input: fill the remainder with the <PAD> id.
    missing = max_len - len(ids)
    return ids + [vocab['<PAD>']] * missing

# 5. CUSTOM DATASET CLASS
class FakeNewsDataset(Dataset):
    """Dataset of (text, label) pairs that encodes each text on access.

    Items are returned as a pair of tensors: the token ids produced by
    `encode_text` (dtype long) and the label (dtype float).
    """

    def __init__(self, texts, labels, vocab, max_len):
        # Indexable collections of raw texts and their labels.
        self.texts, self.labels = texts, labels
        # Encoding settings forwarded to `encode_text` for every item.
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        token_ids = encode_text(raw_text, self.vocab, self.max_len)
        ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(raw_label, dtype=torch.float)
        return ids_tensor, label_tensor

# Create the datasets and DataLoaders
train_dataset = FakeNewsDataset(X_train_text, y_train, vocab, MAX_SEQ_LEN)
test_dataset = FakeNewsDataset(X_test_text, y_test, vocab, MAX_SEQ_LEN)

# Training batches are shuffled; the test loader keeps the original order and
# uses twice the training batch size.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 2, shuffle=False)

# 6. MODEL ARCHITECTURE (LSTM)
class LSTMClassifier(nn.Module):
    """Embedding -> (optionally bidirectional, stacked) LSTM -> linear head.

    The classifier feeds the final hidden state of the last LSTM layer
    (both directions concatenated when bidirectional) through dropout and
    a linear layer, returning `output_dim` raw scores per input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        # Embedding layer: maps token ids to vectors; id 0 is the padding index.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM's own dropout argument is only set when layers are stacked.
        inter_layer_dropout = dropout if n_layers > 1 else 0
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # Fully connected head: its input is twice as wide when the two
        # directions are concatenated.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        # Dropout (not an activation) applied to the embeddings and to the
        # hidden state that feeds the head.
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: [batch size, sent len]
        embedded = self.dropout(self.embedding(text))

        # hidden: [num layers * num directions, batch size, hid dim];
        # the per-step outputs and the cell state are not used.
        _, (hidden, _) = self.lstm(embedded)

        if not self.lstm.bidirectional:
            # Single direction: the last entry is the final hidden state.
            return self.fc(self.dropout(hidden[-1]))

        # Bidirectional: join the last two entries (forward and backward
        # direction of the top layer) along the feature dimension.
        both_directions = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(self.dropout(both_directions))

# Instantiate the model
model = LSTMClassifier(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=1,  # one raw logit per text
    n_layers=2,
    bidirectional=True,
    dropout=0.3
)

# Move the parameters to the selected device before creating the optimizer.
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss() # Binary cross-entropy combined with a sigmoid

# 7. TRAINING LOOP
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    Args:
        preds: Tensor of raw logits.
        y: Tensor of 0/1 targets with the same shape as `preds`.

    Returns:
        A 0-dim float tensor with the accuracy (NaN for empty input).
    """
    # Logits -> probabilities -> hard 0/1 decisions.
    rounded_preds = torch.round(torch.sigmoid(preds))
    # Average over every element instead of dividing the hit count by the
    # size of the first dimension only, so inputs with more than one
    # dimension are scored correctly; 1-D results are unchanged.
    return (rounded_preds == y).float().mean()

print(f"\nüöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán LSTM tr√™n {DEVICE}...")

# One pass over train_loader per epoch; no validation is run inside the loop.
for epoch in range(EPOCHS):
    start_time = time.time()

    # Train
    model.train()
    # Running sums of the per-batch loss and accuracy for this epoch.
    train_loss = 0
    train_acc = 0

    for text, label in train_loader:
        text, label = text.to(DEVICE), label.to(DEVICE)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # The model returns [batch, 1]; drop the last dim to match the labels.
        predictions = model(text).squeeze(1)

        loss = criterion(predictions, label)
        acc = binary_accuracy(predictions, label)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += acc.item()

    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    # Reported loss/accuracy are means of the per-batch values, so every
    # batch counts equally regardless of its size.
    print(f'Epoch: {epoch+1:02} | Time: {int(epoch_mins)}m {int(epoch_secs)}s')
    print(f'\tTrain Loss: {train_loss / len(train_loader):.3f} | Train Acc: {train_acc / len(train_loader)*100:.2f}%')

# 8. EVALUATION (HuggingFace Style)
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")

model.eval()
# Per-sample hard predictions, true labels and probabilities, accumulated
# across all test batches.
all_preds = []
all_labels = []
all_probs = []

start_eval = time.time()

# Gradients are not needed for evaluation.
with torch.no_grad():
    for text, label in test_loader:
        text, label = text.to(DEVICE), label.to(DEVICE)
        predictions = model(text).squeeze(1)
        # Logits -> probabilities; rounding them gives the 0/1 decision.
        prob = torch.sigmoid(predictions)

        all_probs.extend(prob.cpu().numpy())
        all_preds.extend(torch.round(prob).cpu().numpy())
        all_labels.extend(label.cpu().numpy())

end_eval = time.time()
# Wall-clock time of the whole test pass and the resulting throughput.
runtime = end_eval - start_eval
samples_per_second = len(all_labels) / runtime

# Metrics
accuracy = accuracy_score(all_labels, all_preds)
# Precision/recall/F1 use the 'weighted' average; the fourth returned value
# is not needed.
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='weighted'
)
# AUC is computed from the probabilities, not the rounded predictions.
auc = roc_auc_score(all_labels, all_probs)

# Loss and steps/second are not measured by this test loop, so they are
# reported as placeholder strings.
eval_results = dict(
    eval_accuracy=accuracy,
    eval_precision=precision,
    eval_recall=recall,
    eval_f1=f1,
    eval_auc=auc,
    eval_loss='N/A (Test Loop)',
    eval_runtime=runtime,
    eval_samples_per_second=samples_per_second,
    eval_steps_per_second='N/A',
)

banner = "=" * 50
print(f"\n{banner}")
print("üìä K·∫æT QU·∫¢ LSTM BASELINE:")
print(banner)
print(eval_results)
print(banner)

# 9. SAVE
# Store the model weights and the vocabulary side by side in OUTPUT_DIR; the
# vocabulary is needed to encode text the same way when the weights are reused.
weights_path = os.path.join(OUTPUT_DIR, "lstm_model.pth")
vocab_path = os.path.join(OUTPUT_DIR, "vocab.pkl")

torch.save(model.state_dict(), weights_path)
with open(vocab_path, "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)
print("‚úÖ ƒê√£ l∆∞u model!")

Mounted at /content/drive
Device: cuda

‚è≥ ƒêang t·∫£i dataset WELFake...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-290868f0a36350(‚Ä¶):   0%|          | 0.00/152M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/72134 [00:00<?, ? examples/s]

üßπ Pre-processing...

‚öôÔ∏è ƒêang x√¢y d·ª±ng b·ªô t·ª´ v·ª±ng (Vocabulary)...
K√≠ch th∆∞·ªõc b·ªô t·ª´ v·ª±ng: 20000

üöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán LSTM tr√™n cuda...
Epoch: 01 | Time: 0m 38s
	Train Loss: 0.219 | Train Acc: 91.06%
Epoch: 02 | Time: 0m 38s
	Train Loss: 0.148 | Train Acc: 94.43%
Epoch: 03 | Time: 0m 38s
	Train Loss: 0.096 | Train Acc: 96.45%
Epoch: 04 | Time: 0m 38s
	Train Loss: 0.064 | Train Acc: 97.69%
Epoch: 05 | Time: 0m 39s
	Train Loss: 0.048 | Train Acc: 98.26%

üéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...

üìä K·∫æT QU·∫¢ LSTM BASELINE:
{'eval_accuracy': 0.9787767030825969, 'eval_precision': 0.9789217638661222, 'eval_recall': 0.9787767030825969, 'eval_f1': 0.9787702264467999, 'eval_auc': np.float64(0.9976278503596667), 'eval_loss': 'N/A (Test Loop)', 'eval_runtime': 4.406351089477539, 'eval_samples_per_second': 3261.428721446699, 'eval_steps_per_second': 'N/A'}
‚úÖ ƒê√£ l∆∞u model!
