In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import time
from tqdm.notebook import trange, tqdm

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch import Tensor
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.cuda.amp import autocast, GradScaler

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Opt in to TF32 for CUDA matmuls (see the PyTorch CUDA notes for the precision trade-off).
torch.backends.cuda.matmul.allow_tf32 = True
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
device

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the project folder is entered from there in the next cell.
drive.mount("/content/drive")

In [None]:
# Work from the project folder on the mounted Drive; the relative dataset and
# checkpoint paths used in later cells resolve against this directory.
%cd /content/drive/My Drive/Machine Learning/fake-news-detection/truth-guard-model

In [None]:
import pandas as pd

# Kaggle Fake and Real News Dataset
# Each file holds one class; attach the target while loading:
# veracity 0.0 for Fake.csv rows, 1.0 for True.csv rows.
fake_df = pd.read_csv("kaggle/Fake.csv")[["title", "text"]].assign(veracity=0.0)
real_df = pd.read_csv("kaggle/True.csv")[["title", "text"]].assign(veracity=1.0)

# Stack both classes into a single frame with a fresh 0..n-1 index.
kaggle_df = pd.concat([fake_df, real_df], ignore_index=True)

In [None]:
# Display the combined Kaggle frame (title, text, veracity).
kaggle_df

In [None]:
# Liar2 Dataset
# Only the claim text (`statement`) and its `label` column are kept from each split.
train_df = pd.read_csv("liar2/train.csv")[["statement", "label"]]
val_df = pd.read_csv("liar2/valid.csv")[["statement", "label"]]

# Train and validation rows are pooled into one frame. The label is divided by 5
# to form the `veracity` target (presumably labels run 0-5, giving [0, 1] - TODO confirm),
# and `statement` is exposed under the shared `text` column name.
train_liar2 = (
    pd.concat([train_df, val_df], ignore_index=True)
    .assign(
        veracity=lambda frame: frame["label"] / 5,
        text=lambda frame: frame["statement"],
    )
    .drop(columns=["statement", "label"])
)

# Same transformation for the LIAR2 test split.
test_liar2 = (
    pd.read_csv("liar2/test.csv")[["statement", "label"]]
    .assign(
        veracity=lambda frame: frame["label"] / 5,
        text=lambda frame: frame["statement"],
    )
    .drop(columns=["statement", "label"])
)

In [None]:
# Display the pooled LIAR2 train+validation frame (veracity, text).
train_liar2

In [None]:
from sklearn.model_selection import train_test_split

# Pool every source (including LIAR2's own test split) and draw a fresh 90/10 split.
# NOTE: this rebinds `train_df`, which earlier held the raw LIAR2 training CSV.
all_df = pd.concat([train_liar2, test_liar2, kaggle_df], ignore_index=True)
# Stratify on the binarized target (veracity > 0.5) so both splits keep the same
# proportion of rows on each side of that threshold; the seed is fixed for reproducibility.
train_df, test_df = train_test_split(
    all_df, test_size=0.1, stratify=(all_df["veracity"] > 0.5), random_state=42
)

In [None]:
# Display the training split.
train_df

In [None]:
# Display the held-out test split.
test_df

In [None]:
def compute_token_stats(df, tokenizer, add_special_tokens=True):
    """Tokenize every row of ``df["text"]`` and summarize the token counts.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` entry
            when called on a single text.
        add_special_tokens: Forwarded to the tokenizer call.

    Returns:
        ``(max_len, avg_len, lengths)``: the longest count as ``int``, the mean
        count as ``float``, and the per-row counts as a Series.
    """

    def _token_count(txt):
        # No truncation or padding: the raw sequence length is what is measured.
        encoded = tokenizer(
            txt,
            add_special_tokens=add_special_tokens,
            truncation=False,
            padding=False,
        )
        return len(encoded["input_ids"])

    # Registers `progress_apply` on pandas objects so the pass shows a progress bar.
    tqdm.pandas()
    lengths = df["text"].progress_apply(_token_count)

    return int(lengths.max()), float(lengths.mean()), lengths

In [None]:
# Tokenizer matching the `distilbert-base-uncased` checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [None]:
# Untruncated token-length statistics for each split.
train_max, train_avg, train_lengths = compute_token_stats(train_df, tokenizer)
print(f"Train: longest = {train_max} tokens, avg = {train_avg:.2f} tokens")

test_max, test_avg, test_lengths = compute_token_stats(test_df, tokenizer)
# Label fixed: the original printed "TestL:".
print(f"Test: longest = {test_max} tokens, avg = {test_avg:.2f} tokens")

In [None]:
# Sequence length every example is padded/truncated to in the cells below.
# NOTE(review): presumably chosen to match DistilBERT's maximum input length - confirm.
MAX_LEN = 512
# Bare expression: displays the tokenizer's vocabulary size.
tokenizer.vocab_size

In [None]:
# Print the ids of the special tokens; the trailing comments are the values
# noted by the author for this tokenizer.
print("[PAD] token id:", tokenizer.pad_token_id)  # 0
print("[CLS] token id:", tokenizer.cls_token_id)  # 101
print("[SEP] token id:", tokenizer.sep_token_id)  # 102

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset yielding tokenized text plus a float veracity target.

    Args:
        df: DataFrame with a ``text`` column and a numeric ``veracity`` column.
        tokenizer: Optional callable with the Hugging Face tokenizer call
            signature. When omitted, the notebook-level ``tokenizer`` is looked
            up each time an item is fetched (the original behavior).
        max_length: Optional length every example is padded/truncated to.
            When omitted, the notebook-level ``MAX_LEN`` is used.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (leading batch
    dimension of the tokenizer output removed) and ``labels`` (0-d float tensor).
    """

    def __init__(self, df, tokenizer=None, max_length=None):
        self.texts = df["text"].tolist()
        self.labels = df["veracity"].tolist()
        # Kept as given; None means "use the notebook globals", resolved lazily.
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # Explicitly injected values win; otherwise fall back to the module-level
        # names so existing `FakeNewsDataset(df)` callers behave exactly as before.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        max_len = self.max_length if self.max_length is not None else MAX_LEN

        enc = tok(
            self.texts[i],
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )

        return {
            # return_tensors="pt" adds a leading batch dimension of 1; drop it so
            # the DataLoader's default collation stacks items into (batch, max_len).
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[i], dtype=torch.float),
        }

In [None]:
batch_size = 16

# One worker per CPU core. `os.cpu_count()` may return None when the count cannot
# be determined, and DataLoader requires an int, so fall back to 0 (load in the
# main process) in that case.
train_dataloader = DataLoader(
    FakeNewsDataset(train_df),
    batch_size=batch_size,
    shuffle=True,  # reshuffle the training examples every epoch
    num_workers=os.cpu_count() or 0,
    pin_memory=True,
)

test_dataloader = DataLoader(
    FakeNewsDataset(test_df),
    batch_size=batch_size,
    shuffle=False,  # fixed order for evaluation
    num_workers=os.cpu_count() or 0,
    pin_memory=True,
)

# Number of batches per split.
len(train_dataloader), len(test_dataloader)

In [None]:
# Pull a single batch and print the tensor shapes as a sanity check on the collation.
batch = next(iter(train_dataloader))
print(batch["input_ids"].shape, batch["attention_mask"].shape, batch["labels"].shape)

In [None]:
# Pretrained DistilBERT with a classification head; num_labels=1 requests a single
# output per example (the cells below squeeze that last dimension from `.logits`).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
).to(device)

In [None]:
# Two hand-written headlines used as a quick forward-pass sanity check.
texts = [
    "Breaking: Scientists discover cure for common cold!",
    "Study finds no link between vaccines and autism.",
]

encodings = tokenizer(
    texts,
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="pt",
).to(device)

# Inference only: no gradients are needed here, so skip autograd bookkeeping,
# as the evaluation helpers in this notebook do.
with torch.inference_mode():
    logits = model(
        input_ids=encodings.input_ids,
        attention_mask=encodings.attention_mask,
    ).logits.squeeze(-1)

print(logits)

In [None]:
# Count only parameters that receive gradients (requires_grad=True).
print(
    f"The model has {(sum(p.numel() for p in model.parameters() if p.requires_grad)):,} trainable parameters"
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Hyperparameters read by the training cells below.
lr = 2e-5
epochs = 5
clip = 1  # max gradient norm passed to clip_grad_norm_ in train()

optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, weight_decay=0.01)
# Operates on raw logits; the targets are the float `veracity` values.
loss_fn = nn.BCEWithLogitsLoss()
# Loss scaler used together with autocast() inside train().
scaler = GradScaler()

# train() steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * epochs steps, with the first 10% used for warm-up.
num_steps = len(train_dataloader) * epochs
num_warmup = int(0.1 * num_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup, num_training_steps=num_steps
)

In [None]:
def train(model, iterator, optimizer, loss_fn, clip, epoch):
    """Run one mixed-precision training epoch and return the mean batch loss.

    Relies on the notebook-level ``device``, ``scaler`` (GradScaler) and
    ``scheduler`` objects in addition to its arguments.

    Args:
        model: Model whose forward output exposes ``.logits``.
        iterator: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer updating ``model``'s parameters.
        loss_fn: Callable ``(logits, labels) -> loss``.
        clip: Maximum gradient norm for clipping.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0

    pbar = tqdm(
        iterator,
        total=len(iterator),
        desc=f"Epoch {epoch + 1} Progress",
        colour="#005500",
    )
    for batch in pbar:
        src = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        with autocast():
            # Forward pass
            outputs = model(
                input_ids=src,
                attention_mask=mask,
            )
            logits = outputs.logits.squeeze(-1)  # shape: (batch_size)

            # Calculate the loss
            loss = loss_fn(logits, labels)

        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        # GradScaler skips optimizer.step() when the gradients contain inf/NaN and
        # then lowers its scale in update(). Only advance the LR schedule when the
        # optimizer actually stepped, and do it after update().
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        # Read the loss back once and reuse it for both the total and the progress bar.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        pbar.set_postfix(loss=batch_loss)  # Update the loss on the tqdm progress bar

    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, model_path, iterator, loss_fn):
    """Load a checkpoint into ``model`` and return its mean batch loss on ``iterator``.

    Args:
        model: Model to receive the weights; its forward output exposes ``.logits``.
        model_path: Path of a state-dict file saved with ``torch.save``.
        iterator: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        loss_fn: Callable ``(logits, labels) -> loss``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint)
    model.eval()

    batch_losses = []
    with torch.inference_mode():
        for batch in tqdm(iterator):
            # One logit per example after dropping the trailing size-1 dimension.
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits.squeeze(-1)
            targets = batch["labels"].to(device)

            batch_losses.append(loss_fn(logits, targets).item())

    return sum(batch_losses) / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as ints; fractional seconds are truncated.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# NOTE(review): `best_valid_loss` is not read anywhere in the visible notebook.
best_valid_loss = float("inf")
model_path = "truth_guard_model.pt"

# Warm-start from an existing checkpoint if one is present in the working directory.
# NOTE(review): the training loop below saves to `truth_guard_model_epoch_{n}.pt`,
# so this file is presumably placed here manually - confirm.
if os.path.exists(model_path):
    print(f"Loading model from {model_path}...")
    model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
# Flip to False to skip training and only run the evaluation cells on saved checkpoints.
should_train = True

if should_train:
    for epoch in tqdm(range(epochs), desc="Training progress", colour="#00ff00"):
        # Time only the training pass itself.
        start_time = time.time()
        train_loss = train(
            model=model,
            iterator=train_dataloader,
            optimizer=optimizer,
            loss_fn=loss_fn,
            clip=clip,
            epoch=epoch,
        )
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep one checkpoint per epoch so each can be evaluated separately afterwards.
        torch.save(model.state_dict(), f"truth_guard_model_epoch_{epoch + 1}.pt")

        message = f"Epoch: {epoch + 1} | Time: {epoch_mins}m {epoch_secs}s --> STORED"
        print(message)
        print(f"Train Loss: {train_loss:.6f}")

In [None]:
# Mean batch loss of the epoch-5 checkpoint on the test split
# (evaluate() loads the given checkpoint into `model` first).
test_loss = evaluate(
    model=model,
    model_path="truth_guard_model_epoch_5.pt",
    iterator=test_dataloader,
    loss_fn=loss_fn,
)

print(f"Test Loss: {test_loss:.6f}")

# Results recorded by the author, presumably by re-running this cell with each
# epoch's checkpoint path:
# Test Losses:
# Epoch 1 - 0.214233
# Epoch 2 - 0.213548
# Epoch 3 - 0.222628
# Epoch 4 - 0.227762
# Epoch 5 - 0.231639

In [None]:
def get_accuracy(model, model_path, iterator):
    """Load a checkpoint and return the binary accuracy at a 0.5 threshold.

    Both the predicted probability (sigmoid of the logit) and the veracity
    target are binarized with ``>= 0.5`` before being compared.

    Args:
        model: Model to receive the weights; its forward output exposes ``.logits``.
        model_path: Path of a state-dict file saved with ``torch.save``.
        iterator: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        Fraction of examples whose binarized prediction matches the binarized target.
    """
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    hits, seen = 0, 0

    with torch.inference_mode():
        for batch in tqdm(iterator):
            targets = batch["labels"].to(device)

            # One logit per example after dropping the trailing size-1 dimension.
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits.squeeze(-1)

            predicted_positive = torch.sigmoid(logits) >= 0.5
            actual_positive = targets >= 0.5

            hits += predicted_positive.eq(actual_positive).sum().item()
            seen += targets.size(0)

    return hits / seen

In [None]:
# Thresholded (>= 0.5) accuracy of the epoch-5 checkpoint on the test split.
test_acc = get_accuracy(
    model=model, model_path="truth_guard_model_epoch_5.pt", iterator=test_dataloader
)

print(f"Test Accuracy: {test_acc:.6f}")

# Results recorded by the author, presumably by re-running this cell with each
# epoch's checkpoint path:
# Test Accuracies:
# Epoch 1 - 0.898762
# Epoch 2 - 0.900383
# Epoch 3 - 0.897583
# Epoch 4 - 0.893604
# Epoch 5 - 0.893457

In [None]:
def get_prob_accuracy(model, model_path, iterator, tolerance: float = 0.2):
    """Load a checkpoint and return the share of predictions within ``tolerance`` of the target.

    A prediction counts as correct when the absolute difference between the
    predicted probability (sigmoid of the logit) and the veracity target is
    at most ``tolerance``.

    Args:
        model: Model to receive the weights; its forward output exposes ``.logits``.
        model_path: Path of a state-dict file saved with ``torch.save``.
        iterator: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        tolerance: Largest absolute probability error still counted as correct.

    Returns:
        Fraction of examples whose probability error is within ``tolerance``.
    """
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    hits, seen = 0, 0

    with torch.inference_mode():
        for batch in tqdm(iterator):
            targets = batch["labels"].to(device)

            # One logit per example after dropping the trailing size-1 dimension.
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits.squeeze(-1)

            abs_error = torch.abs(torch.sigmoid(logits) - targets)
            within_tolerance = abs_error <= tolerance

            hits += within_tolerance.sum().item()
            seen += targets.size(0)

    return hits / seen

In [None]:
# Share of test examples whose predicted probability is within 0.2 of the
# veracity target, for the epoch-5 checkpoint.
test_prob_acc = get_prob_accuracy(
    model=model,
    model_path="truth_guard_model_epoch_5.pt",
    iterator=test_dataloader,
    tolerance=0.2,
)

print(f"Test Prob Accuracy: {test_prob_acc:.6f}")

# Results recorded by the author, presumably by re-running this cell with each
# epoch's checkpoint path:
# Test Prob Accuracies:
# Epoch 1 - 0.834365
# Epoch 2 - 0.837165
# Epoch 3 - 0.846743
# Epoch 4 - 0.843943
# Epoch 5 - 0.843207

In [None]:
def get_prediction(text, model, device, max_length: int = MAX_LEN):
    """Return the model's predicted probability for ``text``.

    Uses the notebook-level ``tokenizer``. The model is switched to eval mode
    and the forward pass runs under ``torch.inference_mode()``.

    Args:
        text: Input passed straight to the tokenizer.
        model: Model whose forward output exposes ``.logits``.
        device: Device the encoded tensors are moved to before the forward pass.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        Tensor holding the sigmoid of the logits, with the trailing size-1
        dimension removed.
    """
    model.eval()

    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    with torch.inference_mode():
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = output.logits.squeeze(-1)

    prob = torch.sigmoid(logits)
    return prob

In [None]:
# Inspect the first two rows of the test split.
test_df.iloc[0], test_df.iloc[1]

In [None]:
# Text of the test row used for the single-example prediction in the next cell.
test_df.iloc[115]["text"]

In [None]:
# Positional index (iloc) of the test row to score.
test_idx = 115

src_text = test_df.iloc[test_idx]["text"]

# Score the example with the epoch-3 checkpoint (note: this overwrites the weights in `model`).
model.load_state_dict(torch.load("truth_guard_model_epoch_3.pt", map_location=device))
prob = get_prediction(src_text, model, device)

# Veracity scale:
# 0 - fake
# 1 - real

print(f"Real label: {test_df.iloc[test_idx]['veracity']}")
print(f"Predicted Prob: {prob}")

In [None]:
!pip install onnx onnxruntime

In [None]:
import onnx
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

# Rebuild the architecture from the base checkpoint and load the epoch-3
# fine-tuned weights onto the CPU. This rebinds the notebook-level `model`.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
)
model.load_state_dict(torch.load("truth_guard_model_epoch_3.pt", map_location="cpu"))
model.eval()
model = model.to("cpu")

# Rebinds the notebook-level `tokenizer` to a fresh instance of the same checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# NOTE(review): `save_dir` is not used in any visible cell.
save_dir = "./onnx_model"

# Wrap model + tokenizer in a pipeline object, which the converter in the next cell takes as input.
pipeline = transformers.pipeline(
    "text-classification", model=model, tokenizer=tokenizer
)
# NOTE(review): repeats the earlier `.to("cpu")`; presumably a safeguard in case
# building the pipeline moved the model - confirm whether it is needed.
model = model.to("cpu")

In [None]:
# Export the pipeline's model to `truth_guard.onnx` using ONNX opset 14; gradients
# are not needed for the export, hence no_grad.
# NOTE(review): `transformers.convert_graph_to_onnx` is, as far as I know, deprecated
# in recent transformers releases - confirm against the installed version.
with torch.no_grad():
    onnx_convert.convert_pytorch(
        pipeline, opset=14, output=Path("truth_guard.onnx"), use_external_format=False
    )

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamic quantization of the exported graph: reads `truth_guard.onnx` and writes
# `truth_guard_int8.onnx` with weights stored as unsigned 8-bit integers.
quantize_dynamic(
    "truth_guard.onnx", "truth_guard_int8.onnx", weight_type=QuantType.QUInt8
)

In [None]:
# tokenizer.save_pretrained("./tokenizer")