# Google Colab setup
## A Convolutional Neural Networks approach

In [67]:
# Mount Google Drive inside the Colab VM so the project files stored there can be copied in.
from google.colab import drive
# force_remount=True requests a fresh mount even if /content/drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [68]:
%%capture
!pip install datasets torchmetrics gradio kaggle typing-extensions

In [69]:
# Recursively copy the project folder from the mounted Drive into the current working directory.
# NOTE(review): presumably this supplies data_pipeline, models/, data/ and checkpoints/ used below — confirm.
!cp -r /content/drive/MyDrive/big_data_project/* .

# General setup

In [70]:
# Download the NLTK corpora (stopwords, omw-1.4, wordnet) via the nltk.downloader CLI.
!python -m nltk.downloader stopwords omw-1.4 wordnet

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [72]:
import copy
import os
import re
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import torchtext.data
from nltk.corpus import stopwords
from torchmetrics import Accuracy, Precision, Recall, F1Score, MetricCollection

# Project modules are imported from the working directory, so it must be on sys.path first.
sys.path.append('.')
from data_pipeline import get_dataloaders, text_preprocess, isot_clean, load_dataset
from models import cnn, lstm, bert
from models.cnn import CNN, train, evaluate
from models.lstm import LSTMNet
from models.bert import BERTModel

In [121]:
# Choose the dataset/model pairing the loaders and the text processor are built for.
DATASET_ID = "isot"
MODEL = "cnn"

# get_dataloaders returns ((train, val, test), (tokenizer, vocab)); unpack it in two steps.
data_loaders, text_processor = get_dataloaders(dataset_id=DATASET_ID, model=MODEL)
train_loader, val_loader, test_loader = data_loaders
tokenizer, vocab = text_processor

# CNN

In [122]:
# Keyword arguments for CNN(...); the same dict is stored in the checkpoint so the
# model can be rebuilt at load time.
cnn_model_config = dict(
    vocab_size=len(vocab),
    embedding_dim=100,
    n_filters=100,
    filter_sizes=[3, 4, 5],
    output_dim=1,
    dropout=0.5,
    pad_idx=1,  # NOTE(review): assumes index 1 is the padding token in `vocab` — confirm
)

In [123]:
# Build a fresh (untrained) CNN from the configuration dict above.
cnn_model = CNN(**cnn_model_config)

In [124]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [125]:
# Place the model on the target device, then create the optimizer over its parameters.
cnn_model = cnn_model.to(device)
optimizer = optim.Adam(cnn_model.parameters(), lr=1e-4)

# reduction="none" keeps one loss value per element instead of averaging.
# NOTE(review): presumably so train/evaluate can apply class_weight per sample — confirm.
criterion = nn.BCELoss(reduction="none").to(device)

## Dataset 1 (Fake News)

In [20]:
EPOCHS = 12

# Binary-classification metrics, kept on the same device as the model.
metrics_group = MetricCollection(
    [
        Accuracy(task="binary"),
        Precision(task="binary"),
        Recall(task="binary"),
        F1Score(task="binary")
    ]
).to(device)
# Equal weight for both classes; handed to train/evaluate together with the criterion.
class_weight = torch.tensor([1.0, 1.0]).to(device)

# Track the weights of the epoch with the highest validation F1.
best_score = None
best_state_dict = None

for epoch in range(EPOCHS):
    print(f"Epoch {epoch}")

    # The metric collection is shared between splits, so it is reset after each pass.
    train_loss, train_metrics = train(cnn_model, train_loader, optimizer, criterion, metrics_group, class_weight, device)
    metrics_group.reset()
    valid_loss, valid_metrics = evaluate(cnn_model, val_loader, criterion, metrics_group, class_weight, device)
    metrics_group.reset()

    valid_f1 = valid_metrics["BinaryF1Score"].cpu().item()
    if (best_score is None) or (valid_f1 > best_score):
        best_score = valid_f1
        # state_dict() returns references to the live parameter tensors; without a
        # deep copy the "best" snapshot would be overwritten by every later epoch.
        best_state_dict = copy.deepcopy(cnn_model.state_dict())

    for split_name, split_loss, split_metrics in (
        ("Train", train_loss, train_metrics),
        ("Validation", valid_loss, valid_metrics),
    ):
        print(split_name)
        print(f"Loss: {split_loss:.3f}", end=", ")
        print(", ".join([f"{k}: {v.cpu().item() * 100:.2f}" for k, v in split_metrics.items()]))
    print()

Epoch 0
Train
Loss: 0.677, BinaryAccuracy: 60.03, BinaryPrecision: 59.72, BinaryRecall: 56.14, BinaryF1Score: 57.87
Validation
Loss: 0.493, BinaryAccuracy: 82.95, BinaryPrecision: 86.85, BinaryRecall: 77.80, BinaryF1Score: 82.08

Epoch 1
Train
Loss: 0.463, BinaryAccuracy: 77.31, BinaryPrecision: 77.06, BinaryRecall: 76.32, BinaryF1Score: 76.69
Validation
Loss: 0.359, BinaryAccuracy: 86.14, BinaryPrecision: 81.94, BinaryRecall: 92.86, BinaryF1Score: 87.06

Epoch 2
Train
Loss: 0.360, BinaryAccuracy: 83.44, BinaryPrecision: 83.27, BinaryRecall: 82.78, BinaryF1Score: 83.02
Validation
Loss: 0.302, BinaryAccuracy: 88.18, BinaryPrecision: 86.67, BinaryRecall: 90.35, BinaryF1Score: 88.47

Epoch 3
Train
Loss: 0.305, BinaryAccuracy: 86.41, BinaryPrecision: 86.14, BinaryRecall: 86.05, BinaryF1Score: 86.10
Validation
Loss: 0.266, BinaryAccuracy: 89.63, BinaryPrecision: 87.85, BinaryRecall: 92.08, BinaryF1Score: 89.92

Epoch 4
Train
Loss: 0.268, BinaryAccuracy: 88.36, BinaryPrecision: 87.90, Binary

### Kaggle submission (Dataset 1: evaluation only)

In [21]:
# Collect one predicted label per test id for the Kaggle submission file.
submission_ids = []
submission_labels = []

# Use the weights selected during training and switch to inference mode.
cnn_model.load_state_dict(best_state_dict)
cnn_model.eval()
with torch.no_grad():
    for batch in test_loader:
        # Each test batch is unpacked as (text, _, _, ids); only text and ids are used here.
        text, _, _, ids = batch
        text = text.to(device)
        submission_ids.extend(ids.tolist())
        # reshape(-1) always yields a 1-D tensor. squeeze() would collapse a batch of
        # one sample to a 0-d tensor whose tolist() is a bare int, and extend() would
        # then raise TypeError on the last (possibly single-element) batch.
        predictions = cnn_model(text).round().int().reshape(-1).cpu().tolist()
        submission_labels.extend(predictions)

In [22]:
# Kaggle expects a row for every id in test.csv. Ids that produced no prediction
# (absent from submission_ids) are appended with the default label 1.
df_test = pd.read_csv("./data/Dset1/test.csv")

# Set membership keeps this linear; `in` on the growing list was quadratic.
seen_ids = set(submission_ids)
for idx in df_test["id"]:
    if idx not in seen_ids:
        seen_ids.add(idx)
        submission_ids.append(idx)
        submission_labels.append(1)

In [23]:
# Two-column frame in the layout written to submission.csv: id, label.
submission_columns = {"id": submission_ids, "label": submission_labels}
submission_df = pd.DataFrame(submission_columns)

In [24]:
# Write the submission without the DataFrame index so the file has only the id and label columns.
submission_df.to_csv("submission.csv",index=False)

In [25]:
from google.colab import files
# Upload kaggle.json (Kaggle API credentials) through the Colab file picker.
# The trailing semicolon stops the notebook from echoing upload()'s return value
# (file name -> raw file bytes), which would store the API key in the saved output.
files.upload();

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [26]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [27]:
# Submit submission.csv to the "fake-news" Kaggle competition with a short message.
!kaggle competitions submit -c fake-news -f submission.csv -m "Uploaded from Google Colab"

100% 40.6k/40.6k [00:01<00:00, 24.9kB/s]
Successfully submitted to Fake News

## Dataset 2 (ISOT)

In [126]:
EPOCHS = 3

# Binary-classification metrics, kept on the same device as the model.
metrics_group = MetricCollection(
    [
        Accuracy(task="binary"),
        Precision(task="binary"),
        Recall(task="binary"),
        F1Score(task="binary")
    ]
).to(device)
# Equal weight for both classes; handed to train/evaluate together with the criterion.
class_weight = torch.tensor([1.0, 1.0]).to(device)

# Track the weights of the epoch with the highest validation F1.
best_score = None
best_state_dict = None

for epoch in range(EPOCHS):
    print(f"Epoch {epoch}")

    # The metric collection is shared between splits, so it is reset after each pass.
    train_loss, train_metrics = train(cnn_model, train_loader, optimizer, criterion, metrics_group, class_weight, device)
    metrics_group.reset()
    valid_loss, valid_metrics = evaluate(cnn_model, val_loader, criterion, metrics_group, class_weight, device)
    metrics_group.reset()

    valid_f1 = valid_metrics["BinaryF1Score"].cpu().item()
    if (best_score is None) or (valid_f1 > best_score):
        best_score = valid_f1
        # state_dict() returns references to the live parameter tensors; without a
        # deep copy the "best" snapshot would be overwritten by every later epoch.
        best_state_dict = copy.deepcopy(cnn_model.state_dict())

    for split_name, split_loss, split_metrics in (
        ("Train", train_loss, train_metrics),
        ("Validation", valid_loss, valid_metrics),
    ):
        print(split_name)
        print(f"Loss: {split_loss:.3f}", end=", ")
        print(", ".join([f"{k}: {v.cpu().item() * 100:.2f}" for k, v in split_metrics.items()]))
    print()

Epoch 0
Train
Loss: 0.556, BinaryAccuracy: 70.33, BinaryPrecision: 69.20, BinaryRecall: 66.08, BinaryF1Score: 67.61
Validation
Loss: 0.335, BinaryAccuracy: 89.71, BinaryPrecision: 93.40, BinaryRecall: 85.09, BinaryF1Score: 89.05

Epoch 1
Train
Loss: 0.308, BinaryAccuracy: 87.37, BinaryPrecision: 86.74, BinaryRecall: 86.21, BinaryF1Score: 86.48
Validation
Loss: 0.222, BinaryAccuracy: 92.50, BinaryPrecision: 92.57, BinaryRecall: 92.14, BinaryF1Score: 92.35

Epoch 2
Train
Loss: 0.228, BinaryAccuracy: 91.21, BinaryPrecision: 90.80, BinaryRecall: 90.40, BinaryF1Score: 90.60
Validation
Loss: 0.173, BinaryAccuracy: 94.88, BinaryPrecision: 95.43, BinaryRecall: 94.10, BinaryF1Score: 94.76



In [127]:
# Load the weights stored in best_state_dict, then score the held-out test split.
cnn_model.load_state_dict(best_state_dict)
test_loss, test_metrics = evaluate(cnn_model, test_loader, criterion, metrics_group, class_weight, device)
metrics_group.reset()

# Report the loss followed by every metric as a percentage with two decimals.
formatted_metrics = ", ".join(
    f"{name}: {value.cpu().item() * 100:.2f}" for name, value in test_metrics.items()
)
print("Test")
print(f"Loss: {test_loss:.3f}", end=", ")
print(formatted_metrics)

Test
Loss: 0.165, BinaryAccuracy: 94.46, BinaryPrecision: 95.18, BinaryRecall: 93.64, BinaryF1Score: 94.40


In [128]:
# Persist the CNN so the UI section can rebuild it from the stored config and weights.
MODEL = 'cnn'
DATASET_ID = 'isot'

checkpoint_payload = {
    # `epoch` is the last index of the training loop, so epoch+1 is the number of epochs run.
    "epoch": epoch + 1,
    "model_state_dict": cnn_model.state_dict(),
    "model_config": cnn_model_config,
    "optimizer_state_dict": optimizer.state_dict(),
}
torch.save(checkpoint_payload, f"checkpoints/{MODEL}_{DATASET_ID}.tar")

## Dataset 3 (Liar)

In [None]:
# NOTE(review): this cell reuses cnn_model, optimizer and the *_loader globals from
# earlier cells; it only trains on Liar if those were rebuilt for that dataset — confirm.
EPOCHS = 2

binary_metrics = [
    Accuracy(task="binary"),
    Precision(task="binary"),
    Recall(task="binary"),
    F1Score(task="binary"),
]
metrics_group = MetricCollection(binary_metrics).to(device)
# Unequal per-class weights, passed through to train/evaluate.
class_weight = torch.tensor([6.25, 1.2]).to(device)

for epoch in range(EPOCHS):
    print(f"Epoch {epoch}")

    # The metric collection is shared between splits, so it is reset after each pass.
    train_loss, train_metrics = train(cnn_model, train_loader, optimizer, criterion, metrics_group, class_weight, device)
    metrics_group.reset()
    valid_loss, valid_metrics = evaluate(cnn_model, val_loader, criterion, metrics_group, class_weight, device)
    metrics_group.reset()

    for split_name, split_loss, split_metrics in (
        ("Train", train_loss, train_metrics),
        ("Validation", valid_loss, valid_metrics),
    ):
        print(split_name)
        print(f"Loss: {split_loss:.3f}", end=", ")
        print(", ".join(f"{k}: {v.cpu().item() * 100:.2f}" for k, v in split_metrics.items()))
    print()

Epoch 0
Train
Loss: 1.664, BinaryAccuracy: 48.15, BinaryPrecision: 83.09, BinaryRecall: 47.68, BinaryF1Score: 60.59
Validation
Loss: 1.330, BinaryAccuracy: 51.25, BinaryPrecision: 86.99, BinaryRecall: 51.57, BinaryF1Score: 64.75

Epoch 1
Train
Loss: 1.404, BinaryAccuracy: 55.33, BinaryPrecision: 87.82, BinaryRecall: 54.07, BinaryF1Score: 66.93
Validation
Loss: 1.311, BinaryAccuracy: 57.48, BinaryPrecision: 88.81, BinaryRecall: 58.39, BinaryF1Score: 70.45



In [None]:
# Fresh metric collection and class weights for scoring the test split.
binary_metrics = [
    Accuracy(task="binary"),
    Precision(task="binary"),
    Recall(task="binary"),
    F1Score(task="binary"),
]
metrics_group = MetricCollection(binary_metrics).to(device)
class_weight = torch.tensor([6.25, 1.2]).to(device)

test_loss, test_metrics = evaluate(cnn_model, test_loader, criterion, metrics_group, class_weight, device)
metrics_group.reset()

# Report the loss followed by every metric as a percentage with two decimals.
formatted_metrics = ", ".join(
    f"{name}: {value.cpu().item() * 100:.2f}" for name, value in test_metrics.items()
)
print("Test")
print(f"Loss: {test_loss:.3f}", end=", ")
print(formatted_metrics)

Test
Loss: 0.512, BinaryAccuracy: 94.23, BinaryPrecision: 96.58, BinaryRecall: 91.68, BinaryF1Score: 94.06


## UI

## Load models

In [109]:
# Load LSTM model: rebuild it from the config stored in the checkpoint, then restore its weights.
MODEL = 'lstm'
DATASET_ID = 'isot'
checkpoint_path = f"checkpoints/{MODEL}_{DATASET_ID}.tar"

# map_location places the stored tensors on the device selected earlier in the notebook.
checkpoint = torch.load(checkpoint_path, map_location=device)
lstm_model = LSTMNet(**checkpoint["model_config"])
lstm_model.load_state_dict(checkpoint["model_state_dict"])
lstm_model = lstm_model.to(device)
lstm_model.eval()  # inference mode for the UI

In [98]:
# Load BERT model: fetch its tokenizer, rebuild it from the stored config, then restore its weights.
MODEL = 'bert'
DATASET_ID = 'isot'

# Only the text processor is needed here; the three dataloaders are discarded.
# NOTE(review): this rebinds the global `vocab` that the CNN/LSTM inference branches
# also read; after this cell it is whatever get_dataloaders returns for BERT — confirm
# that is intended, or rerun the CNN dataloader cell before using those models.
(_, _, _), (bert_tokenizer, vocab) = get_dataloaders(dataset_id=DATASET_ID, model=MODEL)

checkpoint_path = f"checkpoints/{MODEL}_{DATASET_ID}.tar"
checkpoint = torch.load(checkpoint_path, map_location=device)
bert_model = BERTModel(**checkpoint["model_config"])
bert_model.load_state_dict(checkpoint["model_state_dict"])
bert_model = bert_model.to(device)
bert_model.eval()  # inference mode for the UI

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [129]:
# Load CNN model: rebuild it from the stored config, then restore its weights.
# DATASET_ID is not set here; it is taken from whichever cell assigned it last.
MODEL = 'cnn'
checkpoint_path = f"checkpoints/{MODEL}_{DATASET_ID}.tar"

checkpoint = torch.load(checkpoint_path, map_location=device)
cnn_model = CNN(**checkpoint["model_config"])
cnn_model.load_state_dict(checkpoint["model_state_dict"])
cnn_model = cnn_model.to(device)
cnn_model.eval()  # inference mode for the UI

In [100]:
# Raw Dset2 articles, one frame per class; used below to draw sample texts.
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/Dset2/Fake.csv", "./data/Dset2/True.csv")
)

In [None]:
# Preview one randomly drawn fake article after isot_clean (unseeded, so it varies per run).
fake_sample_text = fake_df["text"].sample().iloc[0]
isot_clean(fake_sample_text)



In [None]:
# Preview one randomly drawn true article after isot_clean (unseeded, so it varies per run).
true_sample_text = true_df["text"].sample().iloc[0]
isot_clean(true_sample_text)

'The U.S. Senate Armed Services Committee backed a waiver on Thursday that will allow James Mattis to serve as President-elect Donald Trump’s secretary of defense, despite having retired as a Marine General in 2013. The panel voted 24-3 to waive a law on civilian control of the U.S. military that would have barred Mattis from assuming the position for seven years after his active duty service. The “no” votes came from three Democrats: Senators Richard Blumenthal, Kirsten Gillibrand and Elizabeth Warren. The waiver must still be approved by the full Senate, the House of Representatives Armed Services Committee and the full House to allow Mattis to serve if he is confirmed to lead the Pentagon. '

In [130]:
# Display names paired, in order, with the scores returned by each model's inference helper.
labels = ['True news', 'Fake news']


def select_inference(model_name, input_str):
    """Classify `input_str` with the chosen model and attach a label to each score.

    Args:
        model_name: "cnn", "bert" or "lstm"; matched case-insensitively.
        input_str: Text of the news article to classify.

    Returns:
        dict mapping each entry of `labels` to the matching score, converted to float.

    Raises:
        ValueError: if `model_name` is not one of the supported models.
    """
    choice = model_name.lower()

    # Reject unknown names before touching any model.
    if choice not in ("cnn", "bert", "lstm"):
        raise ValueError("Invalid model name")

    if choice == "cnn":
        scores = cnn.inference(cnn_model, input_str, isot=True, device=device, processor=(tokenizer, vocab))
    elif choice == "bert":
        scores = bert.inference(bert_model, input_str, isot=True, device=device, processor=(bert_tokenizer, vocab))
    else:
        scores = lstm.inference(lstm_model, input_str, isot=True, device=device, processor=(tokenizer, vocab))

    return {label: float(score) for label, score in zip(labels, scores)}

In [108]:
# Smoke test: score one random fake article, then one random true article, with the LSTM.
for news_df in (fake_df, true_df):
    print(select_inference('lstm', news_df["text"].sample().iloc[0]))

{'True news': 0.9985734792426229, 'Fake news': 0.0014265207573771477}
{'True news': 0.9966191600542516, 'Fake news': 0.0033808399457484484}


In [138]:
# Smoke test: score one random fake article, then one random true article, with the CNN.
for news_df in (fake_df, true_df):
    print(select_inference('cnn', news_df["text"].sample().iloc[0]))

{'True news': 0.09955835342407227, 'Fake news': 0.9004416465759277}
{'True news': 0.8765920624136925, 'Fake news': 0.12340793758630753}


In [139]:
# Smoke test: score one random fake article, then one random true article, with BERT.
for news_df in (fake_df, true_df):
    print(select_inference('bert', news_df["text"].sample().iloc[0]))

{'True news': 2.8087177270208485e-05, 'Fake news': 0.999971866607666}
{'True news': 0.999874472618103, 'Fake news': 0.0001254865201190114}


In [140]:
# Gradio demo: pick a trained model, paste or sample a news text, and view the predicted class scores.
import gradio as gr

# Dropdown choices; select_inference lower-cases the chosen name before dispatching.
model_names = ["CNN", "LSTM", "BERT"]

# Components are laid out by the nesting of the `with` blocks and their creation order.
with gr.Blocks() as demo:
  with gr.Tab("Fake News Detection"):
    with gr.Row():
      with gr.Column():
        model_list = gr.Dropdown(
            choices = model_names,
            value = model_names[0],
            label="Training method",
            allow_custom_value=False,
            info="Select trained model for fake news detection"
        )
        # The text box starts pre-filled with a randomly sampled, cleaned fake article.
        in_text = gr.Textbox(label="Input news", type="text", lines=5, value=isot_clean(fake_df["text"].sample().iloc[0]))
        with gr.Row():
          with gr.Column():
            submit_btn = gr.Button(value="Run", variant='primary')
            clear_btn = gr.ClearButton(variant='secondary', components=[in_text])

          # Two clickable example rows: index 0 is labelled FAKE, index 1 is labelled TRUE.
          # NOTE(review): type="index" presumably makes the click handler receive the row index — confirm.
          ds = gr.Dataset(
                components=[gr.Textbox(visible=False),gr.Textbox(visible=False)],
                headers=["Id","News class"],
                samples=[["1","FAKE"],["2","TRUE"]],
                type="index"
          )
          out_field = gr.Label(num_top_classes=2, label="Prediction")
  # "Run" sends the selected model name and the text to select_inference and shows its label->score dict.
  submit_btn.click(
    fn=select_inference,
    inputs=[model_list, in_text],
    outputs=[out_field]
  )
  # Clicking an example row replaces the input text with a fresh random sample:
  # a true article when idx == 1, otherwise a fake one.
  ds.click(
    fn=lambda idx: gr.update(value = isot_clean(true_df["text"].sample().iloc[0]) if idx == 1 else isot_clean(fake_df["text"].sample().iloc[0])),
    inputs=ds,
    outputs=in_text
  )

demo.queue()
# share=True serves the demo through a public gradio.live URL (see the cell output).
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://070a2bfe4c74a643b0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


