In [1]:
# pip install faknow pandas shap
# pip install torch_geometric
# pip install torch_scatter
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [2]:
import pandas as pd
import faknow
import shap
import torch
import json

  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [3]:
true_news = pd.read_csv('Dataset/True.csv')
fake_news = pd.read_csv('Dataset/Fake.csv')

true_news['label'] = 1
fake_news['label'] = 0

data = pd.concat([true_news, fake_news], ignore_index=True)

data['domain'] = 0 # dummy domain because faknow expects a "domain" column in the dataset and I don't have one

data = data[['text', 'label', 'domain']]

data_json = data.to_dict(orient='records')

from sklearn.model_selection import train_test_split

train_data, temp_data = train_test_split(data_json, test_size=0.4, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)


with open('Dataset/train.json', 'w') as f:
    json.dump(train_data, f)
with open('Dataset/val.json', 'w') as f:
    json.dump(validation_data, f)
with open('Dataset/test.json', 'w') as f:
    json.dump(test_data, f)


## Train (training with FaKnow's built-in capabilities)

In [4]:
from faknow.run.content_based.run_mdfend import run_mdfend, run_mdfend_from_yaml

num_epochs = 10

# arguments
kargs = {
    'train_path': 'Dataset/train.json',
    'test_path': 'Dataset/test.json',
    'validate_path': 'Dataset/val.json',
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'num_epochs': num_epochs
}

run_mdfend(**kargs)


2024-05-22 18:33:35,689 - INFO - Tensorboard log is saved in tb_logs/MDFEND-2024-05-22-18_33_35
2024-05-22 18:33:35,690 - INFO - log file is saved in logs/tb_logs/MDFEND-2024-05-22-18_33_35.log

2024-05-22 18:33:35,690 - INFO - training data size=26938
2024-05-22 18:33:35,691 - INFO - validation data size=8980
2024-05-22 18:33:35,691 - INFO - ----start training-----

2024-05-22 18:33:35,692 - INFO - epoch=[0/9]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Training: 100%|███████████████████████████████████████████████████| 421/421 [03:05<00:00,  2.27it/s]
2024-05-22 18:36:41,434 - INFO - training time=3m5s
2024-05-22 18:36:41,434 - INFO - training loss : loss=0.000351
2024-05-22 18:37:37,740 - INFO - validation result : accuracy=0.999332    precision=1.000000    recall=0.998599    f1=0.999299
2024-05-22 18:37:37,740 - INFO - current score : 0.999332
2024-05-22 18:37:37,742 - INFO - epoch=[1/9]
Training: 100%|███████████████████████████████████████████████████| 421/4

## Training (manual approach to training and evaluating, allows detailed customization)

In [5]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from faknow.model.content_based.mdfend import MDFEND
from faknow.data.dataset.text import TextDataset
from faknow.data.process.text_process import TokenizerFromPreTrained
from faknow.evaluate.evaluator import Evaluator
from faknow.train.trainer import BaseTrainer
from torch.utils.data import DataLoader

# tokenizer
max_len, bert = 170, 'bert-base-uncased'
tokenizer = TokenizerFromPreTrained(max_len, bert)

batch_size = 64
train_set = TextDataset('Dataset/train.json', ['text'], tokenizer)
train_loader = DataLoader(train_set, batch_size, shuffle=True)
validate_set = TextDataset('Dataset/val.json', ['text'], tokenizer)
val_loader = DataLoader(validate_set, batch_size, shuffle=False)
test_set = TextDataset('Dataset/test.json', ['text'], tokenizer)
test_loader = DataLoader(test_set, batch_size, shuffle=False)

# model hyperparameters
domain_num = 9
model = MDFEND(bert, domain_num).to(device)

# optimizer and scheduler
lr, weight_decay, step_size, gamma = 0.00005, 5e-5, 100, 0.98
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma)

# evaluate
evaluator = Evaluator()

# train
num_epochs = 10
trainer = BaseTrainer(model, evaluator, optimizer, scheduler, device=device)
trainer.fit(train_loader, num_epochs, validate_loader=val_loader)


# metrics
train_results = trainer.evaluate(train_loader)
print(f"Train Accuracy: {train_results['accuracy']}")
print(f"Train Precision: {train_results['precision']}")
print(f"Train Recall: {train_results['recall']}")
print(f"Train F1-Score: {train_results['f1']}")

print ("======================================================================")

val_results = trainer.evaluate(val_loader)
print(f"Validation Accuracy: {val_results['accuracy']}")
print(f"Validation Precision: {val_results['precision']}")
print(f"Validation Recall: {val_results['recall']}")
print(f"Validation F1-Score: {val_results['f1']}")

print ("======================================================================")

test_results = trainer.evaluate(test_loader)
print(f"Test Accuracy: {test_results['accuracy']}")
print(f"Test Precision: {test_results['precision']}")
print(f"Test Recall: {test_results['recall']}")
print(f"Test F1-Score: {test_results['f1']}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
2024-05-22 19:13:56,370 - INFO - Tensorboard log is saved in tb_logs/MDFEND-2024-05-22-19_13_56
2024-05-22 19:13:56,370 - INFO - Tensorboard log is saved in tb_logs/MDFEND-2024-05-22-19_13_56
2024-05-22 19:13:56,371 - INFO - log file is saved in logs/tb_logs/MDFEND-2024-05-22-19_13_56.log

2024-05-22 19:13:56,371 - INFO - log file is saved in logs/tb_logs/MDFEND-2024-05-22-19_13_56.log

2024-05-22 19:13:56,372 - INFO - training data size=26938
2024-05-22 19:13:56,372 - INFO - training data size=26938
2024-05-22 19:13:56,373 - INFO - validation data size=8980
2024-05-22 19:13:56,373 - INFO - validation data size=8980
2024-05-22 19:13:56,374 - INFO - ----start training-----

2024-05-22 19:13:56,374 - INFO - ----start training--

Train Accuracy: 1.0
Train Precision: 1.0
Train Recall: 1.0
Train F1-Score: 1.0
Validation Accuracy: 0.9994432330131531
Validation Precision: 1.0
Validation Recall: 0.9988323213451659
Validation F1-Score: 0.9994158196050941
Test Accuracy: 0.9992204904556274
Test Precision: 0.9995366079703429
Test Recall: 0.9988423246121787
Test F1-Score: 0.999189345686161
