## Fake News Analysis Model

This model aims to detect the validity of news articles based on their writing patterns.

1. Load the data from the CSV file, filter out rows with a missing title or text, and split the data into a training set and a test set.


In [8]:
from datasets import load_dataset

# Read the WELFake CSV through the generic "csv" loader.
# A forward slash keeps the relative path valid on every operating system and
# avoids the invalid "\W" escape sequence that a backslash introduces inside
# a normal (non-raw) string literal.
dataset = load_dataset("csv", data_files="data/WELFake_Dataset.csv")

def filter_empty(data):
    """Return True when a row has both a title and a text.

    Args:
        data: Mapping for a single row; its "title" and "text" entries are read.

    Returns:
        bool: False if either entry is None, True otherwise.
    """
    # Only None counts as missing here; empty strings are kept.
    has_missing_field = data["title"] is None or data["text"] is None
    return not has_missing_field

# Drop every row whose title or text is missing.
dataset = dataset.filter(filter_empty)

# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, identical on every run so that the
# reported metrics can be reproduced.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)


2. Import BertTokenizer (the tokenizer class specific to BERT models) and convert the articles into tokenized training and test datasets.

In [9]:
from transformers import BertTokenizer
from datasets import DatasetDict


# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint; the
# model weights are loaded from the same checkpoint name further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_fn(batch):
    """Tokenize a batch of articles as "<title> <text>" strings.

    Args:
        batch: Mapping with parallel "title" and "text" sequences.

    Returns:
        The result of calling the module-level tokenizer on the joined
        strings, with every sequence truncated and padded to 512 tokens.
    """
    combined = []
    for headline, body in zip(batch["title"], batch["text"]):
        # Title and body are joined with a single space into one input string.
        combined.append(headline + " " + body)
    return tokenizer(
        combined,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Tokenize both splits with the same function. With batched=True, tokenize_fn
# is called with a batch of rows at a time instead of a single row.
tokenized_dataset = DatasetDict(
    {
        split_name: split_dataset[split_name].map(tokenize_fn, batched=True)
        for split_name in ("train", "test")
    }
)

Map: 100%|██████████| 57229/57229 [08:57<00:00, 106.38 examples/s]
Map: 100%|██████████| 14308/14308 [02:11<00:00, 108.58 examples/s]


3. Import BERT, use pretrained weights

In [10]:
from transformers import BertForSequenceClassification

# Start from the pretrained "bert-base-uncased" weights and request a
# two-label classification head (num_labels=2). The log output of this cell
# reports classifier.weight / classifier.bias as newly initialized, i.e. the
# head still has to be trained.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. If an NVIDIA GPU is available, move the model to the GPU; otherwise stay on the CPU.

In [11]:
import torch

# Prefer CUDA when torch reports it as available; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model to the selected device. Kept as the last expression of the
# cell so the notebook still displays the model summary.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

5. Use sklearn.metrics for basic evaluation

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a prediction result.

    Args:
        pred: Object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict: Scores under the keys 'accuracy', 'precision', 'recall', 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The call returns (precision, recall, f-score, support); only the first
    # three are reported, and zip() below stops after three keys.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores.update(zip(('precision', 'recall', 'f1'), prf_scores))
    return scores

6. Use the Hugging Face Trainer API for training: specify the TrainingArguments, train the model, and save the best model.

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where outputs and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    # Optimisation settings.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and save once per epoch; both strategies are kept identical.
    eval_strategy="epoch",
    save_strategy="epoch",
    # After training, restore the checkpoint with the lowest evaluation loss
    # (greater_is_better=False means a smaller metric value wins).
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Wire the model, data and metric function into the Trainer.
# NOTE: the "test" split doubles as the evaluation set used during training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)


# Fine-tune the model with the settings from training_args.
trainer.train()

# Export the fine-tuned model together with its tokenizer. The Trainer was
# created without a tokenizer, so save_model() on its own is not expected to
# write any tokenizer files; saving the tokenizer explicitly makes the export
# directory loadable by itself.
export_dir = "./bert-fake-news"
tokenizer.save_pretrained(export_dir)
trainer.save_model(export_dir)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0379,0.044022,0.990355,0.999019,0.981951,0.990411
2,0.0006,0.016271,0.996575,0.99397,0.999311,0.996633
3,0.0072,0.019461,0.996785,0.997517,0.996142,0.996829
4,0.0,0.017923,0.997484,0.997932,0.997107,0.997519
5,0.0,0.02179,0.997484,0.997794,0.997244,0.997519


7. Print basic evaluation metrics

In [None]:
# Run the trained model over the held-out split and print the metrics dict
# returned with the predictions.
# NOTE(review): this is the same split that was passed as eval_dataset and
# used for best-checkpoint selection (load_best_model_at_end), so these
# numbers are not an independent estimate of generalization.
preds = trainer.predict(tokenized_dataset["test"])
print(preds.metrics)

{'test_loss': 0.027281897142529488, 'test_runtime': 580.1707, 'test_samples_per_second': 24.662, 'test_steps_per_second': 3.084}
