# NLP based - Fake News Classification with BERT model

## About Dataset

- This dataset has been downloaded from kaggle: https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification
- Here is the link to lincense: https://creativecommons.org/licenses/by/4.0/

Note: We have already downloaded and decompressed the data in the same directory as notebook

### Dependencies

#### Before running this notebook, please make sure you have already installed the following libraries with correct versions.

- pandas==1.3.5
- numpy==1.21.6
- scikit-learn==1.0.2
- torch==1.13.1
- transformers==4.30.2
- nltk==3.8.1

## Importing useful libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve

import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader
%matplotlib inline

2023-09-04 17:18:26.969177: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-04 17:18:38.538464: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-09-04 17:18:38.539825: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Reading and Verifying data

In [3]:
# Read the WELFake CSV that sits next to this notebook.
news_df = pd.read_csv(filepath_or_buffer="WELFake_Dataset.csv")
# Print (rows, columns) first; the trailing expression previews the first rows.
print(news_df.shape)
news_df.head(n=5)

(72134, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### NULL value check

In [4]:
# Count missing entries per column (`isna` is the same check as `isnull`).
news_df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

- Very small number of NULL values
- Let's fill them with empty string value

In [5]:
# Replace every missing value with an empty string so title/text can be concatenated.
news_df = news_df.fillna(value='')

#### Creating a new column "content" with combined 'title' and 'text' that we will use for model training and predicitons

In [6]:
# Join each article's title and body with a single space into one model input string.
news_df['content'] = [
    ' '.join((title, body))
    for title, body in zip(news_df['title'], news_df['text'])
]

### converting the content to lowercase

In [7]:
# Lower-case every combined article string.
news_df['content'] = news_df['content'].map(str.lower)

### Separating labels from text

In [8]:
# Pull the model inputs and their targets out of the frame as arrays.
texts, labels = news_df['content'].values, news_df['label'].values
# Both arrays must have one entry per article.
print(len(texts), len(labels))

72134 72134


### Applying BERT Tokenization

In [9]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

### preparing tokenized data and attention masks

In [10]:
def prepare_tokenized_data(texts, labs=None, max_length=100):
    """Tokenize texts with the notebook-level ``tokenizer`` into fixed-length tensors.

    Every text is encoded with special tokens added, truncated to
    ``max_length`` tokens and padded up to ``max_length``, so all rows have
    the same length and can be stacked into a single tensor.

    Args:
        texts: Iterable of strings to encode.
        labs: Optional labels aligned with ``texts`` (array-like or tensor).
            ``None`` (or the legacy sentinel string ``'None'``) means
            "no labels", in which case only two tensors are returned.
        max_length: Token length every sequence is truncated/padded to.

    Returns:
        ``(input_ids, attention_masks)`` when no labels are given, otherwise
        ``(input_ids, attention_masks, labels)`` with ``labels`` as a tensor
        built from ``labs``. ``input_ids`` and ``attention_masks`` have one
        row per text.
    """
    # Function-scope import: `tqdm.tqdm_notebook` is deprecated in favour of
    # `tqdm.notebook.tqdm`, and importing here keeps this cell self-contained.
    from tqdm.notebook import tqdm

    input_id_list = []
    attention_masks = []
    for text in tqdm(texts):
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation='longest_first',
            max_length=max_length,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_id_list.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_id_list = torch.cat(input_id_list, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Check the sentinel explicitly: comparing an array/tensor to a string with
    # `!=` is ambiguous. The old string sentinel 'None' is still accepted.
    no_labels = labs is None or (isinstance(labs, str) and labs == 'None')
    if no_labels:
        return input_id_list, attention_masks

    # Build the label tensor from the *argument*. The previous version read the
    # notebook-global `labels`, which other cells rebind to a single batch.
    label_tensor = labs.clone() if torch.is_tensor(labs) else torch.tensor(labs)
    return input_id_list, attention_masks, label_tensor

In [11]:
# Tokenize every article; `labels` comes back as a tensor aligned with the encoded inputs.
input_id_list, attention_masks, labels = prepare_tokenized_data(texts=texts, labs=labels)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/72134 [00:00<?, ?it/s]



## Creating dataset and splitting into train/test

In [12]:
# Bundle token ids, attention masks and labels so each index yields one example.
tensor_dataset = TensorDataset(input_id_list, attention_masks, labels)

# lets keep 80% articles for training and 20% for test
train_size = int(0.8 * len(tensor_dataset))
test_size = len(tensor_dataset) - train_size

# Seed the split so the same articles land in train/test on every run.
# With an unseeded split, a checkpoint trained in an earlier session could be
# evaluated on articles it was trained on, inflating the test metrics.
split_generator = torch.Generator().manual_seed(42)
train_data, test_data = random_split(
    tensor_dataset,
    [train_size, test_size],
    generator=split_generator,
)
len(train_data.indices), len(test_data.indices)

(57707, 14427)

## DataLoader Object for batching

In [13]:
# Training batches: 32 shuffled examples each, loaded by 4 worker processes.
batch_size, num_workers = 32, 4

train_data_loader = DataLoader(
    train_data,
    shuffle=True,
    batch_size=batch_size,
    num_workers=num_workers,
)

# Test examples are served one at a time and in a fixed order.
test_data_loader = DataLoader(test_data, shuffle=False, batch_size=1)

## Loading BERT pre-trained model

In [14]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU
# (the previous hard-coded behaviour).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained BERT encoder with a 2-class classification head; attention maps
# and hidden states are not returned.
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
# Move the weights to the selected device; as the last expression this also
# displays the model architecture.
bert_model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Training BERT model

### setting optimizer

In [15]:
# AdamW over all model parameters, with a learning rate of 6e-6 and eps of 1e-8.
optimizer = torch.optim.AdamW(params=bert_model.parameters(), lr=6e-6, eps=1e-8)

### setting up training steps and Scheduler

In [16]:
# One optimizer step per training batch, repeated for every epoch.
num_epochs = 3
steps_per_epoch = len(train_data_loader)
total_steps = num_epochs * steps_per_epoch

# Linear learning-rate schedule with no warmup steps, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

### training BERT

In [17]:
# Fine-tune BERT: one optimizer + scheduler step per batch, and report the mean
# batch loss after every epoch.
bert_model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    # Batch-local names are used on purpose: looping with a variable called
    # `labels` would overwrite the notebook-level `labels` tensor with the
    # last batch.
    for batch_ids, batch_masks, batch_labels in train_data_loader:
        batch_ids = batch_ids.to(device)
        batch_masks = batch_masks.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # With `labels` supplied, the first element of the model output is the loss.
        loss = bert_model(
            batch_ids,
            token_type_ids=None,
            attention_mask=batch_masks,
            labels=batch_labels,
        )[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print('Epoch: {}, Loss: {:.4f}'.format(epoch+1, total_loss / len(train_data_loader)))

Epoch: 1, Loss: 0.0803
Epoch: 2, Loss: 0.0229
Epoch: 3, Loss: 0.0112


### saving model

In [18]:
# Write the trained weights (the model's state dict) to a local checkpoint file.
trained_weights = bert_model.state_dict()
torch.save(trained_weights, 'BERT.ckpt')

## Evaluation 

In [19]:
# Switch to evaluation mode before restoring the saved weights.
bert_model.eval()
# `map_location=device` remaps the stored tensors onto the current device, so a
# checkpoint written from a GPU still loads on a CPU-only machine.
bert_model.load_state_dict(
    torch.load('BERT.ckpt', map_location=device),
)

<All keys matched successfully>

## Calculating Accuracy on Test data

In [20]:
# Score the model on the held-out split, collecting predicted and true labels
# for the confusion matrix / classification report cells below.
correct_predictions = 0
predictions = []
reals = []

bert_model.eval()  # make sure dropout is disabled even if this cell is run on its own
# Inference only: disabling autograd avoids building a graph for every example.
with torch.no_grad():
    for batch_ids, batch_masks, batch_labels in test_data_loader:
        batch_ids = batch_ids.to(device)
        batch_masks = batch_masks.to(device)
        # No `labels` are passed, so no loss is computed and the first output
        # element is the logits.
        logits = bert_model(batch_ids, token_type_ids=None, attention_mask=batch_masks)[0]
        # Score every example in the batch, not just element 0, so the result
        # stays correct for any test batch size.
        batch_predictions = logits.argmax(dim=1).cpu().tolist()
        batch_reals = batch_labels.tolist()
        correct_predictions += sum(
            int(predicted == actual)
            for predicted, actual in zip(batch_predictions, batch_reals)
        )
        predictions.extend(batch_predictions)
        reals.extend(batch_reals)

avg_correct_predictions = correct_predictions / len(reals)
print('Accuracy: {:.4f}\n'.format(avg_correct_predictions))

Accuracy: 0.9902



## Confusion Matrix

In [21]:
# Rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_true=reals, y_pred=predictions))

[[7025   53]
 [  88 7261]]


## Classification Report

In [22]:
# Per-class precision / recall / F1; label 0 is displayed as 'Real' and label 1 as 'Fake'.
report_text = classification_report(
    y_true=reals,
    y_pred=predictions,
    target_names=['Real', 'Fake'],
)
print(report_text)

              precision    recall  f1-score   support

        Real       0.99      0.99      0.99      7078
        Fake       0.99      0.99      0.99      7349

    accuracy                           0.99     14427
   macro avg       0.99      0.99      0.99     14427
weighted avg       0.99      0.99      0.99     14427

