First, we import the Python packages and modules required for data handling, model building, evaluation metrics, and preprocessing. BERT is a good fit for this task because it understands language well and can accurately classify fake news by using pre-learned representations.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

Load the dataset from a CSV file and extract the relevant features ('cleaned_content' for the input text and 'type' for the target labels).

In [None]:
# Load the dataset. Rows with a missing text or label are dropped up front:
# a missing 'cleaned_content' is read back as a float NaN, which the tokenizer
# cannot process, and a missing 'type' carries no usable class label.
data = pd.read_csv('/kaggle/input/fake-binary-reclass/reclassified_1.csv')
data = data.dropna(subset=['cleaned_content', 'type'])

# Model input: the text column, forced to str so every entry can be tokenized.
X = data['cleaned_content'].astype(str)
# Target: the categorical class label of each row.
y = data['type']


Convert categorical target labels ('type') into numerical labels using LabelEncoder.

In [None]:
# Fit the encoder on the raw class names first, then map every label to its
# integer id. The fitted encoder is kept so that later cells can read the
# class list from label_encoder.classes_.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

Split the dataset into training and testing sets using an 80-20 split ratio.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across runs.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)


Use the pre-trained BERT tokenizer to tokenize and encode the input sequences into numerical format suitable for BERT model input.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Every sequence is truncated or padded to exactly max_len tokens so that the
# encodings come back as rectangular PyTorch tensors.
max_len = 256
encode_options = dict(
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors='pt',
)

# The same encoding settings are applied to both splits.
train_encodings = tokenizer(list(X_train), **encode_options)
test_encodings = tokenizer(list(X_test), **encode_options)


Convert the numerical labels into PyTorch tensors, and create PyTorch datasets from the tokenized input sequences, attention masks, and labels.

In [None]:
# Encoded labels as tensors, one per split.
train_labels, test_labels = torch.tensor(y_train), torch.tensor(y_test)

# Each dataset item is an (input_ids, attention_mask, label) triple; the
# training and evaluation loops rely on this column order.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_labels,
)



Create data loaders for the training and testing datasets to enable batch-wise processing during training and evaluation. We chose a batch size of 40 because the model was taking a long time to train.

In [None]:
batch_size = 40

# Training batches are drawn in random order; test batches keep dataset order.
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
test_loader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)


Load the pre-trained BERT model for sequence classification. The number of labels is set to the number of unique classes in the target variable. Then freeze the parameters of the base BERT model to prevent them from being updated during training.

In [None]:
# Pre-trained BERT for sequence classification, with one output label per
# class found by the label encoder.
num_classes = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Freeze every parameter of the base BERT model in one call so that training
# does not update them.
model.base_model.requires_grad_(False)


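Define the optimizer so that it only updates the parameters of the final classification layer. Then set the number of epochs, create the learning-rate scheduler, and move the model to the GPU when one is available. The model is trained on the training data and evaluated on the testing data in the cells that follow.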

In [None]:
# Only the classification head is optimized, so the optimizer is given just
# those parameters.
# transformers.AdamW is deprecated in favour of the PyTorch implementation, so
# torch.optim.AdamW is used instead. weight_decay=0.0 is passed explicitly
# because the PyTorch default (0.01) differs from the transformers default (0.0).
optimizer = torch.optim.AdamW(
    model.classifier.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=0.0,
)

# Define number of epochs
epochs = 3

# Total number of training steps (one optimizer step per batch, every epoch)
total_steps = len(train_loader) * epochs

# Create learning rate scheduler: linear decay over all steps, no warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Select the GPU when one is available, otherwise fall back to the CPU, and
# move the model there
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Iterate over the epochs and batches, calculating the loss and updating the model parameters during training. Making predictions and calculating the evaluation metrics on the test set happens in the cells that follow.

In [None]:
for epoch in range(epochs):
    # Switch the model to training mode and reset the loss accumulator.
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Batches follow the TensorDataset column order: ids, mask, labels.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing the labels makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

        total_loss += loss.item()

    # Mean per-batch loss for this epoch.
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Training Loss: {avg_train_loss}')



Evaluate the model.

In [None]:
# Switch to evaluation mode and collect predictions for the whole test set.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():  # no gradients are needed for inference
    for input_ids, attention_mask, labels in test_loader:
        # Only the model inputs go to the device; labels stay on the CPU
        # because they are only used for scoring.
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))

        # The predicted class is the index of the largest logit in each row.
        logits = outputs.logits
        preds = logits.detach().cpu().numpy().argmax(axis=1)

        predictions.extend(preds)
        true_labels.extend(labels.numpy())

# Flatten the per-batch results into arrays for the metric functions.
predictions = np.asarray(predictions)
true_labels = np.asarray(true_labels)



Calculate the evaluation metrics.

In [None]:
# Score the test-set predictions with each metric in turn. All four scorers
# are called as scorer(true_labels, predictions), so one pass covers them.
accuracy, precision, recall, f1 = (
    scorer(true_labels, predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)


In [1]:
# Report the metrics, one per line, in a fixed order.
print("Evaluation Metrics for Advanced Model:")
for caption, score in (
    ("Accuracy:", accuracy),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Precision:", precision),
):
    print(caption, score)

Evaluation Metrics for Advanced Model:
Accuracy: 0.69
Recall: 0.89
F1 Score: 0.77
Precision: 0.69
