In [None]:
# Basics
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import gc

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertForSequenceClassification
from torch.cuda.amp import GradScaler, autocast

# Utils
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time

# Check files
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

## Check dataset

In [None]:
# Load both halves of the dataset; each article is labelled by the file it comes from.
df_real = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
df_fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

# Label convention used in this notebook: 0 = real article, 1 = fake article.
data_real = df_real['text'].tolist()
labels_real = [0] * len(data_real)
print(f'Real samples: {len(data_real)}')

data_fake = df_fake['text'].tolist()
labels_fake = [1] * len(data_fake)
print(f'Fake samples: {len(data_fake)}')

# Build the combined lists as new objects so the per-class lists stay untouched.
data = data_real + data_fake
labels = labels_real + labels_fake

# A fixed seed makes the split (and therefore every metric reported below)
# reproducible across kernel restarts; stratifying keeps the real/fake ratio
# the same in the train and test parts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.25,
    random_state=SPLIT_SEED,
    stratify=labels,
)

print(f'train samples: {len(X_train)}')
print(f'test samples: {len(X_test)}')

## Tokenize and build DataLoader objects

In [None]:
# Fetch the pretrained WordPiece tokenizer matching the BERT checkpoint used below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Show the vocabulary ids of the padding and unknown tokens
print('Special tokens indices')
for special_token in (tokenizer.pad_token, tokenizer.unk_token):
    print(f'{special_token}: {tokenizer.convert_tokens_to_ids(special_token)}')

In [None]:
# Build Data loaders for training/testing
def collate_batch(batch, max_length=512):
    """Turn a list of ``(label, text)`` pairs into padded model inputs.

    Every text is encoded with the module-level ``tokenizer``. Truncation is
    done by the tokenizer itself, so over-long articles never produce huge
    intermediate tensors and the truncated sequence still ends with the
    closing special token instead of being cut mid-sequence afterwards.

    Args:
        batch: Iterable of ``(label, text)`` pairs, as yielded by the
            ``DataLoader`` built from ``zip(labels, texts)``.
        max_length: Maximum number of token ids per sequence, special tokens
            included. Defaults to 512, the position limit of
            ``bert-base-uncased``.

    Returns:
        A tuple ``(token_ids, labels)``. ``token_ids`` is sequence-first, i.e.
        of shape ``(longest_sequence_in_batch, batch_size)``, padded with the
        tokenizer's pad id; callers transpose it before feeding the model.
        ``labels`` is a 1-D tensor built from the batch labels.
    """
    label_list, text_list = [], []
    for _label, _text in batch:
        token_ids = tokenizer.encode(
            _text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
        )
        text_list.append(torch.tensor(token_ids))
        label_list.append(_label)

    # pad_sequence defaults to batch_first=False, which gives the
    # (sequence, batch) layout the rest of the notebook expects.
    padded = pad_sequence(text_list, padding_value=tokenizer.pad_token_id)
    return padded, torch.tensor(label_list)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(list(zip(y_train, X_train)), batch_size=32, shuffle=True, collate_fn=collate_batch)
# Evaluation order is kept fixed: the per-epoch check in the training loop only
# looks at the first few batches, and shuffling would score a different random
# subset each epoch, making the epoch-to-epoch test curve incomparable.
test_loader = DataLoader(list(zip(y_test, X_test)), batch_size=32, shuffle=False, collate_fn=collate_batch)

## Load model and set training layers

In [None]:
# Load the pretrained BERT checkpoint with a sequence-classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Freeze every pretrained weight so that no gradient is computed for them.
for param in model.parameters():
    param.requires_grad = False

# Swap in a fresh two-class linear head. Newly created parameters require
# gradients by default, so this layer is the only part of the model that trains.
# Its input width is read from the model config instead of being hard-coded.
model.classifier = nn.Linear(in_features=model.config.hidden_size, out_features=2, bias=True)

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of crashing on the device lookup.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
_ = model.to(device)

## Training

In [None]:
learning_rate = 2e-3
# Give the optimizer only the parameters that actually train. Passing the
# whole frozen network would make Adagrad allocate accumulator state for
# every frozen weight without ever using it.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adagrad(trainable_params, lr=learning_rate)
epochs = 3
criterion = nn.CrossEntropyLoss()

# Per-epoch history, stored as plain Python floats so it can be plotted directly.
train_loss = []
test_loss = []
train_accuracy = []
test_accuracy = []

# Maximum number of test batches scored after each epoch (quick sanity check only).
test_max = 20

for epoch in range(epochs):

    correct_predictions = 0
    train_size = 0
    running_loss = 0
    step = 0

    # Training mode is set once per epoch; the evaluation block below switches
    # the model to eval mode, so it has to be restored here.
    model.train()

    for x, y in tqdm(train_loader, desc=f'Epoch {epoch} train', leave=False):

        x = x.to(device)
        y = y.to(device)

        # The loader yields (sequence, batch); the model wants (batch, sequence).
        input_ids = x.T
        # Mask out padding so the encoder does not attend to pad positions.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()

        optimizer.zero_grad()

        scores = model(input_ids, attention_mask=attention_mask)[0]
        loss = criterion(scores, y)

        loss.backward()
        optimizer.step()

        # Bookkeeping on detached values; .item() moves the counts to Python
        # numbers so no device tensors are kept in the history lists.
        predictions = scores.detach().argmax(dim=1)
        correct_predictions += (predictions == y).sum().item()
        train_size += predictions.size(0)
        running_loss += loss.item()
        step += 1

        # Drop references to the batch tensors before the next iteration.
        del x, y, input_ids, attention_mask, scores, loss
        gc.collect()

    # Save train loss and accuracy
    train_loss.append(running_loss/step)
    train_accuracy.append(correct_predictions/train_size)

    ## Eval
    # Only the first `test_max` batches are scored here to save time.
    # The accuracy and loss of the full test dataset are computed further below.
    model.eval()
    with torch.no_grad():
        correct_predictions_test = 0
        test_size = 0
        running_test_loss = 0
        step = 0
        test_it = 0

        for x, y in tqdm(test_loader, desc=f'Epoch {epoch} test', leave=False):

            x = x.to(device)
            input_ids = x.T
            attention_mask = (input_ids != tokenizer.pad_token_id).long()
            scores = model(input_ids, attention_mask=attention_mask)[0]

            del x, input_ids, attention_mask
            gc.collect()

            y = y.to(device)
            loss = criterion(scores, y)
            predictions = scores.argmax(dim=1)
            correct_predictions_test += (predictions == y).sum().item()
            test_size += predictions.size(0)
            running_test_loss += loss.item()
            step += 1
            test_it += 1
            # Stop once exactly `test_max` batches have been scored.
            if test_it >= test_max:
                break

        # Save test loss and accuracy
        test_loss.append(running_test_loss/step)
        test_accuracy.append(correct_predictions_test/test_size)

        # Print
        print(f'Epoch {epoch}')
        print(f'\n   Train loss: {train_loss[-1]:.4f}      Train accuracy: {train_accuracy[-1]:.4f}')
        print(f'\n   Test loss: {test_loss[-1]:.4f}      Test accuracy: {test_accuracy[-1]:.4f}\n\n')
        time.sleep(2)
        

In [None]:
# The accuracy history may hold 0-dim tensors (possibly living on the GPU),
# which matplotlib cannot convert to arrays. float() works for both tensors
# and plain numbers, so convert everything before plotting.
train_accuracy_values = [float(acc) for acc in train_accuracy]
test_accuracy_values = [float(acc) for acc in test_accuracy]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6))

# Left panel: loss per epoch
ax1.plot(range(len(train_loss)), train_loss, label='Train loss')
ax1.plot(range(len(test_loss)), test_loss, label='Test loss')
ax1.set(title='Loss', xlabel='Epoch', ylabel='Cross-entropy loss')
ax1.legend()

# Right panel: accuracy per epoch
ax2.plot(range(len(train_accuracy_values)), train_accuracy_values, label='Train accuracy')
ax2.plot(range(len(test_accuracy_values)), test_accuracy_values, label='Test accuracy')
ax2.set(title='Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax2.legend();

In [None]:
# Accuracy and loss of the full test set
model.eval()
with torch.no_grad():
    correct_predictions_test = 0
    test_size = 0
    running_test_loss = 0
    step = 0
    test_it = 0

    for x, y in tqdm(test_loader, desc=f'Test', leave=False):

        x = x.to(device)
        # The loader yields (sequence, batch); the model wants (batch, sequence).
        input_ids = x.T
        # Mask out padding so the encoder does not attend to pad positions.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
        scores = model(input_ids, attention_mask=attention_mask)[0]

        # Drop references to the input tensors before computing the metrics.
        del x, input_ids, attention_mask
        gc.collect()

        y = y.to(device)
        loss = criterion(scores, y)
        predictions = scores.argmax(dim=1)
        # .item() keeps the running count as a Python int instead of a device tensor.
        correct_predictions_test += (predictions == y).sum().item()
        test_size += predictions.size(0)
        running_test_loss += loss.item()
        step += 1
        test_it += 1

In [None]:
# Report the mean per-batch loss and the overall accuracy over the whole test set
print(
    f'Full test loss: {running_test_loss/step:.4f}\nFull test accuracy: {correct_predictions_test/test_size:.4f}'
)