# CARE-BERT: **C**lickb**a**it Detecto**r** using Self-attentiv**e** Network with **B**idirectional **E**ncoder **R**epresentations from **T**ransformers

> ## EECS 498-004 Introduction to Natural Language Processing Course Project

# Baseline 2: Headline Bi-LSTM with BERT embeddings

##  **0 - Setup**

In [None]:
import os

import numpy as np
import pandas as pd
import torch

# Folder that holds the clickbait17 data files; every later cell builds its paths from it.
# NOTE(review): `dir` shadows the Python builtin of the same name. It is kept
# unchanged because the other cells refer to this variable by name.
dir = '/content/drive/Shareddrives/EECS 498-004 NLP Project - Clickbait/data/clickbait17/'

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 9.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 46.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 47.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=2532eb2f55e

In [None]:
# from transformers import pipeline; 
# print(pipeline('sentiment-analysis')('we love you'))

## **1 - Data Corpus**


*   Number of instances in the corpus: 19538




In [None]:
class Webis17:
    '''
    Loader that joins `instances.jsonl` and `truth.jsonl` on their `id` column.

    Attributes:
        train_file: path of the instances file that was read.
        truth_file: path of the truth file that was read.
        size: number of rows in the instances file.
        self.corpus: list of (post, text, truthMean) tuples, one per instance,
            in the order of the instances file.
    '''
    def __init__(self, path):
        '''
        Args:
            path: prefix (normally a directory ending with a separator) that is
                prepended to 'instances.jsonl' and 'truth.jsonl'.

        Raises:
            KeyError: if an instance id has no row in the truth file.
        '''
        self.train_file = path + 'instances.jsonl'
        self.truth_file = path + 'truth.jsonl'
        df_train = pd.read_json(self.train_file, lines=True)
        df_truth = pd.read_json(self.truth_file, lines=True)
        self.size = df_train.shape[0]

        # Key the scores by id using every truth row. Indexing the truth file by
        # the instance count would break as soon as the two files differ in
        # length or row order.
        truth_by_id = dict(zip(df_truth['id'], df_truth['truthMean']))

        # postText is a list of strings; only its first element is used as the post.
        # An empty list yields '' instead of raising IndexError.
        self.corpus = [
            (post[0] if post else '', ' '.join(paragraphs), truth_by_id[instance_id])
            for instance_id, post, paragraphs in zip(
                df_train['id'], df_train['postText'], df_train['targetParagraphs'])
        ]

    

In [None]:
# web17 = Webis17('./data/clickbait17/')
# Parse the corpus from the shared-drive folder and report how many instances it holds.
dir = '/content/drive/Shareddrives/EECS 498-004 NLP Project - Clickbait/data/clickbait17/'
web17 = Webis17(dir)
num_data = len(web17.corpus)
print(num_data)

19538


In [None]:
# Sanity check: show the first corpus entry, a (post, text, truthMean) tuple.
print(web17.corpus[0])

('UK’s response to modern slavery leaving victims destitute while abusers go free', 'Thousands of modern slavery victims have\xa0not come forward, while others who have chosen to report their enslavers have ended up destitute as a result of insufficient support, say\xa0MPs “Inexcusable” failures in the UK’s system for dealing with modern slavery are\xa0leaving victims reduced to destitution while their abusers go free because they are not adequately supported to testify against them, an alarming report has warned. Thousands of\xa0victims\xa0have not come forward, while others who have chosen to give evidence against their enslavers have ended up destitute as a result of insufficient support, according to\xa0a report published\xa0today by\xa0the Work and Pensions Committee. It is estimated there are between 10,000 and 13,000 victims of modern slavery in the UK, but the report\xa0warns that failings in the current mechanism mean\xa0that once they are identified, they have no automatic fo

## **2 - Dataset Preprocessing - BERT Embedding**

### Download BERT

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel

# Fetch the "bert-base-uncased" tokenizer and encoder by name.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Store both under dir+'bert-base-uncased', the location the "Load BERT" cell reads from.
bert_tokenizer.save_pretrained(dir+'bert-base-uncased')
bert_model.save_pretrained(dir+'bert-base-uncased')

# Note: BERT accepts at most 512 tokens per input sequence.

### Load BERT

In [None]:
## load from files & tokenizer analysis

from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel

# Reload the tokenizer and encoder from the copy stored under dir+'bert-base-uncased'.
bert_tokenizer = BertTokenizer.from_pretrained(dir+'bert-base-uncased')
bert_model = BertModel.from_pretrained(dir+'bert-base-uncased')

# encode1 = torch.tensor(bert_tokenizer.encode(web17.corpus[0][0]))
# encode2 = torch.tensor(bert_tokenizer.encode(web17.corpus[1][0]))
# encode3 = torch.tensor(bert_tokenizer.encode(web17.corpus[2][0]))
# encodeAll = bert_tokenizer([web17.corpus[0][0], web17.corpus[1][0],web17.corpus[2][0]], padding=True,return_token_type_ids=False, return_attention_mask=False)['input_ids']

# encodeAll_crop = bert_tokenizer([web17.corpus[0][0], web17.corpus[1][0],web17.corpus[2][0]], padding=True, truncation=True,max_length=20,return_token_type_ids=False, return_attention_mask=False)['input_ids']


# print(encode1.shape)
# print(encode2.shape)
# print(encode3.shape)
# print([len(lst)  for lst in encodeAll])
# print([len(lst)  for lst in encodeAll_crop])


# print(encode1)
# print(encodeAll[0])
# print(encodeAll_crop[0])

# print(encode3)
# print(encodeAll[2])
# print(encodeAll_crop[2])

### data profiling

In [None]:
## extract data
# Split the (post, text, truthMean) tuples of web17.corpus into three parallel collections.
title_all = [data[0] for data in web17.corpus]
content_all = [data[1] for data in web17.corpus]
# The scores are fixed regression targets, so they must not take part in autograd.
score_all = torch.tensor([data[2] for data in web17.corpus])
# The "Load Data" cells read dir+'/scores.pt', but no cell visible in this notebook
# writes that file; persist the targets here so a fresh run can find them.
torch.save(score_all, dir+'/scores.pt')

In [None]:
# title profiling

# Tokenize every title without padding or truncation, then measure each one once.
title_all_tokenized_raw = bert_tokenizer(title_all,return_token_type_ids=False, return_attention_mask=False)['input_ids']
title_token_counts = [len(token_ids) for token_ids in title_all_tokenized_raw]
longest_title_id = np.argmax(title_token_counts)

print(max(title_token_counts))
print(f"Average # of tokens = {np.mean(title_token_counts)}")
print(f"max # of tokens = {max(title_token_counts)}")
print(f"ID of title with max # of tokens = {longest_title_id}")
# Show the longest title between two marker lines.
print("---the title---")
print(title_all[longest_title_id])
print("---the title---")

104
Average # of tokens = 17.628058143105743
max # of tokens = 104
ID of title with max # of tokens = 16508
---the title---
................
................
................
................
................
................
Okay, then...
---the title---


In [None]:
# content profiling
# Tokenize the full article texts (no padding, no truncation) and average their lengths.
content_all_tokenized_raw = bert_tokenizer(content_all,return_token_type_ids=False, return_attention_mask=False)['input_ids']
content_token_counts = list(map(len, content_all_tokenized_raw))
print(f"Average # of tokens = {np.mean(content_token_counts)}")

Average # of tokens = 791.2599037772546


In [None]:
# Length of the longest tokenized article.
print(f"max # of tokens = {max(map(len, content_all_tokenized_raw))}")

max # of tokens = 43357


### extract embeddings & divide train/val/test set

#### Raw

In [None]:
# All embeddings
# Tokenize every title to one common width: titles are truncated to at most 20 ids
# and shorter ones are padded up to the longest sequence in the batch.
# NOTE(review): the attention mask is not returned, so padded positions can only be
# recognised afterwards by their id (0 in the printed tensor) — confirm downstream use.
title_all_tokenized = bert_tokenizer(title_all, padding=True,truncation=True,max_length=20, return_token_type_ids=False, return_attention_mask=False, return_tensors="pt")['input_ids']
print(title_all_tokenized.shape)
print(title_all_tokenized)
# Cache the token ids so later cells can reload them without re-tokenizing.
torch.save(title_all_tokenized, dir+'titles_tokens.pt')

torch.Size([19538, 20])
tensor([[  101,  2866,  1521,  ...,  2489,   102,     0],
        [  101,  2023,  2003,  ...,     0,     0,     0],
        [  101,  1996,  1000,  ...,  1996,  2047,   102],
        ...,
        [  101,  2413,  2015,  ...,  2112,  1997,   102],
        [  101,  2821,  5076,  ...,     0,     0,     0],
        [  101,  2957, 11011,  ...,     0,     0,     0]])


In [None]:
train_size = 700
val_size = 100
outputs = bert_model(title_all_tokenized[:(train_size+val_size), :])
title_all_embed = outputs[0]  # The last hidden-state is the first element of the output tuple
print(title_all_embed.shape) # batchsize x # tokens of sent x embed_dim

torch.Size([800, 20, 768])


In [None]:
## tokenize paragraphs -> longformer

# from transformers import LongformerModel, LongformerTokenizer
# long_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
# long_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [None]:
# content_all_tokenized = bert_tokenizer(content_all, padding=True,return_token_type_ids=False, return_attention_mask=False, return_tensors="pt")['input_ids']
# print(content_all_tokenized.shape)

# outputs = bert_model(content_all_tokenized)
# content_all_embed = outputs[0]
# torch.save(content_all_embed, dir+'/contents.pt')

#### Process by patches

In [None]:
import torch
# Reload the cached title token ids from dir+'titles_tokens.pt'.
title_all_tokenized = torch.load(dir+'titles_tokens.pt')
print(title_all_tokenized.shape)

torch.Size([19538, 20])


In [None]:
import gc

# Embed the titles in patches of extract_size rows and write each patch to its own file.
# Only complete patches are handled here; the remainder is processed by the next cell.
num_data = title_all_tokenized.shape[0]
extract_size = 800
for i in range(num_data//extract_size):
    start, stop = extract_size*i, extract_size*(i+1)
    patch_tokens = title_all_tokenized[start:stop, :]
    # Mask out padding; without a mask BERT attends to the [PAD] positions as if they were text.
    patch_mask = (patch_tokens != bert_tokenizer.pad_token_id).long()
    # Pure feature extraction: no_grad avoids keeping an autograd graph per patch.
    with torch.no_grad():
        outputs = bert_model(patch_tokens, attention_mask=patch_mask)
    title_all_embed = outputs[0]  # The last hidden-state is the first element of the output tuple
    print(title_all_embed.shape) # batchsize x # tokens of sent x embed_dim
    print(f"From size {str(start)} to {str(stop)}")
    # save Data
    torch.save(title_all_embed, dir+'/titles_'+str(start)+'_'+str(stop))
    # Release the patch before the next iteration to keep peak memory low.
    del outputs
    del title_all_embed
    gc.collect()

torch.Size([800, 20, 768])
From size 0 to 800
torch.Size([800, 20, 768])
From size 800 to 1600
torch.Size([800, 20, 768])
From size 1600 to 2400
torch.Size([800, 20, 768])
From size 2400 to 3200
torch.Size([800, 20, 768])
From size 3200 to 4000
torch.Size([800, 20, 768])
From size 4000 to 4800
torch.Size([800, 20, 768])
From size 4800 to 5600
torch.Size([800, 20, 768])
From size 5600 to 6400
torch.Size([800, 20, 768])
From size 6400 to 7200
torch.Size([800, 20, 768])
From size 7200 to 8000
torch.Size([800, 20, 768])
From size 8000 to 8800
torch.Size([800, 20, 768])
From size 8800 to 9600
torch.Size([800, 20, 768])
From size 9600 to 10400
torch.Size([800, 20, 768])
From size 10400 to 11200
torch.Size([800, 20, 768])
From size 11200 to 12000
torch.Size([800, 20, 768])
From size 12000 to 12800
torch.Size([800, 20, 768])
From size 12800 to 13600
torch.Size([800, 20, 768])
From size 13600 to 14400
torch.Size([800, 20, 768])
From size 14400 to 15200
torch.Size([800, 20, 768])
From size 15200

In [None]:
# last portion

# Rows left over after the complete patches of extract_size.
num_patchs = num_data//extract_size
tail_tokens = title_all_tokenized[(extract_size*num_patchs):, :]
# Mask out padding; without a mask BERT attends to the [PAD] positions as if they were text.
tail_mask = (tail_tokens != bert_tokenizer.pad_token_id).long()
# Pure feature extraction: no_grad avoids keeping an autograd graph.
with torch.no_grad():
    outputs = bert_model(tail_tokens, attention_mask=tail_mask)
title_all_embed = outputs[0]  # The last hidden-state is the first element of the output tuple
print(title_all_embed.shape) # batchsize x # tokens of sent x embed_dim
print(f"From size {str(extract_size*num_patchs)} to {str(num_data)}")
# save Data
torch.save(title_all_embed, dir+'/titles_'+str(extract_size*num_patchs)+'_'+str(num_data))

del outputs
del title_all_embed
gc.collect()

torch.Size([338, 20, 768])
From size 19200 to 19538


501

#### Combine

In [None]:
# Reassemble the per-patch embedding files into one (num_data, 20, 768) tensor.
# The patch count is derived here so this cell does not depend on a variable
# left behind by the "last portion" cell.
num_patchs = num_data//extract_size
Xt = torch.zeros(num_data, 20, 768)
for i in range(num_patchs):
    start, stop = extract_size*i, extract_size*(i+1)
    Xt[start:stop, :, :] = torch.load(dir+'/titles_'+str(start)+'_'+str(stop))
# The final, shorter patch covers the rows after the last complete one.
Xt[extract_size*num_patchs:,:,:] = torch.load(dir+'/titles_'+str(extract_size*num_patchs)+'_'+str(num_data))

print(Xt.shape)
# print(Xt[-10:,:,:])


torch.Size([19538, 20, 768])


In [None]:
# Persist the combined title embeddings; the "Load Data" cells read this file.
torch.save(Xt, dir+'/titles_all.pt')

## **3 - Load Data: Ready for Training!**

### All (20 tokens)

In [None]:
# load data
import torch
from torch.utils.data import TensorDataset, DataLoader

dir = '/content/drive/Shareddrives/EECS 498-004 NLP Project - Clickbait/data/clickbait17/'
# Cached title embeddings and their target scores.
Xt_all = torch.load(dir+'/titles_all.pt')
yt_all = torch.load(dir+'/scores.pt')
for loaded in (Xt_all, yt_all):
    print(loaded.shape)

# Sequential split: first train_size rows, next val_size rows, remainder for testing.
num_data = Xt_all.shape[0]
train_size, val_size = 16000, 2000
test_size = num_data - train_size - val_size
batch_size = 64

train_rows = slice(None, train_size)
val_rows = slice(train_size, train_size+val_size)
test_rows = slice(train_size+val_size, None)

# Every token embedding of each title is kept as the model input.
train_set = TensorDataset(Xt_all[train_rows, :, :], yt_all[train_rows])
val_set = TensorDataset(Xt_all[val_rows, :, :], yt_all[val_rows])
test_set = TensorDataset(Xt_all[test_rows, :, :], yt_all[test_rows])

# The loaders keep dataset order (no shuffling); train() and test() compare their
# collected predictions against the targets position by position.
train_dataloader = DataLoader(train_set, batch_size=batch_size)
val_dataloader = DataLoader(val_set, batch_size=batch_size)
test_dataloader = DataLoader(test_set, batch_size=batch_size)

torch.Size([19538, 20, 768])
torch.Size([19538])


### Only [CLS]

In [None]:
# load data
from torch.utils.data import TensorDataset, DataLoader

dir = '/content/drive/Shareddrives/EECS 498-004 NLP Project - Clickbait/data/clickbait17/'
# Cached title embeddings and their target scores.
Xt_all = torch.load(dir+'/titles_all.pt')
yt_all = torch.load(dir+'/scores.pt')
for loaded in (Xt_all, yt_all):
    print(loaded.shape)

# Sequential split: first train_size rows, next val_size rows, remainder for testing.
num_data = Xt_all.shape[0]
train_size, val_size = 16000, 2000
test_size = num_data - train_size - val_size
batch_size = 64

train_rows = slice(None, train_size)
val_rows = slice(train_size, train_size+val_size)
test_rows = slice(train_size+val_size, None)

# Only the embedding at token position 0 (the [CLS] slot) is kept for each title.
train_set = TensorDataset(Xt_all[train_rows, 0, :], yt_all[train_rows])
val_set = TensorDataset(Xt_all[val_rows, 0, :], yt_all[val_rows])
test_set = TensorDataset(Xt_all[test_rows, 0, :], yt_all[test_rows])

# The loaders keep dataset order (no shuffling); train() and test() compare their
# collected predictions against the targets position by position.
train_dataloader = DataLoader(train_set, batch_size=batch_size)
val_dataloader = DataLoader(val_set, batch_size=batch_size)
test_dataloader = DataLoader(test_set, batch_size=batch_size)

torch.Size([19538, 20, 768])
torch.Size([19538])


## **4 - Model 1 - Simple LSTM**

### Model Architecture


In [None]:
import torch
import torch.nn as nn
import numpy as np

class LSTM(nn.Module):
    '''
    Bidirectional LSTM regressor that maps one embedding vector per example
    to a score in (0, 1).

    The input vector is treated as a sequence of length 1, run through the
    bidirectional LSTM, and followed by two linear layers and a sigmoid.
    '''
    def __init__(self, batch_size, num_tokens, embed_dim, hidden_dim,  n_layers = 1, dropout = 0.0):
        '''
        Args:
            batch_size: accepted for interface compatibility; not used.
            num_tokens: accepted for interface compatibility; not used.
            embed_dim: size of each input embedding vector.
            hidden_dim: LSTM hidden units per direction.
            n_layers: number of stacked LSTM layers.
            dropout: dropout applied between stacked LSTM layers.
        '''
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm=nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.flatten = nn.Flatten(1)
        # self.fc1=nn.Linear(num_tokens*hidden_dim, 64)
        # self.fc1=nn.Linear(num_tokens*hidden_dim, 1)
        # take CLS token, bidirectional -> 2*hidden_dim features
        self.fc1=nn.Linear(2*hidden_dim, 64)

        self.fc2=nn.Linear(64, 1)
        
    def forward(self, x, hidden):
        '''
        Args:
            x: batch_size x embed_dim (one vector per example).
            hidden: (h_0, c_0) tuple as produced by init_hidden.

        Returns:
            (out, hidden): out is batch_size x 1 with values in (0, 1);
            hidden is the LSTM's final (h_n, c_n).
        '''
        # Add a sequence axis of length 1: batch_size x 1 x embed_dim.
        lstm_out, hidden = self.lstm(x.unsqueeze(1), hidden) # batch_size x 1 x (2*hidden_dim)

        # Drop only the sequence axis. A bare squeeze() would also drop the batch
        # axis whenever a batch holds a single example.
        flat = lstm_out.squeeze(1) # batch_size x (2*hidden_dim)

        out1 = self.fc1(flat) # batch_size x 64
        out2 = self.fc2(torch.relu(out1)) # batch_size x 1
        out = torch.sigmoid(out2)

        # # single layer
        # out = torch.sigmoid(out1)
        return out, hidden
    
    def init_hidden(self, batch_size):
        '''
        Build zero-filled (h_0, c_0) states for a batch of the given size.

        The states share dtype and device with the model's parameters, so no
        notebook-level `device` variable is required.
        '''
        weight = next(self.parameters())
        # bidirectional -> 2 states per layer
        state_shape = (self.n_layers*2, batch_size, self.hidden_dim)
        hidden = (weight.new_zeros(state_shape), weight.new_zeros(state_shape))
        return hidden

def init_weights(m):
    '''
    Initialize weights

    Intended for use with `module.apply(init_weights)`. For every nn.Linear
    layer the weight gets Xavier-uniform values and the bias, if the layer has
    one, is set to zero. All other module types are left untouched.
    '''
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False carry bias=None.
        if m.bias is not None:
            m.bias.data.fill_(0.0)

In [None]:
# load GPU
# Fall back to the CPU when no CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
# get_device_name raises when CUDA is unavailable, so only query it on GPU runs.
if device == "cuda":
    print(torch.cuda.get_device_name(0))

Using cuda device
Tesla T4


### Hyperparameters

In [None]:


hidden_dim = 10 # LSTM hidden units per direction
# Xt_all is (num examples, num tokens, embedding size); the first axis is not needed here.
_ , num_tokens, embed_dim = Xt_all.shape
# dropout = 0.0
dropout = 0.2

# Two stacked bidirectional LSTM layers, trained as a regressor with MSE loss and Adam.
model = LSTM(batch_size, num_tokens, embed_dim, hidden_dim, n_layers=2, dropout = dropout).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

from torch.optim.lr_scheduler import ReduceLROnPlateau # learning rate scheduler
# 'min' mode with patience=0: the learning rate is multiplied by 0.25 after an epoch whose
# monitored value does not improve by the 0.05 threshold, and never drops below 3e-5.
lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.25, patience=0, threshold=0.05,min_lr=3e-5, verbose=True)

# init_weights only touches nn.Linear layers; being the last expression, the model repr is displayed.
model.apply(init_weights)



LSTM(
  (lstm): LSTM(768, 10, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=20, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)

### Training & Testing

#### Helper functions

In [None]:
from sklearn.metrics import f1_score
from scipy.stats import pearsonr

### Training ###
def train(train_dataloader, y_truth, model, loss_fn, optimizer, mute = False):
    '''
    Run one training epoch and report the loss and accuracy over the whole epoch.

    Args:
        train_dataloader: yields (X, y) batches. It must keep dataset order,
            because the collected predictions are compared with `y_truth`
            position by position.
        y_truth: 1-D tensor with the target of every training example, in
            dataloader order.
        model: module exposing `init_hidden(batch_size)` and
            `forward(X, hidden) -> (pred, hidden)`.
        loss_fn: loss taking (prediction, target).
        optimizer: optimizer over the model's parameters.
        mute: when True, nothing is printed.

    Returns:
        1-D float64 CPU tensor with the prediction made for every example
        during the epoch (detached from autograd).
    '''
    model.train()
    # Follow the model's own device instead of relying on a notebook-level global.
    run_device = next(model.parameters()).device

    size = len(train_dataloader.dataset)

    batch_preds = []
    for batch, (X, y) in enumerate(train_dataloader):
        hidden = model.init_hidden(X.shape[0])
        X, y = X.to(run_device), y.to(run_device)

        optimizer.zero_grad()

        # Compute prediction error
        pred, hidden = model(X, hidden)
        # Flatten to 1-D explicitly; squeeze() would produce a 0-d tensor for a
        # batch that holds a single example.
        pred = pred.reshape(-1)
        # Detach before storing so the autograd graph of every batch is not
        # kept alive for the whole epoch.
        batch_preds.append(pred.detach().cpu())
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()

        if batch % 20 == 0 and not mute:
            current = batch * len(X)
            print(f"Training loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

    if batch_preds:
        y_pred_train = torch.cat(batch_preds).to(torch.float64)
    else:
        y_pred_train = torch.empty(0, dtype=torch.float64)
    y_truth = y_truth.detach().cpu()
    # Use one dtype on both sides of the loss.
    performance = loss_fn(y_pred_train, y_truth.to(y_pred_train.dtype))
    # Binary accuracy at a 0.5 decision threshold.
    clf_performance = ((y_pred_train>0.5)==(y_truth>0.5)).float().mean()

    if not mute:
        print(f"Training Loss: {performance}")
        print(f"Training Classifier Accuracy: {clf_performance}")
    return y_pred_train

### Testing ###
def test(val_dataloader, y_truth, model, loss_fn, lr_scheduler, mute = False, mode = 0):
    '''
    Evaluate the model on a dataloader and report loss, accuracy, F1 and Pearson r.

    mode = 0: validation when training (lr_scheduler)
    mode = 1: validation
    mode = 2: test

    Args:
        val_dataloader: yields (X, y) batches in dataset order; the batch
            targets are ignored and `y_truth` is used instead.
        y_truth: 1-D tensor with the target of every example, in dataloader order.
        model: module exposing `init_hidden(batch_size)` and
            `forward(X, hidden) -> (pred, hidden)`.
        loss_fn: loss taking (prediction, target).
        lr_scheduler: stepped with the computed loss when mode == 0 only.
        mute: when True, nothing is printed.
        mode: selects the printed labels and whether the scheduler is stepped.

    Returns:
        The loss between all predictions and `y_truth`.
    '''
    model.eval()
    # Follow the model's own device instead of relying on a notebook-level global.
    run_device = next(model.parameters()).device

    batch_preds = []
    # Evaluation needs no gradients; no_grad avoids building autograd graphs.
    with torch.no_grad():
        for X, _ in val_dataloader:
            hidden_val = model.init_hidden(X.shape[0])
            pred, hidden_val = model(X.to(run_device), hidden_val)
            # Flatten to 1-D explicitly; squeeze() would produce a 0-d tensor
            # for a batch that holds a single example.
            batch_preds.append(pred.reshape(-1).cpu())

    if batch_preds:
        y_pred_val = torch.cat(batch_preds).to(torch.float64)
    else:
        y_pred_val = torch.empty(0, dtype=torch.float64)
    y_truth = y_truth.detach().cpu()
    # Use one dtype on both sides of the loss.
    performance = loss_fn(y_pred_val, y_truth.to(y_pred_val.dtype))
    if mode == 0:
        lr_scheduler.step(performance)

    # Binary labels at a 0.5 decision threshold.
    pred_labels = y_pred_val > 0.5
    true_labels = y_truth > 0.5
    clf_performance = (pred_labels == true_labels).float().mean()

    # f1_score expects (y_true, y_pred).
    f1_performance = f1_score(true_labels.float().numpy(), pred_labels.float().numpy())
    p_performance = pearsonr(y_pred_val.numpy(), y_truth.numpy())[0]
    if not mute:
        if mode == 2:
            print(f"Test Loss: {performance}")
            print(f"Test Accuracy: {clf_performance}")
            print(f"Test F1 Score: {f1_performance}")
            print(f"Test Pearson Coefficient: {p_performance}")
        else:
            print(f"Validation Loss: {performance}")
            print(f"Validation Accuracy: {clf_performance}")
            print(f"Validation F1 Score: {f1_performance}")
            print(f"Validation Pearson Coefficient: {p_performance}")

    return performance

#### On-Going

In [None]:
## Training & validation

### ESTIMATED TIME: 2
# num * 20 * 768 -> 1 min per batch -> 2 hr per epoch
# CLS -> num * 1 * 768, hidden = 10, bidirectional -> 8 min per epoch
###


epochs = 50
model.train()

# Lower validation loss is better. Starting from +inf guarantees that the first
# epoch is always stored as the best model, whatever its loss is.
best_val_performance = float('inf')
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, yt_all[:train_size], model, loss_fn, optimizer)
    # Default mode=0: test() also steps lr_scheduler with the validation loss.
    val_performance = test(val_dataloader, yt_all[train_size:train_size+val_size], model, loss_fn, lr_scheduler)

    # Checkpoint whenever the validation loss improves.
    if val_performance < best_val_performance:
        best_val_performance = val_performance
        print(f'NEW BEST MODEL! Performance: {best_val_performance}')
        torch.save(model, dir+'/best_model')
print("Done!")

Epoch 1
-------------------------------
Training loss: 0.032313  [    0/16000]
Training loss: 0.021919  [ 1280/16000]
Training loss: 0.030513  [ 2560/16000]
Training loss: 0.032846  [ 3840/16000]
Training loss: 0.026816  [ 5120/16000]
Training loss: 0.022634  [ 6400/16000]
Training loss: 0.018584  [ 7680/16000]
Training loss: 0.033550  [ 8960/16000]
Training loss: 0.020973  [10240/16000]
Training loss: 0.025444  [11520/16000]
Training loss: 0.028835  [12800/16000]
Training loss: 0.031509  [14080/16000]
Training loss: 0.023814  [15360/16000]
Training Loss: 0.029822561029702536
Training Classifier Accuracy: 0.8504999876022339
Epoch     2: reducing learning rate of group 0 to 7.5000e-05.
Validation Loss: 0.032548400412765674
Validation Accuracy: 0.847000002861023
NEW BEST MODEL! Performance: 0.032548400412765674
Epoch 2
-------------------------------
Training loss: 0.029771  [    0/16000]
Training loss: 0.022390  [ 1280/16000]
Training loss: 0.027395  [ 2560/16000]
Training loss: 0.03094

In [None]:
###
### Naming Rules: <>
###

# torch.save(model, dir+'model_20')

# CLS, num * 1 * 768, hidden = 10, bidirectional -> 8 min per epoch
# Saves the whole model object as it stands after the last epoch; the best-validation
# checkpoint is written separately to dir+'/best_model' by the training loop.
torch.save(model, dir+'model_CLS_10_bi')


In [None]:
import torch

dir = '/content/drive/Shareddrives/EECS 498-004 NLP Project - Clickbait/data/clickbait17/'

# The checkpoint holds the whole pickled model object, so no freshly constructed
# LSTM is needed; the LSTM class definition must already be in scope for unpickling.
# map_location keeps a checkpoint saved on a GPU loadable on the current device.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects full-model pickles — confirm against the installed version.
model = torch.load(dir+'/best_model', map_location=device)

loss_fn = nn.MSELoss()
# Optimizer and scheduler are rebuilt only because test() takes a scheduler argument.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

from torch.optim.lr_scheduler import ReduceLROnPlateau # learning rate scheduler
lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.25, patience=0, threshold=0.05,min_lr=3e-5, verbose=True)


In [None]:
# Report metrics for the reloaded model on the validation and test splits.
# mode 1 and mode 2 select the printed labels; the scheduler is stepped only in mode 0.
_ = test(val_dataloader, yt_all[train_size:train_size+val_size], model, loss_fn, lr_scheduler, mode = 1)
_ = test(test_dataloader, yt_all[train_size+val_size:], model, loss_fn, lr_scheduler, mode = 2)


Validation Loss: 0.03254840028757948
Validation Accuracy: 0.847000002861023
Validation F1 Score: 0.6592427616926503
Test Pearson Coefficient: 0.7251489583277191
Test Loss: 0.03121176758478534
Test Accuracy: 0.8582574725151062
Test F1 Score: 0.6736526946107785
Test Pearson Coefficient: 0.731791129078822
