In [None]:
!pip install transformers -q

In [None]:
from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from pprint import  pprint
import re
from collections import Counter
import pandas as pd

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Task 3: load the tab-separated train/validation splits and encode the three
# string labels as integer class ids.
LABEL_TO_ID = {"Lit-News_mentions": 0, "Nonpersonal_reports": 1, "Self_reports": 2}

train_data = pd.read_csv('train.tsv', sep='\t')
valid_data = pd.read_csv('valid.tsv', sep='\t')

# Assign the encoded column back to the frame. Calling an in-place method on
# `df['label']` is a chained assignment: with pandas Copy-on-Write it modifies a
# temporary Series and leaves the DataFrame untouched.
# Unknown labels become NaN under `.map`, so `.astype('int64')` fails fast on them.
train_data['label'] = train_data['label'].map(LABEL_TO_ID).astype('int64')
valid_data['label'] = valid_data['label'].map(LABEL_TO_ID).astype('int64')

In [None]:
train_data

In [None]:
# Tokenizer for the bert-large-uncased checkpoint; do_lower_case=True is passed
# explicitly so the input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    "bert-large-uncased",
    do_lower_case=True,
)


In [None]:
# Tokenize all tweets of each split in one call. Calling the tokenizer directly
# is the supported entry point (`batch_encode_plus` is documented as deprecated).
# Sequences are truncated to 128 tokens and padded to the longest sequence of
# the split, so the two splits may end up with different widths.
ENCODE_KWARGS = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")

train_enc = tokenizer(train_data['tweet'].tolist(), **ENCODE_KWARGS)
# NOTE: `test_enc` holds the encoded *validation* split (valid.tsv).
test_enc = tokenizer(valid_data['tweet'].tolist(), **ENCODE_KWARGS)

In [None]:
train_enc.keys()

In [None]:
tokenizer.decode(train_enc["input_ids"][3])

In [None]:
tokenizer.decode(train_enc["input_ids"][-3])

In [None]:
train_enc["attention_mask"]

In [None]:
train_enc["token_type_ids"]

In [None]:
train_enc["input_ids"]

In [None]:
# Pull the three model inputs out of each encoding. The "test_*" names hold the
# validation split, since `test_enc` was built from `valid_data`.
train_input_ids = train_enc["input_ids"]
train_type_ids = train_enc["token_type_ids"]
train_attn_mask = train_enc["attention_mask"]

test_input_ids = test_enc["input_ids"]
test_type_ids = test_enc["token_type_ids"]
test_attn_mask = test_enc["attention_mask"]

In [None]:
train_input_ids.shape, train_type_ids.shape, train_attn_mask.shape

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Below we define a function to create train, test & valid dataloaders in Pytorch

batch_size = 32

def get_dataloader(input_ids, type_ids, attn_mask, y, shuffle=True):
    """Bundle aligned tensors into a TensorDataset and return a batched DataLoader.

    Args:
        input_ids: Token-id tensor; its first dimension indexes the examples.
        type_ids: Token-type-id tensor aligned with ``input_ids``.
        attn_mask: Attention-mask tensor aligned with ``input_ids``.
        y: Label tensor with one entry per example.
        shuffle: When True (default) batches are drawn with ``RandomSampler``;
            when False examples are served in dataset order with
            ``SequentialSampler``, which keeps predictions aligned with the
            source rows.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, type_ids, attn_mask, y)``
        batches of the module-level ``batch_size``.
    """
    data = TensorDataset(input_ids, type_ids, attn_mask, y)
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)


def _labels_to_tensor(frame):
    """Return ``frame['label']`` as an int64 tensor.

    Going through a NumPy array avoids depending on the Series' index or on an
    object dtype when handing the values to ``torch.tensor``.
    """
    return torch.tensor(frame['label'].astype('int64').to_numpy())


# Training batches are shuffled; the validation loader keeps dataset order.
train_datalaoder = get_dataloader(train_input_ids, train_type_ids, train_attn_mask, _labels_to_tensor(train_data))
test_datalaoder = get_dataloader(test_input_ids, test_type_ids, test_attn_mask, _labels_to_tensor(valid_data), shuffle=False)

In [None]:
# Sanity check: take one batch from the training loader and print the shape of
# each of its four tensors (input ids, type ids, attention mask, labels).

for batch in train_datalaoder:
    input_ids, type_ids, attn_mask, y = batch
    print(*(tensor.shape for tensor in (input_ids, type_ids, attn_mask, y)))
    break


In [None]:
y

In [None]:
class BERTClassifier(nn.Module):
  """Transformer encoder followed by a single linear classification layer.

  Args:
    transformer: Encoder module. It is called with ``input_ids``,
      ``attention_mask`` and ``token_type_ids`` keyword arguments and must
      return a mapping that contains a ``"pooler_output"`` entry.
    num_classes: Number of output logits. Defaults to 3, matching the three
      labels encoded in this notebook (ids 0, 1, 2). Pass ``num_classes=4`` to
      rebuild the earlier 4-logit head, e.g. to load a state dict saved with it.

  The linear layer's input width is read from ``transformer.config.hidden_size``
  when that attribute exists, and falls back to 1024 otherwise.
  """

  def __init__(self, transformer, num_classes=3):
    super().__init__()
    config = getattr(transformer, "config", None)
    hidden_size = getattr(config, "hidden_size", 1024)
    self.transformer = transformer
    self.linear = nn.Linear(hidden_size, num_classes)

  def forward(self, in_ids, type_ids, attn_mask):
    """Return the classification logits computed from the encoder's pooled output."""
    encoded = self.transformer(input_ids=in_ids, attention_mask=attn_mask,
                               token_type_ids=type_ids)

    return self.linear(encoded["pooler_output"])

In [None]:
def count_parameters(model):
  """Return the total number of elements across parameters that require gradients."""
  trainable = 0
  for param in model.parameters():
    if param.requires_grad:
      trainable += param.numel()
  return trainable


In [None]:

#model = BERTClassifier(transformer).to(device)
from transformers import BertForSequenceClassification, AdamW, BertConfig


# Load the pretrained encoder that the classifier wraps.
transformer = AutoModel.from_pretrained("bert-large-uncased")


# `.to(device)` places the model on the GPU when one is available and on the CPU
# otherwise. No unconditional `model.cuda()` here: it would fail on CPU-only machines.
model = BERTClassifier(transformer).to(device)


print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
model

In [None]:
# Keep only parameters whose name contains "pooler" or "linear" trainable and
# freeze every other parameter; each parameter's status is printed as we go.
for name, param in model.named_parameters():
  param.requires_grad = ("pooler" in name) or ("linear" in name)
  print(name, param.shape, param.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
from tqdm import tqdm
# Iterate over the whole training loader, moving every batch to `device`.
# The values printed afterwards come from the last batch yielded.
for ix, batch in tqdm(enumerate(train_datalaoder)):
    input_ids, type_ids, attn_mask, y = batch = tuple(tensor.to(device) for tensor in batch)
print(input_ids.shape, type_ids, attn_mask.shape, sep="\n")

In [None]:
# Training function: performs forward propagation, backpropagation & optimization.
# Gradient clipping rescales the gradients so that their norm never exceeds `clip`.

def train(model, dataloader, optimizer, criterion, clip=1.0):
    """Run one training pass over ``dataloader``.

    Relies on the module-level ``device``, ``tqdm`` and ``metrics`` names.

    Args:
        model: Module called as ``model(input_ids, type_ids, attn_mask)``.
        dataloader: Iterable of ``(input_ids, type_ids, attn_mask, y)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(output, y)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``(mean_batch_loss, macro_f1)`` computed over all batches seen.
    """
    model.train()

    ep_t_loss = 0
    batch_num = 0
    pred, tgt = [], []
    # Wrap the dataloader itself rather than `enumerate(dataloader)`: tqdm can
    # then read the loader's length and display a total.
    for batch in tqdm(dataloader):
        input_ids, type_ids, attn_mask, y = tuple(t.to(device) for t in batch)

        optimizer.zero_grad()
        output = model(input_ids, type_ids, attn_mask)
        loss = criterion(output, y)
        loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        ep_t_loss += loss.item()
        batch_num += 1
        # Predicted class = index of the largest logit along the last dimension.
        pred.extend(torch.argmax(output, -1).tolist())
        tgt.extend(y.tolist())

    return ep_t_loss/batch_num, metrics.f1_score(tgt, pred, average='macro')

# Evaluation function: Calculates loss on the validation data.
from sklearn import metrics

def evaluate(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Relies on the module-level ``device`` and ``metrics`` names.

    Returns:
        ``(mean_batch_loss, macro_f1, predictions, targets)`` where the last two
        are flat Python lists in the order the batches were served.
    """
    model.eval()

    total_loss = 0
    n_batches = 0
    pred, tgt = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, type_ids, attn_mask, y = tuple(t.to(device) for t in batch)

            output = model(input_ids, type_ids, attn_mask)
            total_loss += criterion(output, y).item()
            n_batches += 1

            # Predicted class = index of the largest logit along the last dimension.
            pred.extend(torch.argmax(output, -1).tolist())
            tgt.extend(y.tolist())

    macro_f1 = metrics.f1_score(tgt, pred, average='macro')
    return total_loss/n_batches, macro_f1, pred, tgt

In [None]:
# Cross-entropy loss over the classifier logits, optimized with AdamW (lr 2e-5).
criterion = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

In [None]:
# Bookkeeping for the training loop: number of epochs, the lowest validation
# loss seen so far, and the per-epoch train / validation loss histories.
N_EPOCHS = 12
best_valid_loss = float('inf')
tot_t_loss = []
tot_v_loss = []

In [None]:
import time
from tqdm import tqdm

# Train for N_EPOCHS. After every epoch the model is scored on the validation
# loader; the weights with the lowest validation loss are kept, together with
# the predictions/targets of that epoch.
for epoch in tqdm(range(N_EPOCHS)):

    tr_l, tr_f1 = train(model, train_datalaoder, optim, criterion)
    tot_t_loss.append(tr_l)

    val_l, val_f1, pred, tgt = evaluate(model, test_datalaoder, criterion)
    tot_v_loss.append(val_l)

    if val_l < best_valid_loss:
        best_valid_loss = val_l
        best_pred, best_tgt = pred, tgt
        torch.save(model.state_dict(), 'model_least_loss.pt')
        print("\nBest Model Saved !!")
    elif epoch % 3 == 0:
        # Periodic snapshot, taken only on every third (0-based) epoch that did
        # not improve the validation loss.
        torch.save(model.state_dict(), 'model_checkpoint_'+str(epoch)+'.pt')
        # "\n" (not the invalid escape "\C") so the message starts on a new line.
        print("\nCheckpoint Model Saved !!")
    print("\n")
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Total Loss: {tr_l:.3f} | Train F1: {tr_f1:.3f}')
    print(f'\tVal. Total Loss: {val_l:.3f} | Valid F1: {val_f1:.3f}')
    print("_________________________________________________________________")

In [None]:
# Per-class report for the epoch that achieved the lowest validation loss
# (best_tgt / best_pred are stored by the training loop at that epoch).
print(metrics.classification_report(best_tgt, best_pred))

bert-base-uncased, 12 epochs - 0.95

bert-large-uncased, 12 epochs - 0.94

bertweet-covid19-base-uncased 12 epochs - 0.96

digitalepidemiologylab/covid-twitter-bert 12 epochs - 0.97

COVID-SciBERT 12 epochs - 0.97


