In [1]:
!pip install transformers
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, plot_confusion_matrix
import numpy as np 
import pandas as pd 
import random
import os
import json
from tqdm.notebook import tqdm
torch.cuda.empty_cache()



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
seed = 100
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
EPOCH = 3 
BATCH_SIZE = 16
LEARNING_RATE = 1e-5
STEPS = 0
LENGTH = 512
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trans_model = 'bert-base-uncased'

In [4]:
df = pd.read_table('/content/drive/MyDrive/bert/train.tsv')
dfTest = pd.read_table('/content/drive/MyDrive/bert/test.tsv')
dfSub = pd.read_csv('/content/drive/MyDrive/bert/sampleSubmission.csv')

In [5]:
def boilerplatePreproc(data):
    jsonData = json.loads(data)
    try:
        x = jsonData['body']
        x = ' '.join(x.split(' ')[-400:])
    except:
        x = jsonData['title']
        
    return x
df['body'] = df.boilerplate.map(boilerplatePreproc)
df = df.dropna()
dfTest['body'] = dfTest.boilerplate.map(boilerplatePreproc)

In [6]:
x_train, x_val, y_train, y_val = train_test_split(df.body.values, df.label.values, test_size=0.15, random_state=seed, stratify=df.label.values)
tokenizer = BertTokenizer.from_pretrained(trans_model, do_lower_case=True)
encoded_train_data = tokenizer.batch_encode_plus(
    list(x_train), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length',
    max_length=LENGTH, 
    return_tensors='pt',
    truncation=True
)

encoded_val_data = tokenizer.batch_encode_plus(
    list(x_val), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=LENGTH, 
    return_tensors='pt',
    truncation=True
)
train_input_ids = encoded_train_data['input_ids']
attention_masks_train = encoded_train_data['attention_mask']
train_labels = torch.tensor(list(y_train))
val_input_ids = encoded_val_data['input_ids']
attention_masks_val = encoded_val_data['attention_mask']
val_labels = torch.tensor(list(y_val))
train_dataset = TensorDataset(train_input_ids, attention_masks_train, train_labels)
val_dataset = TensorDataset(val_input_ids, attention_masks_val, val_labels)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

In [7]:
def helper():
    model = BertForSequenceClassification.from_pretrained(trans_model)
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=2e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=STEPS, num_training_steps=len(train_dataloader) * EPOCH)  
    return model, optimizer, scheduler

In [8]:
def evaluate():
    model.eval()
    total_val_loss = 0
    predictions, true_vals = [], []
    for batch in val_dataloader: 
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2],
        }
        with torch.no_grad():        
            outputs = model(**inputs)  
        loss = outputs[0]
        logits = outputs[1]
        total_val_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)   
    avg_val_loss = total_val_loss/len(val_dataloader)   
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)           
    return avg_val_loss, predictions, true_vals

In [9]:
def trainer(EPOCH):
    total_train_loss = 0
    progress_bar = tqdm(train_dataloader, desc='EPOCH {:1d}'.format(EPOCH), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2],
        }       

        outputs = model(**inputs)
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.float().backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})   
    torch.save(model.state_dict(), f'finetuned_BERT_EPOCH_{EPOCH}.pth')
    tqdm.write(f'\nEPOCH {EPOCH}')
    avg_train_loss = total_train_loss/len(train_dataloader)            
    tqdm.write(f'Training loss: {avg_train_loss}')
    val_loss, predictions, true_vals = evaluate()
    

In [10]:
model, optimizer, scheduler = helper()
for i in tqdm(range(1, EPOCH+1)):
    trainer(i)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='EPOCH 1', max=393.0, style=ProgressStyle(description_widt…


EPOCH 1
Training loss: 0.4730224645016454


HBox(children=(FloatProgress(value=0.0, description='EPOCH 2', max=393.0, style=ProgressStyle(description_widt…


EPOCH 2
Training loss: 0.3978565185882966


HBox(children=(FloatProgress(value=0.0, description='EPOCH 3', max=393.0, style=ProgressStyle(description_widt…


EPOCH 3
Training loss: 0.32468659112244164



In [11]:
def predLabels(data):
    encoded_data = tokenizer.batch_encode_plus(
        data, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        padding='max_length', 
        max_length=LENGTH, 
        return_tensors='pt',
        truncation=True
    )
    
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']   
    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=1)   
    model.eval()   
    predictions = []
    
    for batch in dataloader:      
        batch = tuple(b.to(device) for b in batch)        
        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
        }
        with torch.no_grad():        
            outputs = model(**inputs)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)   
    predictions = np.concatenate(predictions, axis=0)           
    return np.argmax(predictions, axis=1).flatten()

In [12]:
x_val_list = list(x_val)
y_val_list = list(y_val)
pred = predLabels(x_val_list)

In [13]:
print(classification_report(y_val_list, pred))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80       540
           1       0.83      0.77      0.80       570

    accuracy                           0.80      1110
   macro avg       0.80      0.80      0.80      1110
weighted avg       0.80      0.80      0.80      1110



In [14]:
preds = predLabels(dfTest.body.to_list())

subDict = {
    'urlid': dfTest.urlid.to_list(),
    'label': preds,
}

In [15]:
submission_df = pd.DataFrame.from_dict(subDict)
submission_df.to_csv('submission.csv', index = False)