In [2]:
!pip install --upgrade transformers



In [3]:
import os
import random

import numpy as np
import torch

SEED = 42

torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [1]:
import pandas as pd

df = pd.read_csv("../data/leetcode.csv")

df.sample(10)

Unnamed: 0,description,Array,Dynamic Programming,String,Math,Tree,Depth-first Search,Greedy,Hash Table,Binary Search,...,Random,Dequeue,Binary Search Tree,Suffix Array,Rolling Hash,Reservoir Sampling,Rejection Sampling,Memoization,OOP,Meet in the Middle
587,"Given two integers `n` and `k`, you need to co...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1558,"You are given `coordinates`, a string that rep...",0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,"Given two strings `word1` and `word2`, return ...",0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1401,You are given two strings `a` and `b` of the s...,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
255,Given an array of integers `citations` where `...,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1561,There is a donuts shop that bakes donuts in ba...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1131,Given `head` which is a reference node to a si...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1071,Given an integer array `arr` and an integer `d...,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1111,"On a 2D plane, there are `n` points with integ...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
508,"Given a string `s`, reverse the order of chara...",0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
from transformers import AutoTokenizer

# using santacoder
bert_tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder")
bert_tokenizer.pad_token = bert_tokenizer.eos_token

In [16]:
inputs = df['description'].values
labels = df.iloc[:, 1:].values
print(inputs.shape, labels.shape)

(1571,) (1571, 43)


In [17]:
MAX_LEN = 512

In [21]:
tokenized_inputs = bert_tokenizer(
    inputs.tolist(),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)
input_ids = tokenized_inputs['input_ids']
attention_masks = tokenized_inputs['attention_mask']

# Print sentence 0, now as a list of IDs.
print('Original: ', tokenized_inputs['input_ids'][0])
print('* Token IDs:', tokenized_inputs['attention_mask'][0])
print('* Tokenized:', bert_tokenizer.decode(tokenized_inputs['input_ids'][0]))
print('* Attention_mask', tokenized_inputs['attention_mask'][0])

Original:  tensor([14894,   743,  1702,   457, 15304,   924, 10274,    63,   516,   743,
         5753,   924,  1466,  2415,   363,  6161,   457,   331,  4003,  6781,
         5869,   954,  5169,  1030,  1738,   404,   924,  1466,  4096,   185,
          185,  5815,  1437, 12704,   954,  2433,  1142,  6524,  1935, 15608,
         1874,  7805,    11,   516,  1481,  1437,   575,   855,   331,  3161,
         1586, 20301,    13,   185,   185,  5815,  1374,   363,   331,  6828,
          319,  1961,  2247,    13,   479,   185,  5719,   207,    16,    25,
          185,  1366,    25,  8675,   256,   436,    17,    11,    22,    11,
           16,    16,    11,    16,    20,   485,  1445,   256,   207,    24,
          185,  2292,    25,   436,    15,    11,    16,    60,   185,  2292,
           25, 25321,  8675,    58,    15,    60,   385,  8675,    58,    16,
           60,   515,   207,    24,    11,  1486,   363,   436,    15,    11,
          207,    16,   854,   185,   185,  5719,   2

In [22]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [23]:
def train_valid_split(input_ids, attention_masks, labels, batch_size = 16):
    train_inpts, valid_inpts, train_masks, valid_masks, train_labels, valid_labels = train_test_split(
        input_ids, attention_masks, labels, random_state=SEED, test_size=0.2
    )
    print(f'example train_input: {train_inpts[0]}')
    print(f'example train_mask: {train_masks[0]}')
    
    train_labels = torch.tensor(train_labels)
    valid_labels = torch.tensor(valid_labels)
    
    # dataloader 
    train_data = TensorDataset(train_inpts, train_masks, train_labels)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    validation_data = TensorDataset(valid_inpts, valid_masks, valid_labels)
    validation_dataloader = DataLoader(validation_data, shuffle=True, batch_size=batch_size)
    
    return train_dataloader, validation_dataloader

In [24]:
bert_train_dataloader, bert_validation_dataloader = train_valid_split(
    input_ids = torch.tensor(input_ids),
    attention_masks= torch.tensor(attention_masks),
    labels=labels,
    batch_size=16
)

example train_input: tensor([14894,   373,  1563,   370,    11,  2147,   331,  1541,   457,  4843,
         3329,    12,  3311, 21835,   557,   412,   276,  1231, 14129,   319,
          370,    11,   516,   363,   954,  1541, 33486,   924,    16,    15,
           61,    24,   385,   207,    22,  4096,   185,   185,    32,  1231,
         5143,   457,   373,  1563,   370,   438, 18921,   835, 20511,   207,
           15,   469,  2695,  6864,   637,   370,    13,   185,   185,    32,
         5005,   438, 21835,   557,   412,   276,   356,   665,   438,  3665,
          404,   331,  5005, 17654,    13,   185,   185,  7671, 14447,   924,
           32,    62,    16,    11,   425,    62,    17,    11,  2335,    63,
          516,   924,    33,    62,    16,    11,   609,    62,    17,    11,
         2335,    63,  1139,  4843,   356,  3379,   438,  3879,   924,    72,
           63,   408,  1982,   924,    32,    62,    72,   910,   609,    62,
           72,  4096,   479,   185,  5719, 

  input_ids = torch.tensor(input_ids),
  attention_masks= torch.tensor(attention_masks),


In [25]:
from transformers import BertModel 
import torch.nn as nn

class BertForSentenceLabeling(BertModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = self.config.num_labels
        self.classifier = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.loss = nn.BCEWithLogitsLoss()
    
    def forward(self, labels = None, **kwargs):
        outputs = super().forward(**kwargs)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.classifier(pooled_output)
        pooled_output = torch.sigmoid(pooled_output)
        
        if labels is not None:
            loss = self.loss(pooled_output, labels)
            return (loss, pooled_output)
        else:
            return (pooled_output,)

In [26]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

In [28]:
def get_optimizer_and_scheduler(model, total_steps, lr = 2e-5, weight_decay = 0.01):
    optimzer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = get_linear_schedule_with_warmup(
        optimzer,
        num_training_steps=int(total_steps * 0.1),
        num_training_steps=total_steps,
    )
    return optimzer, scheduler

SyntaxError: keyword argument repeated: num_training_steps (1984304725.py, line 6)

In [None]:
def get_accuracy(preds, labels):
    # multi-label classification
    preds = torch.round(preds)
    correct = (preds == labels).float()
    acc = correct.sum() / len(correct)
    return acc

In [None]:
def train_model(model, epochs, train_dataloader, validation_dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    total_steps = len(train_dataloader) * epochs
    optimizer, scheduler = get_optimizer_and_scheduler(model, total_steps)
    loss_values = []
    eval_accs = []
    
    for epoch in range(0, epochs):
        total_loss = 0.0
        model.train()
        
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            model.zero_grad()
            loss, _ = model(b_labels, input_ids=b_input_ids, attention_mask=b_input_mask)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_dataloader)
        loss_values.append(avg_train_loss)
        print(f"Epoch: {epoch}, Train loss: {avg_train_loss}")
        
        print("Validation...")
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            with torch.no_grad():
                loss, logits = model(b_labels, input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = get_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
        avg_eval_accuracy = eval_accuracy / nb_eval_steps
        print(f"Validation Accuracy: {avg_eval_accuracy}")
        eval_accs.append(avg_eval_accuracy)
    print("Training complete!")
    return loss_values, eval_accs

In [27]:
my_bert = BertForSentenceLabeling.from_pretrained(
    "bigcode/santacoder",
    num_labels=43,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

You are using a model of type gpt2 to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
loss_values, eval_accs = train_model(
    model=my_bert,
    epochs=10,
    train_dataloader=bert_train_dataloader,
    validation_dataloader=bert_validation_dataloader,
)