In [2]:
!pip install --upgrade transformers



In [4]:
import os
import random

import numpy as np
import torch

# One seed shared by every random number generator used in this notebook.
SEED = 42

# Python-level sources of randomness.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)

# NumPy's global generator.
np.random.seed(SEED)

# PyTorch generators (CPU and every CUDA device).
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and turn off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
import pandas as pd

# Read the labelled LeetCode problem set (one row per problem, one column per topic).
df = pd.read_csv("../data/leetcode.csv")

# Preview ten randomly chosen rows.
df.sample(n=10)

Unnamed: 0,description,Array,Dynamic Programming,String,Math,Tree,Depth-first Search,Greedy,Hash Table,Binary Search,...,Random,Dequeue,Binary Search Tree,Suffix Array,Rolling Hash,Reservoir Sampling,Rejection Sampling,Memoization,OOP,Meet in the Middle,Unnamed: 22,Unnamed: 23
802,An array is monotonic if it is either monotone...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,,
1547,There is an authentication system that works w...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,,
467,"Given the `root` of a binary tree, return the ...",0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,,
59,"The set `[1, 2, 3, ..., n]` contains a total o...",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,,
52,"Given an integer array `nums`, find the contig...",1,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
543,"Given a characters array `tasks`, representing...",1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0,
25,"Given a sorted array nums, remove the duplicat...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,,
574,You are given an integer array `nums` with no ...,0,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0.0,0.0
1264,"Given a binary tree `root`, a node X in the tr...",0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0,
1343,Given a string `s` of lower and upper case Eng...,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.0,


In [5]:
import numpy as np
import torch
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the pre-trained BERT tokenizer (the model itself is created later).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sanity check: tokenize the first problem description and show both the
# word-piece tokens and the encoded ids (with special tokens added).
test_input = df['description'].iloc[0]
test_tokens = tokenizer.tokenize(test_input)
test_ids = tokenizer.encode(test_input, add_special_tokens=True, return_tensors='pt')
print(test_tokens)
print(test_ids)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['given', 'an', 'array', 'of', 'integers', '`', 'nu', '##ms', '`', 'and', 'an', 'integer', '`', 'target', '`', ',', 'return', 'indices', 'of', 'the', 'two', 'numbers', 'such', 'that', 'they', 'add', 'up', 'to', '`', 'target', '`', '.', 'you', 'may', 'assume', 'that', 'each', 'input', 'would', 'have', 'exactly', 'one', 'solution', ',', 'and', 'you', 'may', 'not', 'use', 'the', 'same', 'element', 'twice', '.', 'you', 'can', 'return', 'the', 'answer', 'in', 'any', 'order', '.', 'example', '1', ':', 'input', ':', 'nu', '##ms', '=', '[', '2', ',', '7', ',', '11', ',', '15', ']', ',', 'target', '=', '9', 'output', ':', '[', '0', ',', '1', ']', 'output', ':', 'because', 'nu', '##ms', '[', '0', ']', '+', 'nu', '##ms', '[', '1', ']', '=', '=', '9', ',', 'we', 'return', '[', '0', ',', '1', ']', '.', 'example', '2', ':', 'input', ':', 'nu', '##ms', '=', '[', '3', ',', '2', ',', '4', ']', ',', 'target', '=', '6', 'output', ':', '[', '1', ',', '2', ']', 'example', '3', ':', 'input', ':', 'nu', '##m

In [7]:
def split_data(df):
    """Split descriptions and topic labels into train / validation / test sets.

    The data is split 80/10/10: 20% is held out first and then halved into
    validation and test. Both splits use ``SEED`` so they are reproducible.

    Args:
        df: DataFrame with a ``description`` text column and one column per
            topic label.

    Returns:
        Tuple ``(train_texts, val_texts, test_texts,
        train_labels, val_labels, test_labels)`` where the text entries are
        lists of ``str`` and the label entries are 2-D arrays with one column
        per topic.
    """
    # Convert texts to a list of strings (guards against non-string cells).
    texts = [str(text) for text in df['description'].values]

    # Select label columns by name instead of by position. The loaded CSV
    # contains auto-named "Unnamed: N" columns (a saved index and trailing
    # empty columns holding NaN); slicing with iloc[:, 1:] would pull those --
    # and, when the index column is present, the description text itself --
    # into the label matrix.
    label_columns = [
        col for col in df.columns
        if col != 'description' and not str(col).startswith('Unnamed')
    ]
    labels = df[label_columns].values

    # First carve off 20% of the data, then halve it into validation and test.
    train_texts, held_texts, train_labels, held_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=SEED
    )
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        held_texts, held_labels, test_size=0.5, random_state=SEED
    )
    return train_texts, val_texts, test_texts, train_labels, val_labels, test_labels

In [9]:
# Materialise the three splits once; later cells read these module-level names.
(
    train_texts,
    val_texts,
    test_texts,
    train_labels,
    val_labels,
    test_labels,
) = split_data(df)

# Number of topic columns, taken from the width of the first label row.
first_label_row = train_labels[0]
num_topics = len(first_label_row)

In [10]:
class my_bert(BertModel):
    """BERT encoder with a small multi-label classification head.

    The hidden state of the first token is fed through a two-layer MLP that
    emits one logit per topic (``num_topics`` is read from module scope when
    the model is constructed). A sigmoid turns the logits into independent
    per-topic probabilities.
    """

    def __init__(self, config):
        """Build the encoder from ``config`` and attach the classifier head."""
        super().__init__(config)
        self.classifier = torch.nn.Sequential(
            # Use the encoder's configured width instead of a hard-coded 768
            # so the head also fits checkpoints with a different hidden size.
            torch.nn.Linear(config.hidden_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, num_topics),
        )
        self.activation = torch.nn.Sigmoid()
        # BCEWithLogitsLoss fuses sigmoid + binary cross-entropy and is
        # numerically more stable than Sigmoid followed by BCELoss.
        self.loss_fn = torch.nn.BCEWithLogitsLoss()

    def forward(self, *args, labels = None, **kwargs):
        """Run the encoder and the classification head.

        Args:
            *args: Positional arguments forwarded to ``BertModel.forward``
                (e.g. ``input_ids``).
            labels: Optional float tensor of multi-hot targets with one
                column per topic. Keyword-only, so that a positional
                ``input_ids`` is never mistaken for the labels.
            **kwargs: Keyword arguments forwarded to ``BertModel.forward``
                (e.g. ``attention_mask``).

        Returns:
            ``(loss, probs)`` when ``labels`` is given, otherwise ``(probs,)``.
        """
        outputs = super().forward(*args, **kwargs)
        # Classify from the hidden state of the first token of each sequence.
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        probs = self.activation(logits)
        if labels is not None:
            # The loss is computed on raw logits; probabilities are returned
            # only for prediction / metrics.
            loss = self.loss_fn(logits, labels)
            return loss, probs
        return (probs,)

In [11]:
def _encode_split(texts, labels):
    """Tokenize ``texts`` and bundle ids, attention mask and float labels.

    Returns the tokenizer output together with the resulting TensorDataset.
    """
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    label_tensor = torch.tensor(labels, dtype=torch.float32)
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_tensor)
    return encodings, dataset


# Each split is tokenized independently, so padding is computed per split.
train_encodings, train_dataset = _encode_split(train_texts, train_labels)
val_encodings, val_dataset = _encode_split(val_texts, val_labels)
test_encodings, test_dataset = _encode_split(test_texts, test_labels)

In [12]:
def train(batch_size = 16, num_epochs = 10):
    """Fine-tune ``my_bert`` on the training split and report validation metrics.

    Reads the module-level ``train_dataset`` and ``val_dataset``. After each
    epoch it prints the mean training loss, mean validation loss, exact-match
    accuracy (every topic of a sample predicted correctly) and macro F1.

    Args:
        batch_size: Number of samples per batch for both loaders.
        num_epochs: Number of passes over the training set.

    Returns:
        The fine-tuned model (on the device it was trained on).
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Validation needs no shuffling; metrics do not depend on batch order.
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # Use the classifier subclass: plain BertModel accepts no `labels`
    # argument and returns neither a loss nor topic probabilities.
    model = my_bert.from_pretrained('bert-base-uncased')
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for input_ids, attention_mask, labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Pass everything by keyword so no positional argument can be
            # bound to the model's `labels` parameter.
            loss, _ = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch mean is exact even when the
            # last batch is smaller.
            train_loss += loss.item() * len(input_ids)
        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        val_preds = []
        val_targets = []
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                loss, probs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += loss.item() * len(input_ids)
                val_preds.append(probs.cpu().numpy())
                # Collect targets from the same batches as the predictions so
                # the two arrays are guaranteed to be row-aligned.
                val_targets.append(labels.cpu().numpy())
        val_loss /= len(val_loader.dataset)

        # Use fresh local names: assigning to `val_labels` or `f1_score` here
        # would shadow the module-level objects and raise UnboundLocalError.
        y_true = np.vstack(val_targets).astype(int)
        y_pred = (np.vstack(val_preds) > 0.5).astype(int)
        # Exact-match accuracy: a sample counts only if every topic is right.
        accuracy = np.mean(np.all(y_true == y_pred, axis=1))
        macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        print(f'Epoch {epoch + 1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Accuracy: {accuracy:.4f} | F1 Score: {macro_f1:.4f}')

    return model


KeyboardInterrupt: 