In [2]:
import os
import re
import string
import json
import numpy as np
import pandas as pd
import random
from sklearn import metrics, model_selection
from bs4 import BeautifulSoup
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [7]:
# Load the dataset; later cells read `question_text`, `input_outputs` and
# `algo_tags` from it.
df = pd.read_csv(filepath_or_buffer="../datasets/CPTALGOML.csv")

In [8]:
# Join the statement and its I/O examples into one text field, separated by a
# newline, then keep only the model input and the raw tag string.
df["qns"] = df["question_text"].add("\n").add(df["input_outputs"])
df = df.loc[:, ["qns", "algo_tags"]]
df.head()

Unnamed: 0,qns,algo_tags
0,This is the hard version of the problem. The o...,['math']
1,This is the simple version of the problem. The...,"['greedy', 'math']"
2,You are given an array a consisting of n zeros...,"['brute force', 'search']"
3,Timofey has an apple tree growing in his garde...,['math']
4,Vanya really likes math. One day when he was s...,['math']


In [9]:
# Draw a reproducible 90/10 train/dev split over row positions.
# `random.sample` needs a sequence: sampling from a set is deprecated since
# Python 3.9 and rejected from 3.11, so sample from `range` instead.
# A dedicated `random.Random` keeps the split independent of global RNG state.
SPLIT_SEED = 42
n_rows = df.shape[0]
idxs = set(range(n_rows))
train_idx = set(random.Random(SPLIT_SEED).sample(range(n_rows), k=int(n_rows * 0.9)))
len(train_idx)

5829

In [10]:
# Materialise each split as an independent frame. Sorting the positions gives
# a stable row order (set iteration order is not guaranteed), and `.copy()`
# lets later cells add columns without pandas treating the frame as a slice
# of `df` (SettingWithCopy).
df_train = df.iloc[sorted(train_idx), :].copy()
df_dev = df.iloc[sorted(idxs.difference(train_idx)), :].copy()

df_train.shape, df_dev.shape

((5829, 2), (648, 2))

In [11]:
# Read the tag-name -> integer mapping that defines the label set.
with open("../datasets/algo_classes.json") as f:
    algo_mapping = json.loads(f.read())

algo_mapping

{'brute force': 0,
 'dynamic programming': 1,
 'greedy': 2,
 'math': 3,
 'search': 4}

In [12]:
def _add_tag_indicators(frame, mapping, tags_col="algo_tags"):
    """Return a copy of `frame` with one 0/1 indicator column per tag.

    Args:
        frame: DataFrame whose `tags_col` holds strings such as
            "['greedy', 'math']".
        mapping: iterable of tag names; one int16 column is added per name,
            in iteration order. Only the names are used, so a column is
            always addressed by its tag and never by a positional offset.
        tags_col: name of the column holding the raw tag string.

    Returns:
        A new DataFrame; `frame` itself is left unmodified, which makes the
        cell safe to re-run.
    """
    # Splitting on the quote character isolates every tag name as its own piece.
    tag_sets = frame[tags_col].map(lambda raw: set(raw.split("'")))
    indicators = {
        tag: tag_sets.map(lambda present, tag=tag: tag in present).astype(np.int16)
        for tag in mapping
    }
    return frame.assign(**indicators)


# Both splits go through the same helper so their label columns share a dtype.
df_train = _add_tag_indicators(df_train, algo_mapping)
df_dev = _add_tag_indicators(df_dev, algo_mapping)

In [13]:
# Renumber both splits from zero, discarding the index inherited from `df`.
df_train, df_dev = (split.reset_index(drop=True) for split in (df_train, df_dev))

In [14]:
# Show (rows, columns) for the train and dev splits.
tuple(split.shape for split in (df_train, df_dev))

((5829, 7), (648, 7))

In [15]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [16]:
# Hyper-parameters a reader may want to tune, gathered in one place.
SEED = 42
MAX_LEN = 500              # length every example is truncated/padded to
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 15
LEARNING_RATE = 2e-5

# Seed the RNGs before the model and loaders are built, so the classifier
# head initialisation and batch shuffling are repeatable between runs
# (GPU kernels may still introduce small non-determinism).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [17]:
# Every column other than the text and the raw tag string is a label column.
target_cols = list(
    filter(lambda name: name not in ('qns', 'algo_tags'), df_train.columns)
)
target_cols

['brute force', 'dynamic programming', 'greedy', 'math', 'search']

In [18]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one question per item.

    Args:
        df: DataFrame with a ``qns`` text column and one column per label.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_len: length every encoded sequence is truncated/padded to.
        target_columns: label column names. When omitted, the notebook-level
            ``target_cols`` list is used, as before.
    """

    def __init__(self, df, tokenizer, max_len, target_columns=None):
        if target_columns is None:
            target_columns = target_cols
        self.df = df
        self.max_len = max_len
        self.text = df.qns
        self.tokenizer = tokenizer
        self.targets = df[list(target_columns)].values

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` (a position, not an index label).

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` as long
            tensors and ``targets`` as a float tensor.
        """
        # Positional lookup: a plain `self.text[index]` is label-based and
        # breaks (or returns another row) when the frame's index is not 0..n-1.
        text = self.text.iloc[index]
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [19]:
# Wrap each split so the loaders can pull tokenized examples from it.
train_dataset, valid_dataset = (
    BERTDataset(split, tokenizer, MAX_LEN) for split in (df_train, df_dev)
)

In [20]:
# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
# Validation batches keep the frame's row order.
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [21]:
class BERTClass(torch.nn.Module):
    """Pretrained encoder followed by a linear multi-label classification head.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output logits (one per tag).
    """

    def __init__(self, model_name='roberta-base', num_labels=5):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own config rather than a literal
        # 768, so other checkpoints work without editing the class.
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) logits for a batch."""
        # With return_dict=False the encoder returns a tuple; the second
        # element is used as the per-example feature vector fed to the head.
        _, features = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.fc(features)

# Build the classifier and move its parameters to the selected device.
model = BERTClass().to(device)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 targets."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [23]:
# Use PyTorch's AdamW: the implementation exported by `transformers` is
# deprecated in favour of it. `eps` is passed explicitly to keep the value
# the previous optimizer applied by default.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=1e-6,
)

In [24]:
def train(epoch):
    """Run one pass over ``train_loader`` and report the mean batch loss.

    Args:
        epoch: zero-based epoch number, used only in the progress line.

    Returns:
        Mean loss over the batches processed, or ``nan`` when the loader
        yields no batches.
    """
    model.train()
    running_loss = 0.0
    n_batches = 0
    for batch in train_loader:
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Clear gradients first so nothing left over from an interrupted or
        # earlier run leaks into this step.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    # Guard the average against an empty loader instead of dividing by zero.
    mean_loss = running_loss / n_batches if n_batches else float('nan')
    print(f"Epoch: {epoch+1}, loss: {mean_loss}")
    return mean_loss

In [25]:
# Fine-tune for the configured number of epochs; each call prints its own
# mean training loss.
for epoch in range(EPOCHS):
    train(epoch=epoch)

Epoch: 1, loss: 0.48877241418521916
Epoch: 2, loss: 0.4308403291738246
Epoch: 3, loss: 0.39892372920271135
Epoch: 4, loss: 0.36331973589742134
Epoch: 5, loss: 0.3259565319830171
Epoch: 6, loss: 0.28945760926632885
Epoch: 7, loss: 0.24123303949403666
Epoch: 8, loss: 0.20383029604338324
Epoch: 9, loss: 0.1690533700640555
Epoch: 10, loss: 0.14063167860999765
Epoch: 11, loss: 0.11876495874043559
Epoch: 12, loss: 0.10206878033373377
Epoch: 13, loss: 0.09342234934947169
Epoch: 14, loss: 0.08373702993100338
Epoch: 15, loss: 0.06523432183642815


In [26]:
def validation():
    """Score every batch of ``valid_loader`` with gradients disabled.

    Returns:
        (probabilities, targets): two lists of per-example lists, where the
        probabilities are the sigmoid of the model's outputs.
    """
    model.eval()
    all_targets, all_probs = [], []
    with torch.no_grad():
        for batch in valid_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probs = torch.sigmoid(logits)

            all_targets += targets.cpu().numpy().tolist()
            all_probs += probs.cpu().numpy().tolist()
    return all_probs, all_targets

In [27]:
# Threshold the predicted probabilities at 0.5 and score the dev split.
outputs, targets = validation()
outputs = np.asarray(outputs) >= 0.5

# On multi-label input, accuracy_score counts a row as correct only when
# every label matches.
accuracy = metrics.accuracy_score(y_true=targets, y_pred=outputs)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average=mode) for mode in ('micro', 'macro')
)
print(
    f"Accuracy Score = {accuracy}",
    f"F1 Score (Micro) = {f1_score_micro}",
    f"F1 Score (Macro) = {f1_score_macro}",
    sep="\n",
)

Accuracy Score = 0.345679012345679
F1 Score (Micro) = 0.5789173789173788
F1 Score (Macro) = 0.5588625738909939
