In [1]:
!pip install -q transformers datasets

In [2]:
import os
import re
import string
import json
import numpy as np
import pandas as pd
import random
from sklearn import metrics, model_selection
from bs4 import BeautifulSoup
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [18]:
# Read the raw dataset from a CSV file located next to the notebook.
df = pd.read_csv("./CPTALGOML.csv")

In [19]:
# Model input text = problem statement and its I/O description joined by a
# newline; only that text and the raw tag string are kept.
df = df.assign(
    qns=lambda frame: frame["question_text"] + "\n" + frame["input_outputs"]
)[["qns", "algo_tags"]]
df.head()

Unnamed: 0,qns,algo_tags
0,This is the hard version of the problem. The o...,['math']
1,This is the simple version of the problem. The...,"['greedy', 'math']"
2,You are given an array a consisting of n zeros...,"['brute force', 'search']"
3,Timofey has an apple tree growing in his garde...,['math']
4,Vanya really likes math. One day when he was s...,['math']


In [22]:
# Load the tag-name -> class-index mapping. The encoding is given explicitly
# because open() otherwise falls back to the platform's locale encoding.
with open("./algo_classes.json", encoding="utf-8") as f:
  algo_mapping = json.load(f)

algo_mapping

{'brute force': 0,
 'dynamic programming': 1,
 'greedy': 2,
 'math': 3,
 'search': 4}

In [28]:
# Work on an explicit copy so that adding columns never writes into a frame
# that was produced by a column selection.
df = df.copy()

# One 0/1 indicator column per known tag. Columns are created in class-index
# order and written by name, so nothing depends on a positional offset.
# `algo_tags` holds a stringified list such as "['greedy', 'math']"; splitting
# on the quote character yields the bare tag names. str() keeps a missing value
# from raising (it simply matches no tag).
for tag in sorted(algo_mapping, key=algo_mapping.get):
  df[tag] = df["algo_tags"].map(
      lambda raw, tag=tag: int(tag in str(raw).split("'"))
  ).astype(np.int16)

In [29]:
# Collect the indicator columns (the len(algo_mapping) columns that follow
# `qns` and `algo_tags`) into one label vector per row. The slice is bounded so
# that re-running this cell does not pull the existing `list` column into the
# label vectors.
df['list'] = df.iloc[:, 2:2 + len(algo_mapping)].values.tolist()

In [30]:
# Keep only what the model consumes: the input text and its label vector.
new_df = df.loc[:, ["qns", "list"]]
new_df.head()

Unnamed: 0,qns,list
0,This is the hard version of the problem. The o...,"[0, 0, 0, 1, 0]"
1,This is the simple version of the problem. The...,"[0, 0, 1, 1, 0]"
2,You are given an array a consisting of n zeros...,"[1, 0, 0, 0, 1]"
3,Timofey has an apple tree growing in his garde...,"[0, 0, 0, 1, 0]"
4,Vanya really likes math. One day when he was s...,"[0, 0, 0, 1, 0]"


In [31]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [32]:
# Hyperparameters used by the dataset, the loaders and the training loop.
MAX_LEN = 500            # max_length handed to the tokenizer for every example
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 1e-5

# Seed every RNG the notebook touches so that batch shuffling, dropout and the
# random initialisation of the classification head repeat across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [33]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one text row per item.

    Args:
        df: DataFrame with a ``qns`` column (input text) and a ``list`` column
            (one multi-hot label vector per row).
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_token_type_ids`` and returning a mapping
            with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every example is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df["qns"]
        self.tokenizer = tokenizer
        self.targets = df["list"]

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Positional lookup: a DataLoader hands out 0..len-1, which only equals
        # the frame's index labels if the caller happened to reset the index.
        qn = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        qn = " ".join(qn.split())

        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length` flag and the implicit truncation fallback.
        inputs = self.tokenizer(
            qn,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [34]:
# Sample 90% of the rows for training (fixed random_state for a repeatable
# split); the rows that were not sampled form the test set. Both frames get a
# fresh 0..n-1 index.
train_size = 0.9
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap each frame so that rows are tokenized on access.
training_set, testing_set = (
    BERTDataset(frame, tokenizer, MAX_LEN)
    for frame in (train_dataset, test_dataset)
)

FULL Dataset: (6477, 2)
TRAIN Dataset: (5829, 2)
TEST Dataset: (648, 2)


In [35]:
# Training batches are reshuffled on every pass; test batches keep row order.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [43]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Args:
        n_classes: Number of output logits (one per label). Defaults to 5.
        dropout: Dropout probability applied before the linear head.
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, n_classes=5, dropout=0.3, model_name='bert-base-uncased'):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(model_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Take the input width from the encoder's config instead of assuming 768,
        # so a checkpoint with a different hidden size still fits.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, n_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one row per input sequence."""
        # With return_dict=False the encoder returns a tuple; the second element
        # (the pooled output per the Transformers docs) feeds the classifier.
        _, pooled = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l3(self.l2(pooled))

# Build the classifier and move its parameters to the selected device
# (Module.to returns the module itself, so the chained form is equivalent).
model = BERTClass().to(device)

In [44]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 float targets."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [45]:
# Adam over every parameter of the model (encoder and head alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [46]:
def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        epoch: Zero-based epoch number; only used in the printed message.

    Returns:
        The mean per-batch loss of this pass, as a float.
    """
    model.train()
    running_loss = 0.0
    for data in training_loader:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        running_loss += loss.item()

        # Clear stale gradients once, immediately before the backward pass.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    mean_loss = running_loss / len(training_loader)
    print(f"epoch {epoch+1} loss {mean_loss:.6f}")
    return mean_loss

In [47]:
# Run EPOCHS full passes over the training loader; each call prints that
# pass's mean batch loss.
for epoch in range(EPOCHS):
    train(epoch)

epoch 1 loss 0.490101
epoch 2 loss 0.429283
epoch 3 loss 0.390183
epoch 4 loss 0.346539
epoch 5 loss 0.295109


In [48]:
def validation(epoch):
    """Score every batch of ``testing_loader`` with the notebook-level ``model``.

    Args:
        epoch: Accepted for symmetry with ``train``; not used.

    Returns:
        ``(probabilities, targets)``: two lists of per-example lists, the first
        holding sigmoid-activated model outputs, the second the label vectors.
    """
    model.eval()
    probabilities, labels = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)

            labels += targets.cpu().tolist()
            probabilities += torch.sigmoid(logits).cpu().tolist()
    return probabilities, labels

In [49]:
# Score the held-out set, then turn probabilities into hard 0/1 decisions at 0.5.
outputs, targets = validation(1)
outputs = np.asarray(outputs) >= 0.5

# accuracy_score on multi-label indicator input only counts a row as correct
# when every label of that row matches.
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average=averaging)
    for averaging in ('micro', 'macro')
)

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.36419753086419754
F1 Score (Micro) = 0.5619295958279009
F1 Score (Macro) = 0.48571827046251803
