In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from tqdm import trange,tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import dataloader

In [2]:
# Prefer the first CUDA GPU when one is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [3]:
# Load the raw AtCoder problem/tag table (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="../data/atcoder_problem_tag_dataset.csv")

In [4]:
# Keep only rows that actually have a problem statement.
# drop=True discards the old index instead of inserting it as a column, so
# re-running this cell no longer piles up "index"/"level_0" columns (a third
# run of the original would raise because "level_0" already exists).
df = df[df["problem_texts"].notna()].reset_index(drop=True)

# Model input: the four text sections joined in order, with no separator.
# Missing optional sections are treated as empty strings; without fillna a
# single NaN would turn the whole concatenated text into NaN.
df["concatenate_texts"] = (
    df["problem_texts"]
    + df["constraints"].fillna("")
    + df["input_texts"].fillna("")
    + df["output_texts"].fillna("")
)

# Replace the tag column with integer class ids.
# NOTE(review): these ids must line up with the ids used when the checkpoint
# was fine-tuned -- confirm the training notebook fits the encoder the same way.
le = LabelEncoder()
df["tag"] = le.fit_transform(df["tag"])

In [5]:
# Reproducible 80/20 split: sample the training rows with a fixed seed and
# use every row that was not sampled as the evaluation set.
df_copy = df.copy()
_train_rows = df_copy.sample(frac=0.8, random_state=0)
df_eval = df_copy.drop(index=_train_rows.index).reset_index()
df_train = _train_rows.reset_index()

In [6]:
# Tokenizer comes from the public ALBERT base checkpoint; the classifier
# weights come from the local fine-tuned directory.
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')

# num_labels has to match the classification head stored in the checkpoint.
# NOTE(review): presumably 14 == number of distinct tags -- confirm.
model = AlbertForSequenceClassification.from_pretrained(
    '../models/albert_atcoder/',
    return_dict=True,
    num_labels=14,
)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

In [7]:
class DataSet:
    """Pre-tokenized text-classification dataset usable with a DataLoader.

    All texts are tokenized once in ``__init__`` (truncated and padded to 512
    tokens); ``__getitem__`` then only slices the stored tensors.

    Args:
        df: DataFrame holding the text and label columns.
        tokenizer: Object exposing ``batch_encode_plus`` (a Hugging Face
            tokenizer in this notebook).
        input_col_name: Name of the column containing the input texts.
        output_col_name: Name of the column containing integer class ids.
    """

    def __init__(self,df,tokenizer,input_col_name,output_col_name):
        # Honour the requested input column instead of a hard-coded name.
        texts = df[input_col_name].tolist()
        self.X = tokenizer.batch_encode_plus(
            texts,
            truncation=True,
            max_length=512,
            # `padding='max_length'` replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            add_special_tokens=True,
            return_tensors='pt',
        )
        # Build the label tensor from the underlying array so the result does
        # not depend on the DataFrame's index labels.
        self.y = torch.tensor(df[output_col_name].to_numpy())

    def __len__(self):
        """Return the number of examples."""
        return len(self.y)

    def __getitem__(self,index):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` for one example.

        Each tensor is moved to the module-level ``device`` before being returned.
        """
        encoded = self.X
        return (
            encoded["input_ids"][index].to(device),
            encoded["token_type_ids"][index].to(device),
            encoded["attention_mask"][index].to(device),
            self.y[index].to(device),
        )

In [8]:
# Tokenize the held-out split once, then serve it in its original order
# in batches of 20.
dataset_eval = DataSet(df_eval, tokenizer, "concatenate_texts", "tag")
evalset = dataloader.DataLoader(
    dataset=dataset_eval,
    batch_size=20,
    shuffle=False,
)



In [9]:
# Collect the predicted and true class ids for every evaluation example.
preds = []
trues = []
model.eval()  # inference mode for layers such as dropout

# Inference only: disable autograd once for the whole loop.
with torch.no_grad():
    for batch in tqdm(evalset):
        input_ids, input_token_type_ids, input_attention_mask, labels = batch
        # Labels are not passed to the model: the loss is never used here,
        # and reading `.logits` by name does not depend on the position the
        # logits occupy in the output (which shifts when a loss is present).
        output_eval = model(
            input_ids,
            token_type_ids=input_token_type_ids,
            attention_mask=input_attention_mask,
        )
        preds += torch.argmax(output_eval.logits, dim=1).tolist()
        trues += labels.tolist()

100%|██████████| 17/17 [06:31<00:00, 23.03s/it]


In [10]:
# Accuracy: number of positions where the prediction equals the true label,
# divided by the number of predictions.
cum = sum(pred == true for pred, true in zip(preds, trues))
print(cum/len(preds))

0.33636363636363636
