In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from tqdm import trange
from sklearn.model_selection import train_test_split
from torch.utils.data import dataloader

In [2]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [4]:
# Read the raw problem/tag CSV (path is relative to the notebook's working
# directory) into the frame every later cell starts from.
df = pd.read_csv(filepath_or_buffer="../data/atcoder_problem_tag_dataset.csv")

In [5]:
# Keep only rows that have a problem statement. drop=True discards the old
# index instead of inserting it as an extra "index" column; without it every
# re-run of this cell adds another column ("index", then "level_0") and the
# third run raises ValueError.
df = df[df["problem_texts"].notna()].reset_index(drop=True)

# Build the model input from the four text fields. Missing fields become empty
# strings (with plain `+`, a single NaN turns the whole row into NaN), and the
# pieces are joined with a space so the last word of one field is not glued to
# the first word of the next.
text_columns = ["problem_texts", "constraints", "input_texts", "output_texts"]
text_parts = [df[col].fillna("").astype(str) for col in text_columns]
df["concatenate_texts"] = text_parts[0].str.cat(text_parts[1:], sep=" ")

# Replace the tag values with integer class ids. `le` stays in scope so the
# ids can be mapped back to the original tags with le.inverse_transform.
le = LabelEncoder()
df["tag"] = le.fit_transform(df["tag"])

In [6]:
# 80/20 split with a fixed seed: sample the training rows, then take every
# row whose index label was not sampled as the evaluation set. Both frames
# get a fresh 0..n-1 index (the previous index is kept as a column).
df_copy = df.copy()
df_train = df_copy.sample(frac=0.8, random_state=0)
df_eval = df_copy[~df_copy.index.isin(df_train.index)].reset_index()
df_train = df_train.reset_index()

In [7]:
# Pretrained cased BERT with a sequence-classification head on top.
# To try ALBERT instead, use the 'albert-base-v2' checkpoint for both calls
# and load the model with AlbertForSequenceClassification.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    return_dict=True,
    # One logit per class known to the label encoder, so the head size always
    # matches the encoded "tag" column instead of relying on a hard-coded count.
    num_labels=len(le.classes_),
)
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [8]:
class DataSet:
    """Indexable collection of pre-tokenized texts and their labels.

    All texts are tokenized once in the constructor; items are then served by
    integer position, which is the protocol a PyTorch DataLoader needs
    (``__len__`` plus ``__getitem__``).
    """

    def __init__(self,df,tokenizer,input_col_name,output_col_name):
        """Tokenize ``df[input_col_name]`` and store ``df[output_col_name]`` as labels.

        Args:
            df: DataFrame holding the text column and the label column.
            tokenizer: object exposing ``batch_encode_plus`` (a Hugging Face
                tokenizer in this notebook).
            input_col_name: name of the column with the texts to tokenize.
            output_col_name: name of the column with the labels.
        """
        # Use the caller-supplied column rather than a hard-coded name.
        # Every text is truncated/padded to exactly 512 tokens so the encoded
        # tensors can be indexed row by row.
        self.X = tokenizer.batch_encode_plus(
            df[input_col_name].tolist(),
            truncation=True,
            max_length=512,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='pt',
        )
        # Going through a plain list makes the conversion independent of the
        # frame's index labels.
        self.y = torch.tensor(df[output_col_name].tolist())

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.y)

    def __getitem__(self,index):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` for one example.

        Each tensor is moved to ``device``, a module-level name that must be
        defined before items are requested.
        """
        return (
            self.X["input_ids"][index].to(device),
            self.X["token_type_ids"][index].to(device),
            self.X["attention_mask"][index].to(device),
            self.y[index].to(device),
        )

In [9]:
# Tokenize each split once. The evaluation dataset is built from the held-out
# rows (df_eval); building it from df_train would make the reported eval loss
# just a second measurement of the training loss.
dataset_train = DataSet(df_train,tokenizer,"concatenate_texts","tag")
dataset_eval = DataSet(df_eval,tokenizer,"concatenate_texts","tag")

# Only the training batches are shuffled; evaluation order does not affect the
# summed loss, so it is kept deterministic.
trainset = dataloader.DataLoader(dataset = dataset_train, shuffle=True, batch_size = 8)
evalset = dataloader.DataLoader(dataset = dataset_eval, shuffle=False, batch_size = 8)



In [None]:
# Fine-tune for a fixed number of epochs, reporting the mean batch loss on the
# training loader and on the evaluation loader after every epoch.
num_epochs = 50
for epoch in trange(1, num_epochs + 1):
    # ---- training pass ----
    model.train()
    total_loss_train = 0.0
    for batch in trainset:
        input_ids, input_token_type_ids, input_attention_mask, labels = batch
        # Gradients are cleared once per step, right before the forward pass.
        optimizer.zero_grad()
        output = model(
            input_ids,
            token_type_ids=input_token_type_ids,
            attention_mask=input_attention_mask,
            labels=labels,
        )
        loss = output[0]
        loss.backward()
        optimizer.step()
        # Accumulate a plain float: adding the loss tensor itself would keep
        # autograd history alive for every batch of the epoch.
        total_loss_train += loss.item()
    print("epoch",epoch)
    # max(..., 1) keeps an empty loader from raising instead of reporting 0.
    print("    Avg train loss per sample:",total_loss_train/max(len(trainset), 1))

    # ---- evaluation pass (no gradient tracking, dropout disabled) ----
    model.eval()
    total_loss_eval = 0.0
    with torch.no_grad():
        for batch in evalset:
            input_ids, input_token_type_ids, input_attention_mask, labels = batch
            output_eval = model(
                input_ids,
                token_type_ids=input_token_type_ids,
                attention_mask=input_attention_mask,
                labels=labels,
            )
            total_loss_eval += output_eval[0].item()
    print("    Avg eval loss per sample:",total_loss_eval/max(len(evalset), 1))

epoch 1
    Avg train loss per sample: 0.01605034161763019
epoch 1
    Avg train loss per sample: 0.03324786438999406
epoch 1
    Avg train loss per sample: 0.04970192621989423
epoch 1
    Avg train loss per sample: 0.06664139391428017
epoch 1
    Avg train loss per sample: 0.08219859686242528
epoch 1
    Avg train loss per sample: 0.09840567715196724
epoch 1
    Avg train loss per sample: 0.11551318111189876
epoch 1
    Avg train loss per sample: 0.1312180714434888
epoch 1
    Avg train loss per sample: 0.14698680050401802
epoch 1
    Avg train loss per sample: 0.1622063165687653
epoch 1
    Avg train loss per sample: 0.18003878535994564
epoch 1
    Avg train loss per sample: 0.19517783659050264
epoch 1
    Avg train loss per sample: 0.2114365818988846
epoch 1
    Avg train loss per sample: 0.22818046018301721
epoch 1
    Avg train loss per sample: 0.2439439153096762
epoch 1
    Avg train loss per sample: 0.25854816206966535
epoch 1
    Avg train loss per sample: 0.2738305287188794
ep

In [None]:
# Predict a class id for every example of the held-out loader.
# NOTE(review): no `testset` is defined in the visible notebook cells, so this
# iterates `evalset`; swap in a dedicated test loader if one exists.
model.eval()
pred_labels = []  # one tensor of predicted class ids per batch
for batch in evalset:
    input_ids, input_token_type_ids, input_attention_mask ,labels= batch 
    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        output = model(input_ids,token_type_ids = input_token_type_ids,attention_mask =input_attention_mask, labels=labels)
    # With labels supplied, output[1] holds the logits, one row per example and
    # one column per class. The predicted class is the arg-max along the class
    # dimension (dim=1); reducing over dim 0 would pick an example per class.
    pred_label = torch.argmax(output[1], dim=1)
    pred_labels.append(pred_label.cpu())