In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from tqdm import tqdm
import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (Dataset,
                              DataLoader,
                              RandomSampler,
                              SequentialSampler,
                              TensorDataset)
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

from transformers import AdamW
from transformers import (get_scheduler,
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_linear_schedule_with_warmup)
from torch.optim.lr_scheduler import ReduceLROnPlateau, _LRScheduler
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import (accuracy_score,
                             precision_recall_curve,
                             f1_score,
                             auc)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.optimizer import Optimizer, required
import math

from datasets import load_metric, load_dataset, Dataset, concatenate_datasets


In [2]:
from transformers import (AutoConfig,
                          AutoTokenizer,
                          RobertaForSequenceClassification,
                          Trainer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback)

In [3]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

seed_everything(42)

In [4]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    label_indices = list(range(3))
    f1 = f1_score(labels, preds, average="micro", labels=label_indices) * 100.0
    return {'micro f1 score': f1}

In [6]:
# k-fold를 위해 나누어져있는 dataset을 다시 합쳤습니다.
# 기존 코드를 사용하여 데이터셋 로드 및 결합
train_dset = load_dataset("csv", data_files="../data/train_data_lv1.csv")['train']
validation_dset = load_dataset("csv", data_files="../data/valid_data_lv1.csv")['train']
rawdataset = concatenate_datasets([train_dset, validation_dset])

# 전체 데이터셋의 10%만을 유지하기 위해 train_test_split 사용
rawdataset = rawdataset.train_test_split(test_size=0.1, seed=42)['test']
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'

# Tokenize
def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=512, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs
dset = rawdataset.map(example_fn, remove_columns=train_dset.column_names)

Map:   0%|          | 0/9013 [00:00<?, ? examples/s]

In [7]:
def train(args):

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    config =  AutoConfig.from_pretrained("microsoft/graphcodebert-base")
    config.num_labels = 2


    gap = int(len(dset) / args.k_fold)

    for i in range(args.k_fold):

        model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", config=config).to(device)

        print('\n%dth Training' %(i+1))

        output_dir = args.output_dir + '_' + str(i+1)
        logging_dir = args.logging_dir + '_' + str(i+1)

        # trainingset, validset 구성
        total_size = len(dset)
        total_ids = list(range(total_size))
        del_ids = list(range(i*gap, (i+1)*gap))
        training_ids = set(total_ids) - set(del_ids)

        training_dset = dset.select(list(training_ids))
        eval_dset = dset.select(del_ids)

        # Training Arguments -> Graphcodebert 깃허브를 참고하여 설정했습니다.
        args.max_steps=args.epochs*len(dset)
        args.save_steps=len(dset)//10
        args.warmup_steps = args.max_steps//5


        training_args = TrainingArguments(
            output_dir=args.output_dir,                         # output directory
            overwrite_output_dir=True,                          # overwrite output directory
            save_total_limit=5,                                 # number of total save model.
            save_steps=args.save_steps,                         # model saving step.
            num_train_epochs=args.epochs,                       # total number of training epochs
            learning_rate=args.lr,                              # learning_rate
            per_device_train_batch_size=args.train_batch_size,  # batch size per device during training
            per_device_eval_batch_size=args.eval_batch_size,    # batch size for evaluation
            warmup_steps=args.warmup_steps,                     # number of warmup steps for learning rate scheduler
            weight_decay=args.weight_decay,                     # strength of weight decay
            logging_dir=args.logging_dir,                       # directory for storing logs
            logging_steps=args.logging_steps,                   # log saving step.
            evaluation_strategy=args.evaluation_strategy,       # evaluation strategy to adopt during training
            eval_steps=args.eval_steps,                         # evaluation step.
            load_best_model_at_end = True, # for earlystopping
            save_strategy = 'steps', # for earlystopping
            logging_strategy = 'steps', # for earlystopping
            gradient_accumulation_steps=args.gradient_accumulation_steps,
        )

        collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=512)

        trainer = Trainer(
            model=model,                         # the instantiated Transformers model to be trained
            args=training_args,                  # training arguments, defined above
            train_dataset=training_dset,            # training dataset
            eval_dataset=eval_dset,        # evaluation dataset
            data_collator=collator,              # collator
            compute_metrics=compute_metrics,      # define metrics function -> micro f1
            callbacks = [EarlyStoppingCallback(early_stopping_patience=10)],
        )

        # -- Training
        print('Training Strats')
        trainer.train()


In [10]:
import easydict
args = easydict.EasyDict({
    'output_dir': './DACON',
    'logging_dir': './DACON',
    'lr': 2e-5,
    'epochs': 3,
    'train_batch_size': 4,
    'weight_decay': 0.0,
    'warmup_steps': 0,
    'gradient_accumulation_steps':2,
    'eval_batch_size': 8,
    'k_fold':5,
    'evaluation_strategy': 'steps',
    'save_steps': 500,
    'logging_steps': 1000,
    'eval_steps':901,
    'max_steps':-1
})

In [11]:
import gc
gc.collect()
torch.cuda.empty_cache()

train(args)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



1th Training
Training Strats


Step,Training Loss,Validation Loss


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



2th Training
Training Strats


Step,Training Loss,Validation Loss


Checkpoint destination directory ./DACON\checkpoint-901 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-1802 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-2703 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



3th Training
Training Strats


Step,Training Loss,Validation Loss


Checkpoint destination directory ./DACON\checkpoint-901 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-1802 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-2703 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



4th Training
Training Strats


Step,Training Loss,Validation Loss


Checkpoint destination directory ./DACON\checkpoint-901 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-1802 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-2703 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



5th Training
Training Strats


Step,Training Loss,Validation Loss


Checkpoint destination directory ./DACON\checkpoint-901 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-1802 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./DACON\checkpoint-2703 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [75]:
def preprocess_script(code):
    # code1과 code2에 대한 처리를 함수로 간소화
    def preprocess_code(source_code):
        new_code = deque()
        block_comment = False  # 블록 주석 처리를 위한 플래그
        for line in source_code.split('\n'):
            # 블록 주석 시작
            if '/*' in line:
                block_comment = True
                line = line[:line.index('/*')]
            # 블록 주석 끝
            if '*/' in line:
                block_comment = False
                line = line[line.index('*/')+2:]
            if block_comment or line.strip().startswith('//'):
                continue  # 블록 주석 중이거나 한 줄 주석이면 건너뜀

            line = line.rstrip()
            # 한 줄 주석 처리
            if '//' in line:
                line = line[:line.index('//')]
            line = line.replace('\n', '')  # 개행 문자 삭제
            line = line.replace('    ', '\t')  # 공백 4칸을 탭으로 변환

            if line == '':  # 전처리 후 빈 라인은 건너뜀
                continue

            new_code.append(line)
 
    return code


def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=512, truncation=True)
    return outputs

In [76]:
testdataset = load_dataset("csv", data_files='../data/test.csv')['train']

In [77]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'

In [94]:
from collections import deque
import re
preprocessed = testdataset.map(preprocess_script)
test_dataset = preprocessed.map(example_fn, remove_columns=['code1', 'code2','pair_id'])
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/595000 [00:00<?, ? examples/s]

In [95]:
testloader = DataLoader(test_dataset,
                        batch_size=16,
                        shuffle=False,
                        collate_fn = collator
                        )

In [116]:
first_batch = next(iter(testloader))

In [120]:
first_batch['input_ids'].shape

torch.Size([16, 512])

In [28]:
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
load_path = f'./DACON/checkpoint-2703/optimizer.pt'
# model.load_state_dict(torch.load(load_path,map_location=torch.device('cpu')))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [97]:
model.eval()
progress_bar = tqdm(enumerate(testloader), total=len(testloader), leave=True, position=0,)
for i, data in progress_bar:
    with torch.no_grad():
        logits = model(
            data['input_ids'].to(device),
            data['attention_mask'].to(device),
        )
        logits=logits.logits
    if i==0:
        one_fold_logits = logits
    else:
        one_fold_logits = torch.cat([one_fold_logits,logits],dim=0)

# torch tensor를 저장하기 위한 numpy 변환
one_fold_logits = one_fold_logits.squeeze(0).detach().cpu().numpy()

  0%|          | 0/37188 [00:00<?, ?it/s]

In [104]:
a = np.argmax(one_fold_logits,axis=1)

In [112]:
one_fold_logits[0:100]

array([[-0.08194216,  0.08257993],
       [-0.04088554, -0.00806557],
       [-0.08088842,  0.03998123],
       [-0.09444278,  0.0371286 ],
       [-0.05098515,  0.02198036],
       [-0.04448672,  0.05920207],
       [-0.10218832,  0.00994879],
       [-0.06491666, -0.01072896],
       [-0.02801997,  0.05661651],
       [-0.10346632,  0.05109083],
       [-0.04204335, -0.02677844],
       [-0.05192484,  0.01970526],
       [-0.02836405,  0.06451653],
       [-0.12566161,  0.07503513],
       [-0.09584039,  0.01191681],
       [-0.05152438, -0.03284322],
       [-0.05662401,  0.04437123],
       [-0.0765712 ,  0.10035465],
       [-0.07291934,  0.08909668],
       [-0.05363311, -0.0034636 ],
       [-0.07749471,  0.00828263],
       [-0.06051099,  0.01428877],
       [-0.04158355,  0.05776763],
       [-0.09222958,  0.03402672],
       [-0.0589044 ,  0.06194304],
       [-0.03134386,  0.13994384],
       [-0.06475163,  0.04282352],
       [-0.05242182,  0.03854364],
       [-0.08807448,

In [123]:
submission = pd.read_csv('../data/sample_submission.csv')

In [124]:
submission['similar'] = a

In [125]:
submission

Unnamed: 0,pair_id,similar
0,TEST_000000,1
1,TEST_000001,0
2,TEST_000002,1
3,TEST_000003,1
4,TEST_000004,1
...,...,...
594995,TEST_594995,1
594996,TEST_594996,1
594997,TEST_594997,1
594998,TEST_594998,1


In [126]:
submission.to_csv('../submission/codegraph_test.csv', index=False)