In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
import transformers
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from setproctitle import setproctitle
from sklearn.model_selection import train_test_split
from itertools import combinations
from rank_bm25 import BM25L

import torch
import torch.nn as nn
import random
import time
import datetime
import numpy as np
import pandas as pd
import os, re
import argparse

In [2]:


''' 아무 내용이 없는 줄은 제거합니다. '''
def get_rid_of_empty(c):
    ret = []
    splitted = c.split('\n')
    for s in splitted:
        if len(s.strip()) > 0:
            ret.append(s)
    return '\n'.join(ret)


def clean_data(script, data_type="dir"):
    preproc_lines = []

    if data_type == "dir":
        with open(script, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    else:
        lines = script.split('\n')

    block_comment = False  # 블록 주석이 시작되었는지 여부를 추적
    for line in lines:
        line = line.rstrip()  # 오른쪽 공백 제거

        # 블록 주석 시작 처리
        if '/*' in line:
            block_comment = True
            line = line.split('/*', 1)[0]  # '/*' 이전 내용만 유지

        # 블록 주석 내용은 모두 무시
        if block_comment:
            if '*/' in line:  # 블록 주석 종료 확인
                line = line.split('*/', 1)[1]  # '*/' 이후 내용만 유지
                block_comment = False
            else:
                continue

        # 한 줄 주석 처리
        if '//' in line:
            line = line.split('//', 1)[0]

        # 불필요한 공백과 탭 정리
        line = line.replace('\n', '')
        line = line.replace('    ', '\t')  # 4개의 공백을 탭으로 변환

        if line == '':  # 빈 라인 무시
            continue

        preproc_lines.append(line)

    preprocessed_script = '\n'.join(preproc_lines)

    return preprocessed_script



''' positive, negative 페어 생성 함수 '''
def get_pairs(input_df, tokenizer):
    codes = input_df['code'].to_list()
    problems = input_df['problem_num'].unique().tolist()
    problems.sort()

    tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
    bm25 = BM25L(tokenized_corpus)

    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(problems):
        solution_codes = input_df[input_df['problem_num'] == problem]['code']
        positive_pairs = list(combinations(solution_codes.to_list(),2))

        solution_codes_indices = solution_codes.index.to_list()
        negative_pairs = []

        first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
        negative_code_scores = bm25.get_scores(first_tokenized_code)
        negative_code_ranking = negative_code_scores.argsort()[::-1] # 내림차순
        ranking_idx = 0

        for solution_code in solution_codes:
            negative_solutions = []
            while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
                high_score_idx = negative_code_ranking[ranking_idx]

                if high_score_idx not in solution_codes_indices:
                    negative_solutions.append(input_df['code'].iloc[high_score_idx])
                ranking_idx += 1

            for negative_solution in negative_solutions:
                negative_pairs.append((solution_code, negative_solution))

        total_positive_pairs.extend(positive_pairs)
        total_negative_pairs.extend(negative_pairs)

    pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
    pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

    neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
    neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

    pos_label = [1]*len(pos_code1)
    neg_label = [0]*len(neg_code1)

    pos_code1.extend(neg_code1)
    total_code1 = pos_code1
    pos_code2.extend(neg_code2)
    total_code2 = pos_code2
    pos_label.extend(neg_label)
    total_label = pos_label
    pair_data = pd.DataFrame(data={
        'code1':total_code1,
        'code2':total_code2,
        'similar':total_label
    })
    pair_data = pair_data.sample(frac=1).reset_index(drop=True)
    return pair_data

def data_preprocess(args):
    """ Data preprocess
    train, valid, test에 대한 전처리를 수행하고, 이 과정에서 결과적으로 아래의 파일명이 생성됩니다."""

    """
    dacon_train_data = pd.read_csv("./data/" + "new_dataset_0607/graph_dacon_train_bm25L.csv")
    dacon_valid_data = pd.read_csv("./data/" + "new_dataset_0607/graph_dacon_valid_bm25L.csv")
    codenet_train_data = pd.read_csv("./data/" + "new_dataset_0607/graph_codenet_train_bm25L.csv")
    codenet_valid_data = pd.read_csv("./data/" + "new_dataset_0607/graph_codenet_valid_bm25L.csv")
    test_data = pd.read_csv("./data/new_dataset_0604/processed_test.csv")
    """

    # 데이콘이 제공해준 학습 코드 데이터 데이터프레임 만들기
    code_folder = "../data/train_code"  # 데이콘이 제공해준 학습 데이터 파일의 경로
    problem_folders = os.listdir(code_folder)
    preproc_scripts = []
    problem_nums = []

    for problem_folder in tqdm(problem_folders):
        scripts = os.listdir(os.path.join(code_folder, problem_folder))
        problem_num = scripts[0].split('_')[0]
        for script in scripts:
            script_file = os.path.join(code_folder, problem_folder, script)
            preprocessed_script = clean_data(script_file, data_type="dir")
            preproc_scripts.append(preprocessed_script)
        problem_nums.extend([problem_num] * len(scripts))
    train_df = pd.DataFrame(data={'code': preproc_scripts, 'problem_num': problem_nums})

    # 데이콘이 제공해준 테스트 코드 데이터 데이터프레임 만들기
    test_df = pd.read_csv("../data/test.csv")
    code1 = test_df['code1'].values
    code2 = test_df['code2'].values
    processed_code1 = []
    processed_code2 = []
    for i in tqdm(range(len(code1))):
        processed_c1 = clean_data(code1[i], data_type="file")
        processed_c2 = clean_data(code2[i], data_type="file")
        processed_code1.append(processed_c1)
        processed_code2.append(processed_c2)
    processed_test = pd.DataFrame(list(zip(processed_code1, processed_code2)), columns=["code1", "code2"])

    # IBM의 CodeNet으로 추가 코드 학습/검증 데이터 데이터프레임 만들기
    code_folder = "Project_CodeNet_Python800"  # CodeNet 데이터 경로
    problem_folders = os.listdir(code_folder)
    preproc_scripts = []
    problem_nums = []
    for problem_folder in tqdm(problem_folders):
        scripts = os.listdir(os.path.join(code_folder, problem_folder))
        problem_num = int(problem_folder.split('p')[1])
        problem_num = 'problem' + str(problem_num)
        for script in scripts:
            script_file = os.path.join(code_folder, problem_folder, script)
            preprocessed_script = clean_data(script_file)
            preproc_scripts.append(preprocessed_script)
        problem_nums.extend([problem_num] * len(scripts))
    codenet_df = pd.DataFrame(data={'code': preproc_scripts, 'problem_num': problem_nums})


    """ 추가 codenet 데이터에서 테스트셋과 겹치는 데이터가 있다는걸 관찰하였고, 이를 위해 3단계에 걸쳐 필터링 작업을 진행합니다.

    [1차 필터링] : codenet_df에서 test_df의 데이터와 겹치는 녀석들을 set (hash table) 으로 대부분 필터링해줍니다.
    [2차 필터링] : 1차 필터링 과정에서 trailing space 등의 이유로 set 방법으로 완전하게 걸러지지 않은 데이터를 걸러주는 것이 목적입니다. 이를 위해 코드 문자열에 존재하는 newline들을 전부 이어붙이고, 앞 뒤로 존재하는 공백 및 newline을 제거합니다. 이렇게 전처리된 test code 와 codenet code 문자열들을 각각 set에 넣어 intersection 함으로써 한번 더 걸러주는 작업을 수행합니다.
    [3차 필터링] : 2차 필터링 과정 이후 test set에 포함된 데이터가 거의 다 제거되었습니다. 그러나 완벽히 제거하기 위해 exhaustive search를 통해 최종적으로 남아있는 test셋의 흔적들을 제거합니다.
    """

    # [1차 필터링]
    dacon_codes = np.concatenate([train_df['code'].values,
                                  test_df['code1'].values,
                                  test_df['code2'].values]
                                 )
    dacon_codes_set = set()

    for i in tqdm(range(len(dacon_codes))):
        dacon_codes_set.add(dacon_codes[i])
    usable_codes = []
    usable_problem_nums = []
    codenet_codes = codenet_df['code'].values
    problem_nums = codenet_df['problem_num'].values

    for i in tqdm(range(len(codenet_codes))):
        if codenet_codes[i] not in dacon_codes_set:
            usable_codes.append(codenet_codes[i])
            usable_problem_nums.append(problem_nums[i])
    filtered_codenet_df = pd.DataFrame(data={'code': usable_codes,
                                             'problem_num': usable_problem_nums})

    # 리소스 문제로, 완성된 filtered_codenet_df 중 50%의 데이터만을 이용해서 학습에 사용합니다.
    filtered_codenet_df = filtered_codenet_df.sample(frac=0.5, random_state=42)


    # [2차 필터링]
    def simplify(x):
        return ''.join(x.split('\n')).rstrip(' ').strip()

    codenet_codes = filtered_codenet_df['code'].values
    codenet_problem_nums = filtered_codenet_df['problem_num'].values
    test_codes1 = test_df['code1'].values
    test_codes2 = test_df['code2'].values

    test_codes = np.concatenate([test_codes1, test_codes2])

    codenet_set = set()

    for i in tqdm(range(len(codenet_codes))):
        codenet_set.add(simplify(codenet_codes[i]))
    test_set = set()
    for i in tqdm(range(len(test_codes))):
        test_set.add(simplify(test_codes[i]))
    intersection = codenet_set.intersection(test_set)
    usable_codenet_filtered, usable_codenet_filtered_problems = [], []
    for i in tqdm(range(len(codenet_codes))):
        if simplify(codenet_codes[i]) not in intersection:
            usable_codenet_filtered.append(codenet_codes[i])
            usable_codenet_filtered_problems.append(codenet_problem_nums[i])
    filtered_codenet_df = pd.DataFrame(data={'code': usable_codenet_filtered,
                                             'problem_num': usable_codenet_filtered_problems})


    # [3차 필터링]
    codenet_codes = filtered_codenet_df['code'].values
    problem_nums = filtered_codenet_df['problem_num'].values
    usable_codenet_filtered, usable_codenet_filtered_problems = [], []
    for i in tqdm(range(len(codenet_codes)), position=0, leave=True):
        usable = True
        if codenet_codes[i] in test_set:
            continue
        else:
            for s in test_set:
                if len(s) > 0 and len(codenet_codes[i]) > 0 and ((codenet_codes[i] in s) or (s in codenet_codes[i])):
                    usable = False
                    break
        if usable == True:
            usable_codenet_filtered.append(codenet_codes[i])
            usable_codenet_filtered_problems.append(problem_nums[i])

    filtered_codenet_df = pd.DataFrame(data={'code': usable_codenet_filtered,
                                             'problem_num': usable_codenet_filtered_problems})


    # 데이터 프레임을 만들었으니 이제 train/val split을 진행하고, positive, negative pairs를 생성합니다.
    # 청소님의 코드를 참고해서 hard negative pair를 생성하였으며, BM25대신 BM25L을 사용합니다.
    # (BM25, BM25L 모두 테스트한 결과 BM25L에서 더 좋은 성능을 보였습니다.)
    # tokenizer는 왼쪽부터 truncation을 진행하여 truncation이 필요할때는 코드의 끝 부분들을 이용하게 만듭니다.

    dacon_train_df, dacon_valid_df, dacon_train_label, dacon_valid_label = train_test_split(
        train_df,
        train_df['problem_num'],
        random_state=args.seed,
        test_size=0.1,
        stratify=filtered_codenet_df['problem_num']
    )

    dacon_train_df = dacon_train_df.reset_index(drop=True)
    dacon_valid_df = dacon_valid_df.reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)
    tokenizer.truncation_side = 'left'

    dacon_train_bm25L = get_pairs(dacon_train_df, tokenizer)
    dacon_valid_bm25L = get_pairs(dacon_valid_df, tokenizer)

    # 생성된 데이터를 저장합니다. => 이 과정까지의 생성 시간이 꽤 오래걸립니다.
    dacon_train_bm25L.to_csv("./data/" + "new_dataset_0607/graph_dacon_train_bm25L.csv", index=False)
    dacon_valid_bm25L.to_csv("./data/" + "new_dataset_0607/graph_dacon_valid_bm25L.csv", index=False)
    processed_test.to_csv("./data/new_dataset_0604/processed_test.csv", index=False)

    codenet_train_df, codenet_valid_df, codenet_train_label, codenet_valid_label = train_test_split(
        filtered_codenet_df,
        filtered_codenet_df['problem_num'],
        random_state=args.seed,
        test_size=0.1,
        stratify=filtered_codenet_df['problem_num']
    )
    codenet_train_df = codenet_train_df.reset_index(drop=True)
    codenet_valid_df = codenet_valid_df.reset_index(drop=True)

    codenet_train_bm25L = get_pairs(codenet_train_df, tokenizer)
    codenet_valid_bm25L = get_pairs(codenet_valid_df, tokenizer)

    # 생성된 데이터를 저장합니다.
    codenet_train_bm25L.to_csv("./data/" + "new_dataset_0607/graph_codenet_train_bm25L.csv",
                               index=False)
    codenet_valid_bm25L.to_csv("./data/" + "new_dataset_0607/graph_codenet_valid_bm25L.csv",
                               index=False)


def set_seed(args):
    random.seed(args.seed)
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)



def train_model(args):

    """
    전처리된 데이터 호출 및 모델을 학습시키는 과정입니다.

    순서는 다음과 같습니다.

    [1. 전처리 과정에서 생성된 데이터 호출]
    1-1. dacon - train: graph_dacon_train_bm25L.csv
    1-2, dacon - valid: graph_dacon_valid_bm25L.csv
    2-1. codenet - train: graph_codenet_train_bm25L.csv
    2-2. codenet - valid: graph_codenet_valid_bm25L.csv

    [2. 텐서 생성]
    위 데이터를 호출하여 하나의 train_data, valid_data 로 만든 후 텐서를 생성합니다.

    - 이 과정까지 각 모델에 대해 전부 진행되기 때문에, 다소 시간이 걸릴 수 있는 점 참고부탁드립니다.
    - 파일명: [모델명]_mixed_[train or valid]_[input_ids or attention_masks]_[BM25 or BM25L].pt (ex:) _mixed_train_input_ids_BM25L_0608.pt)

    :param args:
        --seed 42
        --learning_rate 2e-5
        --eps 1e-5
        --epochs 3
        --batch_size 32
        --test_batch_size 1048
        --save_tensor True
        --mode train
        --dir_path graphcodebert
        --model_name graphcodebert
        --process_name code_similarity
        --checkpoint_path microsoft/graphcodebert-base
    :return: X
    """

    set_seed(args)
    setproctitle(args.process_name)

    dacon_train_data = pd.read_csv("./data/" + "new_dataset_0607/graph_dacon_train_bm25L.csv")
    dacon_valid_data = pd.read_csv("./data/" + "new_dataset_0607/graph_dacon_valid_bm25L.csv")

    codenet_train_data = pd.read_csv("./data/" + "new_dataset_0607/graph_codenet_train_bm25L.csv")
    codenet_valid_data = pd.read_csv("./data/" + "new_dataset_0607/graph_codenet_valid_bm25L.csv")

    train_data = pd.concat([dacon_train_data, codenet_train_data], axis=0)
    valid_data = pd.concat([dacon_valid_data, codenet_valid_data], axis=0)

    # training
    c1 = train_data['code1'].values
    c2 = train_data['code2'].values
    similar = train_data['similar'].values

    N = train_data.shape[0]
    MAX_LEN = 512

    input_ids = np.zeros((N, MAX_LEN), dtype=int)
    attention_masks = np.zeros((N, MAX_LEN), dtype=int)
    labels = np.zeros((N), dtype=int)

    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)

    for i in tqdm(range(N), position=0, leave=True):
        try:
            cur_c1 = str(c1[i])
            cur_c2 = str(c2[i])
            encoded_input = tokenizer(cur_c1, cur_c2, return_tensors='pt', max_length=512, padding='max_length',
                                      truncation=True)
            input_ids[i,] = encoded_input['input_ids']
            attention_masks[i,] = encoded_input['attention_mask']
            labels[i] = similar[i]
        except Exception as e:
            print(e)
            pass


    # validating
    c1 = valid_data['code1'].values
    c2 = valid_data['code2'].values
    similar = valid_data['similar'].values

    N = valid_data.shape[0]

    MAX_LEN = 512

    valid_input_ids = np.zeros((N, MAX_LEN), dtype=int)
    valid_attention_masks = np.zeros((N, MAX_LEN), dtype=int)
    valid_labels = np.zeros((N), dtype=int)

    for i in tqdm(range(N), position=0, leave=True):
        try:
            cur_c1 = str(c1[i])
            cur_c2 = str(c2[i])
            encoded_input = tokenizer(cur_c1, cur_c2, return_tensors='pt', max_length=512, padding='max_length',
                                      truncation=True)
            valid_input_ids[i,] = encoded_input['input_ids']
            valid_attention_masks[i,] = encoded_input['attention_mask']
            valid_labels[i] = similar[i]
        except Exception as e:
            print(e)
            pass

    if os.path.exists(args.dir_path):
        os.makedirs(args.dir_path, exist_ok=True)

    print("\n\nMake tensor\n\n")
    input_ids = torch.tensor(input_ids, dtype=int)
    attention_masks = torch.tensor(attention_masks, dtype=int)
    labels = torch.tensor(labels, dtype=int)

    valid_input_ids = torch.tensor(valid_input_ids, dtype=int)
    valid_attention_masks = torch.tensor(valid_attention_masks, dtype=int)
    valid_labels = torch.tensor(valid_labels, dtype=int)


    if args.save_tensor == True:
        torch.save(input_ids, "./data/" + args.dir_path + "/" + args.model_name + '_mixed_train_input_ids_BM25L_0608.pt')
        torch.save(attention_masks, "./data/" + args.dir_path + "/" + args.model_name + '_mixed_train_attention_masks_BM25L_0608.pt')
        torch.save(labels, "./data/" + args.dir_path + "/" + args.model_name + '_mixed_train_labels_BM25L_0608.pt')

        torch.save(valid_input_ids, "./data/" + args.dir_path + "/" + args.model_name + "_mixed_valid_input_ids_BM25L_0608.pt")
        torch.save(valid_attention_masks, "./data/" + args.dir_path + "/" + args.model_name + "mixed_valid_attention_masks_BM25L_0608.pt")
        torch.save(valid_labels, "./data/" + args.dir_path + "/" + args.model_name + "mixed_valid_labels_BM25L_0608.pt")


    # Setup training
    def flat_accuracy(preds, labels):
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    def format_time(elapsed):
        elapsed_rounded = int(round((elapsed)))
        return str(datetime.timedelta(seconds=elapsed_rounded))

    train_data = TensorDataset(input_ids, attention_masks, labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

    validation_data = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=args.batch_size)

    model = AutoModelForSequenceClassification.from_pretrained(args.checkpoint_path)
    model.cuda()

    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=1e-5)  # 아직 이게 정확하지 않음

    total_steps = len(train_dataloader) * args.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    device = torch.device("cuda")
    loss_f = nn.CrossEntropyLoss()

    # Train
    train_losses, train_accuracies = [], []
    val_losses, val_accuracies = [], []
    model.zero_grad()
    for i in range(args.epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(i + 1, args.epochs))
        print('Training...')
        t0 = time.time()
        train_loss, train_accuracy = 0, 0
        model.train()
        for step, batch in tqdm(enumerate(train_dataloader), desc="Iteration", smoothing=0.05):
            if step % 10000 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
                print('  current average loss = {}'.format(
                    train_loss / step))  # bot.sendMessage(chat_id=chat_id, text = '  current average loss = {}'.format(train_loss / step))

            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
            train_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.detach().cpu().numpy()
            train_accuracy += flat_accuracy(logits, label_ids)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
        avg_train_loss = train_loss / len(train_dataloader)
        avg_train_accuracy = train_accuracy / len(train_dataloader)
        train_losses.append(avg_train_loss)
        train_accuracies.append(avg_train_accuracy)
        print("  Average training loss: {0:.8f}".format(avg_train_loss))
        print("  Average training accuracy: {0:.8f}".format(avg_train_accuracy))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        print("")
        print("Validating...")
        t0 = time.time()
        model.eval()
        val_loss, val_accuracy = 0, 0
        for step, batch in tqdm(enumerate(validation_dataloader), desc="Iteration", smoothing=0.05):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask)

            logits = outputs[0]
            logits = logits.detach().cpu()
            label_ids = b_labels.detach().cpu()
            val_loss += loss_f(logits, label_ids)

            logits = logits.numpy()
            label_ids = label_ids.numpy()
            val_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = val_accuracy / len(validation_dataloader)
        avg_val_loss = val_loss / len(validation_dataloader)
        val_accuracies.append(avg_val_accuracy)
        val_losses.append(avg_val_loss)
        print("  Average validation loss: {0:.8f}".format(avg_val_loss))
        print("  Average validation accuracy: {0:.8f}".format(avg_val_accuracy))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        # if np.min(val_losses) == val_losses[-1]:
        print("saving current best checkpoint")
        torch.save(model.state_dict(), "./data/" + args.dir_path + "/" + str(i + 1) + "_mixed_" + args.model_name + "_BM25L_0608.pt")


def inference_model(args):
    test_data = pd.read_csv("./data/new_dataset_0604/processed_test.csv")

    c1 = test_data['code1'].values
    c2 = test_data['code2'].values

    N = test_data.shape[0]
    MAX_LEN = 512

    test_input_ids = np.zeros((N, MAX_LEN), dtype=int)
    test_attention_masks = np.zeros((N, MAX_LEN), dtype=int)

    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)
    tokenizer.truncation_side = "left"

    for i in tqdm(range(N), position=0, leave=True):
        try:
            cur_c1 = str(c1[i])
            cur_c2 = str(c2[i])
            encoded_input = tokenizer(cur_c1, cur_c2, return_tensors='pt', max_length=512, padding='max_length',
                                      truncation=True)
            test_input_ids[i,] = encoded_input['input_ids']
            test_attention_masks[i,] = encoded_input['attention_mask']

        except Exception as e:
            print(e)
            pass

    test_input_ids = torch.tensor(test_input_ids, dtype=int)
    test_attention_masks = torch.tensor(test_attention_masks, dtype=int)

    if args.save_tensor == True:
        torch.save(test_input_ids, "./data/" + args.dir_path + "/" + "test_input_ids_0605.pt")
        torch.save(test_attention_masks, "./data/" + args.dir_path + "/" + "test_attention_masks_0605.pt")

    model = AutoModelForSequenceClassification.from_pretrained(args.checkpoint_path)
    PATH = "./data/" + args.dir_path + "/" + "1_mixed_" + args.model_name + "_BM25L_0608.pt"

    model.load_state_dict(torch.load(PATH))
    model.cuda()

    test_tensor = TensorDataset(test_input_ids, test_attention_masks)
    test_sampler = SequentialSampler(test_tensor)
    test_dataloader = DataLoader(test_tensor, sampler=test_sampler, batch_size=args.test_batch_size)

    submission = pd.read_csv('./data/sample_submission.csv')
    device = torch.device("cuda")

    preds = np.array([])
    for step, batch in tqdm(enumerate(test_dataloader), desc="Iteration", smoothing=0.05):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu()
        _pred = logits.numpy()
        pred = np.argmax(_pred, axis=1).flatten()
        preds = np.append(preds, pred)

    submission['similar'] = preds
    submission.to_csv('./data/submission_' + args.model_name + '_0610.csv', index=False)


def model_ensemble():
    submission = pd.read_csv('./data/sample_submission.csv')

    submission_1 = pd.read_csv('./data/submission_graphcodebert_BM25L_0610.csv')
    submission_2 = pd.read_csv('./data/submission_CodeBERTaPy_BM25L_0610.csv')
    submission_3 = pd.read_csv('./data/submission_codebert_mlm_BM25L_0610.csv')

    sub_1 = submission_1['similar']
    sub_2 = submission_2['similar']
    sub_3 = submission_3['similar']

    ensemble_preds = (sub_1 + sub_2 + sub_3) / 3

    preds = np.where(ensemble_preds > 0.5, 1, 0)

    submission['similar'] = preds

    submission.to_csv('./data/submission_ensemble_0610_v2.csv', index=False)


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Set arguments.")
# 
#     parser.add_argument("--seed", default="42", type=int, help="Random seed for initialization")
#     parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.")
#     parser.add_argument("--eps", default=1e-5, type=float, help="The initial eps.")
#     parser.add_argument("--epochs", default=3, type=int, help="Total number of epochs to train.")
#     parser.add_argument("--batch_size", type=int, default=None, help="batch_size")
#     parser.add_argument("--test_batch_size", type=int, default=None, help="test_batch_size")
# 
#     parser.add_argument("--no_cuda", default=False, type=bool, help="Say True if you don't want to use cuda.")
#     parser.add_argument("--ensemble", default=False, type=bool, help="Ensemble.")
#     parser.add_argument("--save_tensor", default=True, type=str, help="Save tensor.")
#     parser.add_argument("--mode", default="train", type=str, help="When you train the model.")
#     parser.add_argument("--dir_path", default="graphcodebert", type=str, help="Save model path.")
#     parser.add_argument("--model_name", default="graphcodebert", type=str, help="Model name.")
#     parser.add_argument("--process_name", default="code_similarity", type=str, help="process_name.")
#     parser.add_argument("--checkpoint_path", default="microsoft/graphcodebert-base", type=str, help="Pre-trained Language Model.")
# 
#     args = parser.parse_args()
# 
#     if args.mode == "train":
#         data_preprocess(args)
#         train_model(args)
#     else:
#         inference_model(args)
# 
#     if args.ensemble == True:
#         model_ensemble()
# 
#     # CUDA_VISIBLE_DEVICES=0 python code_submission.py --seed 42 --learning_rate 2e-5 --eps 1e-5 --epochs 3 --batch_size 32 --test_batch_size 1048 --save_tensor True --mode train --dir_path graphcodebert --model_name graphcodebert --process_name code_similarity --checkpoint_path microsoft/graphcodebert-base

In [5]:
import easydict

args = easydict.EasyDict({
    "seed": 42,
    "learning_rate": 2e-5,
    "eps": 1e-5,
    "epochs": 3,
    "batch_size": 32,
    "test_batch_size": 32,
    "save_tensor": False,  # argparse의 'store_true' 액션 대응
    "mode": "train",
    "dir_path": "graphcodebert",
    "model_name": "graphcodebert",
    "process_name": "code_similarity",
    "checkpoint_path": "microsoft/graphcodebert-base"
})

In [5]:
# 데이콘이 제공해준 학습 코드 데이터 데이터프레임 만들기
code_folder = "../data/train_code"  # 데이콘이 제공해준 학습 데이터 파일의 경로
problem_folders = os.listdir(code_folder)
preproc_scripts = []
problem_nums = []

for problem_folder in tqdm(problem_folders):
    scripts = os.listdir(os.path.join(code_folder, problem_folder))
    problem_num = scripts[0].split('_')[0]
    for script in scripts:
        script_file = os.path.join(code_folder, problem_folder, script)
        preprocessed_script = clean_data(script_file, data_type="dir")
        preproc_scripts.append(preprocessed_script)
    problem_nums.extend([problem_num] * len(scripts))
train_df = pd.DataFrame(data={'code': preproc_scripts, 'problem_num': problem_nums})



100%|██████████| 500/500 [09:11<00:00,  1.10s/it]


In [6]:
# 데이콘이 제공해준 테스트 코드 데이터 데이터프레임 만들기
test_df = pd.read_csv("../data/test.csv")
code1 = test_df['code1'].values
code2 = test_df['code2'].values
processed_code1 = []
processed_code2 = []
for i in tqdm(range(len(code1))):
    processed_c1 = clean_data(code1[i], data_type="file")
    processed_c2 = clean_data(code2[i], data_type="file")
    processed_code1.append(processed_c1)
    processed_code2.append(processed_c2)
processed_test = pd.DataFrame(list(zip(processed_code1, processed_code2)), columns=["code1", "code2"])

100%|██████████| 595000/595000 [00:25<00:00, 23015.23it/s]


In [7]:
dacon_train_df, dacon_valid_df, dacon_train_label, dacon_valid_label = train_test_split(
    train_df,
    train_df['problem_num'],
    random_state=args.seed,
    test_size=0.1,
)

In [8]:
dacon_train_df = dacon_train_df.reset_index(drop=True)
dacon_valid_df = dacon_valid_df.reset_index(drop=True)

tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)
tokenizer.truncation_side = 'left'

dacon_train_bm25L = get_pairs(dacon_train_df, tokenizer)
dacon_valid_bm25L = get_pairs(dacon_valid_df, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 500/500 [4:05:24<00:00, 29.45s/it]  
100%|██████████| 500/500 [25:24<00:00,  3.05s/it]


In [12]:
# 생성된 데이터를 저장합니다. => 이 과정까지의 생성 시간이 꽤 오래걸립니다.
dacon_train_bm25L.to_csv("../data/graph_dacon_train_bm25L.csv", index=False)
dacon_valid_bm25L.to_csv("../data/graph_dacon_valid_bm25L.csv", index=False)
processed_test.to_csv("../data/processed_test.csv", index=False)

OSError: [Errno 28] No space left on device

In [13]:
set_seed(args)
setproctitle(args.process_name)

# dacon_train_data = pd.read_csv("../data/" + "new_dataset_0607/graph_dacon_train_bm25L.csv")
# dacon_valid_data = pd.read_csv("../data/" + "new_dataset_0607/graph_dacon_valid_bm25L.csv")

dacon_train_data = dacon_train_bm25L
dacon_valid_data = dacon_valid_bm25L

In [14]:
train_data = dacon_train_data
valid_data = dacon_valid_data

In [16]:
sample_size = int(len(train_data) * 0.1)

# 무작위로 데이터를 추출합니다.
train_data_sample = train_data.sample(n=sample_size, random_state=42)

In [17]:
sample_size = int(len(valid_data) * 0.1)

# 무작위로 데이터를 추출합니다.
valid_data_sample = valid_data.sample(n=sample_size, random_state=42)

In [19]:
# 생성된 데이터를 저장합니다. => 이 과정까지의 생성 시간이 꽤 오래걸립니다.
train_data_sample.to_csv("../data/graph_dacon_train_bm25L.csv", index=False)
valid_data_sample.to_csv("../data/graph_dacon_valid_bm25L.csv", index=False)
processed_test.to_csv("../data/processed_test.csv", index=False)

In [5]:
dacon_train_data = pd.read_csv("../data/" + "graph_dacon_train_bm25L.csv")
dacon_valid_data = pd.read_csv("../data/" + "graph_dacon_valid_bm25L.csv")

In [6]:
train_data = dacon_train_data
valid_data = dacon_valid_data

In [7]:
sample_size = int(len(train_data) * 0.001)

train_data_sample = train_data.sample(n=sample_size, random_state=42)

sample_size = int(len(valid_data) * 0.1)

valid_data_sample = valid_data.sample(n=sample_size, random_state=42)

In [8]:
train_data = train_data_sample
valid_data = valid_data_sample

In [9]:
# training
c1 = train_data['code1'].values
c2 = train_data['code2'].values
similar = train_data['similar'].values

N = train_data.shape[0]
MAX_LEN = 512

input_ids = np.zeros((N, MAX_LEN), dtype=int)
attention_masks = np.zeros((N, MAX_LEN), dtype=int)
labels = np.zeros((N), dtype=int)

tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)

In [10]:
for i in tqdm(range(N), position=0, leave=True):
    try:
        cur_c1 = str(c1[i])
        cur_c2 = str(c2[i])
        encoded_input = tokenizer(cur_c1, cur_c2, return_tensors='pt', max_length=512, padding='max_length',
                                  truncation=True)
        input_ids[i,] = encoded_input['input_ids']
        attention_masks[i,] = encoded_input['attention_mask']
        labels[i] = similar[i]
    except Exception as e:
        print(e)
        pass

100%|██████████| 10098/10098 [00:24<00:00, 404.16it/s]


In [11]:
# validating
c1 = valid_data['code1'].values
c2 = valid_data['code2'].values
similar = valid_data['similar'].values

N = valid_data.shape[0]

MAX_LEN = 512

valid_input_ids = np.zeros((N, MAX_LEN), dtype=int)
valid_attention_masks = np.zeros((N, MAX_LEN), dtype=int)
valid_labels = np.zeros((N), dtype=int)

for i in tqdm(range(N), position=0, leave=True):
    try:
        cur_c1 = str(c1[i])
        cur_c2 = str(c2[i])
        encoded_input = tokenizer(cur_c1, cur_c2, return_tensors='pt', max_length=512, padding='max_length',
                                  truncation=True)
        valid_input_ids[i,] = encoded_input['input_ids']
        valid_attention_masks[i,] = encoded_input['attention_mask']
        valid_labels[i] = similar[i]
    except Exception as e:
        print(e)
        pass

100%|██████████| 12395/12395 [00:38<00:00, 318.76it/s]


In [12]:
if os.path.exists(args.dir_path):
    os.makedirs(args.dir_path, exist_ok=True)

print("\n\nMake tensor\n\n")
input_ids = torch.tensor(input_ids, dtype=int)
attention_masks = torch.tensor(attention_masks, dtype=int)
labels = torch.tensor(labels, dtype=int)

valid_input_ids = torch.tensor(valid_input_ids, dtype=int)
valid_attention_masks = torch.tensor(valid_attention_masks, dtype=int)
valid_labels = torch.tensor(valid_labels, dtype=int)



Make tensor


In [13]:
if args.save_tensor == True:
    torch.save(input_ids, "./data/" + args.dir_path + "/" + args.model_name + '_mixed_train_input_ids_BM25L_0608.pt')
    torch.save(attention_masks, "./data/" + args.dir_path + "/" + args.model_name + '_mixed_train_attention_masks_BM25L_0608.pt')
    torch.save(labels, "./data/" + args.dir_path + "/" + args.model_name + '_mixed_train_labels_BM25L_0608.pt')

    torch.save(valid_input_ids, "./data/" + args.dir_path + "/" + args.model_name + "_mixed_valid_input_ids_BM25L_0608.pt")
    torch.save(valid_attention_masks, "./data/" + args.dir_path + "/" + args.model_name + "mixed_valid_attention_masks_BM25L_0608.pt")
    torch.save(valid_labels, "./data/" + args.dir_path + "/" + args.model_name + "mixed_valid_labels_BM25L_0608.pt")

In [14]:
# Setup training
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

train_data = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

validation_data = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=args.batch_size)

model = AutoModelForSequenceClassification.from_pretrained(args.checkpoint_path)
model.cuda()

optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=1e-5)  # 아직 이게 정확하지 않음

total_steps = len(train_dataloader) * args.epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device("cuda")
loss_f = nn.CrossEntropyLoss()

# Train
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []
model.zero_grad()
for i in range(args.epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(i + 1, args.epochs))
    print('Training...')
    t0 = time.time()
    train_loss, train_accuracy = 0, 0
    model.train()
    for step, batch in tqdm(enumerate(train_dataloader), desc="Iteration", smoothing=0.05):
        if step % 10000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('  current average loss = {}'.format(
                train_loss / step))  # bot.sendMessage(chat_id=chat_id, text = '  current average loss = {}'.format(train_loss / step))

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.detach().cpu().numpy()
        train_accuracy += flat_accuracy(logits, label_ids)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
    avg_train_loss = train_loss / len(train_dataloader)
    avg_train_accuracy = train_accuracy / len(train_dataloader)
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)
    print("  Average training loss: {0:.8f}".format(avg_train_loss))
    print("  Average training accuracy: {0:.8f}".format(avg_train_accuracy))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Validating...")
    t0 = time.time()
    model.eval()
    val_loss, val_accuracy = 0, 0
    for step, batch in tqdm(enumerate(validation_dataloader), desc="Iteration", smoothing=0.05):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu()
        label_ids = b_labels.detach().cpu()
        val_loss += loss_f(logits, label_ids)

        logits = logits.numpy()
        label_ids = label_ids.numpy()
        val_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = val_accuracy / len(validation_dataloader)
    avg_val_loss = val_loss / len(validation_dataloader)
    val_accuracies.append(avg_val_accuracy)
    val_losses.append(avg_val_loss)
    print("  Average validation loss: {0:.8f}".format(avg_val_loss))
    print("  Average validation accuracy: {0:.8f}".format(avg_val_accuracy))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # if np.min(val_losses) == val_losses[-1]:
    print("saving current best checkpoint")
    torch.save(model.state_dict(), "../data/test.pt")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...


Iteration: 316it [2:29:14, 28.34s/it]


  Average training loss: 0.60763824
  Average training accuracy: 0.64292150
  Training epoch took: 2:29:14

Validating...


Iteration: 388it [1:54:27, 17.70s/it]


  Average validation loss: 0.45125189
  Average validation accuracy: 0.78416413
  Training epoch took: 1:54:27
saving current best checkpoint

Training...


Iteration: 316it [1:51:46, 21.22s/it]


  Average training loss: 0.47668629
  Average training accuracy: 0.75681259
  Training epoch took: 1:51:46

Validating...


Iteration: 388it [1:10:22, 10.88s/it]


  Average validation loss: 0.43954161
  Average validation accuracy: 0.76861967
  Training epoch took: 1:10:22
saving current best checkpoint

Training...


Iteration: 316it [1:40:53, 19.16s/it]


  Average training loss: 0.41589699
  Average training accuracy: 0.79513889
  Training epoch took: 1:40:54

Validating...


Iteration: 388it [1:10:22, 10.88s/it]


  Average validation loss: 0.41744193
  Average validation accuracy: 0.79817830
  Training epoch took: 1:10:22
saving current best checkpoint


In [4]:
test_data = pd.read_csv("../data/processed_test.csv")

c1 = test_data['code1'].values
c2 = test_data['code2'].values

N = test_data.shape[0]
MAX_LEN = 512

test_input_ids = np.zeros((N, MAX_LEN), dtype=int)
test_attention_masks = np.zeros((N, MAX_LEN), dtype=int)

tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)
tokenizer.truncation_side = "left"

for i in tqdm(range(N), position=0, leave=True):
    try:
        cur_c1 = str(c1[i])
        cur_c2 = str(c2[i])
        encoded_input = tokenizer(cur_c1, cur_c2, return_tensors='pt', max_length=512, padding='max_length',
                                  truncation=True)
        test_input_ids[i,] = encoded_input['input_ids']
        test_attention_masks[i,] = encoded_input['attention_mask']

    except Exception as e:
        print(e)
        pass

test_input_ids = torch.tensor(test_input_ids, dtype=int)
test_attention_masks = torch.tensor(test_attention_masks, dtype=int)

if args.save_tensor == True:
    torch.save(test_input_ids, "./data/" + args.dir_path + "/" + "test_input_ids_0605.pt")
    torch.save(test_attention_masks, "./data/" + args.dir_path + "/" + "test_attention_masks_0605.pt")

model = AutoModelForSequenceClassification.from_pretrained(args.checkpoint_path)
PATH = "../data/test.pt"

model.load_state_dict(torch.load(PATH))
model.cuda()



100%|██████████| 595000/595000 [21:45<00:00, 455.87it/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Iteration: 0it [00:13, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.28 GiB. GPU 0 has a total capacity of 12.00 GiB of which 0 bytes is free. Of the allocated memory 18.91 GiB is allocated by PyTorch, and 54.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:
test_tensor = TensorDataset(test_input_ids, test_attention_masks)
test_sampler = SequentialSampler(test_tensor)
test_dataloader = DataLoader(test_tensor, sampler=test_sampler, batch_size=args.test_batch_size)

submission = pd.read_csv('../data/sample_submission.csv')
device = torch.device("cuda")

preds = np.array([])
for step, batch in tqdm(enumerate(test_dataloader), desc="Iteration", smoothing=0.05):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    logits = outputs[0]
    logits = logits.detach().cpu()
    _pred = logits.numpy()
    pred = np.argmax(_pred, axis=1).flatten()
    preds = np.append(preds, pred)

submission['similar'] = preds
submission.to_csv('../submission/bert_1', index=False)

Iteration: 18594it [3:17:01,  1.57it/s]
