In [5]:
from tqdm import tqdm
import regex as re
import glob
import os
import pandas as pd
import random

In [6]:
# Input/output locations. Both are relative paths, so they resolve against the
# directory the notebook kernel is running in.
path_to_intellij = r'../../intellij-community' # root of the local intellij-community checkout to read from
path_to_save = r'../data' # directory that receives the generated CSV files

In [7]:
def extract_methods(root_path):
    """Collect the source text of Java methods found below ``root_path``.

    Every ``*.java`` file under ``root_path`` (searched recursively) is read as
    UTF-8, with undecodable bytes dropped, and scanned with a regular
    expression for ``<modifiers> <type> <name>(<params>) { ... }``.

    The pattern is a heuristic, not a Java parser. The signature part only
    accepts a single-word return type followed by the name, a parameter list
    that contains no ``)``, and an opening brace directly after the parameter
    list. Braces are balanced purely by counting, so braces inside string
    literals or comments are counted as well.

    Args:
        root_path: Directory whose ``*.java`` files are scanned.

    Returns:
        list[str]: The matched method texts, ordered by sorted file path and
        then by position within each file.
    """
    # `re` is the third-party `regex` module here; stdlib `re` has no recursion.
    # (?&body) recurses into the brace group only. The previous (?R) recursed
    # into the *whole* pattern, so a nested block had to look like a complete
    # method declaration and methods containing plain `if`/`for`/`try` blocks
    # were never matched. The possessive quantifiers keep unbalanced input from
    # triggering needless backtracking.
    method_pattern = re.compile(
        r'\b(?:public|private|protected|static|final|\s)+\w+\s+\w+\([^)]*\)\s*'
        r'(?P<body>\{(?:[^{}]++|(?&body))*+\})'
    )

    method_texts = []
    skipped = []  # (path, exception) for files that could not be processed
    # glob gives no ordering guarantee; sort so the result order is deterministic.
    files = sorted(glob.glob(os.path.join(root_path, '**/*.java'), recursive=True))
    pbar = tqdm(files, desc='Extracting methods', unit='file', leave=True)
    for file in pbar:
        try:
            pbar.set_postfix(file=os.path.basename(file))
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                file_text = f.read()
            # finditer + group(0): the pattern has a capturing group, so
            # findall would return only the brace body instead of the method.
            method_texts.extend(m.group(0) for m in method_pattern.finditer(file_text))
        except Exception as exc:
            # Best effort: one bad file must not abort the whole scan, but
            # unlike a bare `except` this still lets KeyboardInterrupt through.
            skipped.append((file, exc))

    if skipped:
        first_path, first_error = skipped[0]
        tqdm.write(
            f'Skipped {len(skipped)} file(s) that could not be processed; '
            f'first: {first_path} ({first_error!r})'
        )

    return method_texts

In [8]:
# Scan the whole checkout. The recorded run below processed 68,678 files in about 2m39s.
samples = extract_methods(path_to_intellij)

Extracting methods: 100%|██████████| 68678/68678 [02:39<00:00, 430.33file/s, file=UnnecessaryNonCapturingGroupInspectionTest.java]                                                    


In [9]:
def extract_method_name(method_text):
    """Return the method name parsed from ``method_text``, or ``None``.

    Searches for the first ``<word> <whitespace> <word>`` sequence whose second
    word is immediately followed by a parenthesised list containing no ``)``,
    and returns that second word. ``None`` is returned when the text contains
    no such sequence.
    """
    found = re.search(r'\b\w+\s+(\w+)(?=\([^)]*\))', method_text)
    if found is None:
        return None
    return found.group(1)

In [10]:
# Turn each method text into a [code, name] pair and keep only the pairs for
# which a name could be parsed.
samples = [
    [method_text, method_name]
    for method_text, method_name in (
        (text, extract_method_name(text)) for text in samples
    )
    if method_name is not None
]

In [11]:
# Fixed seed: the same `samples` order always yields the same shuffle, and
# therefore the same train/val/test split, across kernel restarts.
SPLIT_SEED = 42
# Cumulative split boundaries: 60% train, the next 20% validation, the rest test.
TRAIN_END_FRACTION = 0.6
VAL_END_FRACTION = 0.8

# A dedicated Random instance keeps the module-level generator's state untouched.
random.Random(SPLIT_SEED).shuffle(samples)

n_samples = len(samples)
train_end = int(n_samples * TRAIN_END_FRACTION)
val_end = int(n_samples * VAL_END_FRACTION)

train_samples = samples[:train_end]
val_samples = samples[train_end:val_end]
test_samples = samples[val_end:]

# The three splits must partition the data: nothing lost, nothing duplicated.
assert len(train_samples) + len(val_samples) + len(test_samples) == n_samples

In [12]:
# One CSV per split; the two lists are paired positionally by zip() below.
files = ['intellij-train.csv', 'intellij-val.csv', 'intellij-test.csv']
# NOTE: this rebinds `samples` from the list of [code, name] pairs to the list
# of splits, so the earlier cells that consume `samples` must not be re-run
# after this one without re-running the extraction first.
samples = [train_samples, val_samples, test_samples]

# to_csv does not create missing directories; make sure the target exists.
os.makedirs(path_to_save, exist_ok=True)

for file, sample in zip(files, samples):
    # Each sample is a [code, name] pair, which maps onto the two columns.
    df = pd.DataFrame(sample, columns=['code', 'name'])
    df.to_csv(os.path.join(path_to_save, file), index=False)