In [136]:
import glob
import os
import regex as re
from tqdm import tqdm

def extract_methods(root_path, max_methods=1000):
    """Collect the source text of Java methods found under ``root_path``.

    Every ``*.java`` file below ``root_path`` is searched (recursively) with a
    regular expression that looks for a run of modifiers/whitespace, a return
    type, a method name, a parameter list and a brace-balanced body.

    Args:
        root_path: Directory to search for ``.java`` files.
        max_methods: Stop scanning further files once more than this many
            methods have been collected. The check runs after each file, so
            the result may hold more than ``max_methods`` entries. Pass
            ``None`` to scan every file.

    Returns:
        A list of strings, one per matched method (header plus body).

    Note:
        Braces are balanced purely textually, so a brace inside a string
        literal or a comment can still confuse the match. Files are read as
        UTF-8 with undecodable bytes ignored.
    """
    # The body is a named group that recurses into *itself* via (?&body).
    # Recursing into the whole pattern with (?R) would require every nested
    # `{...}` block to be preceded by a full method header, so methods with
    # ordinary nested blocks (if/for/try) could not be matched.
    method_pattern = re.compile(
        r'\b(?:public|private|protected|static|final|\s)+\w+\s+\w+\([^)]*\)\s*'
        r'(?P<body>\{(?:[^{}]|(?&body))*\})'
    )

    method_texts = []
    files = glob.glob(os.path.join(root_path, '**/*.java'), recursive=True)
    # The context manager closes the progress bar even when the loop exits early.
    with tqdm(files, desc='Extracting methods', unit='file', leave=True) as pbar:
        for file in pbar:
            pbar.set_postfix(file=os.path.basename(file))
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                file_text = f.read()

            # finditer + group(0): the pattern has a capture group, so findall
            # would return only the body group instead of the whole method.
            method_texts.extend(
                match.group(0) for match in method_pattern.finditer(file_text)
            )

            if max_methods is not None and len(method_texts) > max_methods:
                break

    return method_texts

In [137]:
# Root folder of a local IntelliJ Community checkout. It is left blank here and
# should be filled in before running: with an empty string the '**/*.java' glob
# built inside extract_methods is resolved relative to the current working directory.
path_to_intellij = r'' # path to the intellij community folder
# Raw method source texts (one string per matched method) found under that folder.
samples = extract_methods(path_to_intellij)

Extracting methods:   0%|          | 308/68678 [00:00<02:31, 451.41file/s, file=JavacConfigurable.java]                              


In [138]:
def extract_method_name(method_text):
    """Return the name of the first method-like declaration in ``method_text``.

    The name is the identifier that follows another word plus whitespace and
    is immediately followed by a parenthesised parameter list. Returns
    ``None`` when the text contains no such construct.
    """
    found = re.search(r'\b\w+\s+(\w+)(?=\([^)]*\))', method_text)
    if found is None:
        return None
    # Group 1 is the identifier directly before the opening parenthesis.
    return found.group(1)

In [139]:
# Pair every method text with the name parsed from it: (method_text, method_name).
samples = list(zip(samples, map(extract_method_name, samples)))

In [140]:
# Shuffle, then split into 60% train, 20% validation and 20% test.
import random

# Fixed seed so a fresh Restart & Run All reproduces the same split; a private
# Random instance leaves the global random state untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(samples)

# Compute each boundary once so adjacent slices share exactly the same index.
train_end = int(len(samples) * 0.6)
val_end = int(len(samples) * 0.8)
train_samples = samples[:train_end]
val_samples = samples[train_end:val_end]
test_samples = samples[val_end:]

# Every sample must land in exactly one split.
assert len(train_samples) + len(val_samples) + len(test_samples) == len(samples)

In [141]:
import pandas as pd
path_to_save = r'' # path to save the csv files
files = ['train.csv', 'val.csv', 'test.csv']
samples = [train_samples, val_samples, test_samples]

# Create the output folder up front so a missing directory does not abort the
# export after the extraction has already run. An empty path means "current
# working directory", which needs no creation (os.makedirs('') would raise).
if path_to_save:
    os.makedirs(path_to_save, exist_ok=True)

# Write one CSV per split with two columns: the method text and its name.
for file, sample in zip(files, samples):
    df = pd.DataFrame(sample, columns=['method', 'method_name'])
    df.to_csv(os.path.join(path_to_save, file), index=False)
