In [0]:
from tqdm import tqdm
import pandas as pd
import glob
import os
import regex as re
import random

In [None]:
path_to_intellij = r'../intellij-community' # path to the intellij community folder
path_to_save = r'./data'

In [3]:
def extract_methods(root_path):
    method_pattern = re.compile(r'\b(?:public|private|protected|static|final|\s)+\w+\s+\w+\([^)]*\)\s*\{(?:[^{}]|(?R))*\}')

    method_texts = []
    files = glob.glob(os.path.join(root_path, '**/*.java'), recursive=True)
    pbar = tqdm(files, desc='Extracting methods', unit='file', leave=True)
    for file in pbar:
        try:
            pbar.set_postfix(file=os.path.basename(file))
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                file_text = f.read()
                method_texts += method_pattern.findall(file_text)
        except:
            pass

    return method_texts

In [4]:
samples = extract_methods(path_to_intellij)

Extracting methods:  14%|█▍        | 9480/68678 [00:16<00:53, 1107.49file/s, file=UseIntConstantsFromTargetClassReturnValue.java]                               IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Extracting methods:  19%|█▉        | 13323/68678 [00:20<00:49, 1113.28file/s, file=after2.java]                                                                                        IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Extracting methods:  

In [6]:
def extract_method_name(method_text):
    method_name_pattern = re.compile(r'\b\w+\s+(\w+)(?=\([^)]*\))')
    match = method_name_pattern.search(method_text)
    return match.group(1) if match else None

In [7]:
samples = [[s, extract_method_name(s)] for s in samples]
samples = [s for s in samples if s[1] is not None]

In [8]:
random.shuffle(samples)
train_samples = samples[:int(len(samples)*0.6)]
val_samples = samples[int(len(samples)*0.6):int(len(samples)*0.8)]
test_samples = samples[int(len(samples)*0.8):]

In [9]:
files = ['train.csv', 'val.csv', 'test.csv']
samples = [train_samples, val_samples, test_samples]

for file, sample in zip(files, samples):
    df = pd.DataFrame(sample, columns=['method', 'method_name'])
    df.to_csv(os.path.join(path_to_save, file), index=True)