In [1]:
from datasets import config
# Show the directory that the Hugging Face `datasets` library reports as its dataset cache.
print(config.HF_DATASETS_CACHE)

/home/quanvo/.cache/huggingface/datasets


## PROCESS DATA FOR FINAL EVALUATION<br>
In The Vault dataset, there are hundreds of thousands to millions of samples for each programming language. Evaluating LLMs on all samples is prohibitively costly. To compare different methods with each other, a few hundred samples (200 per language in LLM4CodeSummarization) are enough.
In this project, I want to answer: <br>
    <b>RQ1: How effective is in-context learning using LLMs in code comment generation? <br>
	RQ2: Does project-specific context improve the performance of LLMs? <br>
	RQ3: Does retrieval-augmented generation improve the performance of LLMs? <br>
	RQ4: How do LLMs perform for different programming languages? <br></b>
To evaluate LLMs for RQ1, RQ2 and RQ3, I collect all functions from 200 different projects; each project needs to have at least 5 functions in The Vault to be considered.
The data structure looks like this: <br>
<b>ProjectObject:</b><br>
    {
    "project_name": name of the project, corresponding to "repo" in The Vault, <br>
    "functions": List[FunctionObject]. List of functions belonging to this project<br>
    } <br>
<b>FunctionObject</b>: <br>
    {
        "code": code of function, corresponding to "code" in The Vault, <br>
        "docstring": full docstring of function, corresponding to "docstring" in The Vault, <br>
        "project": name of the project, corresponding to "repo" in The Vault, <br>
        "code_tokens": list of tokens, corresponding to "code_tokens" in The Vault, <br>
        "code_tokens_processed": list of tokens after some processing for bm25 retrieval, <br>
        "bm25": list of code and docstring of 5 most similar functions retrieved using bm25, <br>
        "CodeBERT": list of code and docstring of 5 most similar functions retrieved using CodeBERT embedding, <br>
    }

<b>STEP 1: Collect functions from 200 projects</b>

In [2]:
from datasets import load_dataset
from collections import defaultdict
import json
import os

def the_vault_function_to_json(
    split_set="test",
    languages=["java"],
    streaming=True,
    num_proj=None,
    num_method=None,
    min_num_functions=5, #keep only project with at least min_num_functions functions
    write_to_file=False,
    output_dir="../data"
):
    """
    Loads and processes functions from The Vault dataset, grouping them by project.

    This function fetches code samples from the `Fsoft-AIC/the-vault-function` dataset
    using the Hugging Face `datasets` library. Consecutive samples that share the same
    `repo` value are grouped into one project entry, and projects with fewer than
    `min_num_functions` functions are dropped. The output can optionally be saved to
    disk as JSONL files (one project per line).

    Note: grouping is done on runs of consecutive samples, so a repository whose
    samples are not stored contiguously would produce several entries.
    # assumes The Vault stores each repo's samples together -- TODO confirm

    Args:
        split_set (str): Dataset split to load (e.g., "test", "train", "train/small",
            "train/medium"). The value is passed unchanged to the loader for every
            language; "train/small" and "train/medium" are read back from the loaded
            dataset under the keys "train_small" and "train_medium".
        languages (List[str]): List of programming languages to load (e.g., ["java"]).
            The default list is never mutated.
        streaming (bool): Whether to stream the dataset instead of loading fully into memory.
        num_proj (int, optional): Maximum number of unique projects to collect. Cannot be used with `num_method`.
        num_method (int, optional): Stop collecting once at least this many functions have
            been gathered. Projects are always kept whole, so the total can exceed this
            value by part of the last project. Cannot be used with `num_proj`.
        min_num_functions (int): Minimum number of functions a project must have to be included.
        write_to_file (bool): Whether to write the processed data to JSONL files.
        output_dir (str): Directory where JSONL files will be saved if `write_to_file` is True.

    Returns:
        dict: A dictionary mapping each language to a list of project entries.
              Each entry is a dict with keys `"project_name"` and `"functions"`,
              where `"functions"` is a list of dicts with keys `"code"`,
              `"code_tokens"`, `"docstring"`, `"project"` and `"name"`.

    Raises:
        AssertionError: If both `num_proj` and `num_method` are provided.
    """
    # Local import: groupby yields runs of consecutive samples with the same repo.
    from itertools import groupby

    assert not (num_proj and num_method), "Cannot use both num_proj and num_method"

    if write_to_file:
        os.makedirs(output_dir, exist_ok=True)

    # The loader is asked for "train/small" but the resulting dataset dict is indexed
    # with "train_small". Keep the two spellings in separate variables so `split_set`
    # is still the caller's value when the next language is loaded.
    split_key = {"train/small": "train_small", "train/medium": "train_medium"}.get(split_set, split_set)

    data_lang = {}
    for lang in languages:
        dataset = load_dataset("Fsoft-AIC/the-vault-function",
                               split_set=[split_set],
                               languages=[lang],
                               streaming=streaming,
                               trust_remote_code=True)
        dataset = dataset[split_key]
        print(f"Loaded {lang}, split={split_key}, streaming={streaming}")

        data = []
        collected_projects = set()
        total = 0  # number of functions kept so far
        for project, samples in groupby(dataset, key=lambda sample: sample["repo"]):
            # Limits are checked before a new project is consumed, so the project
            # that reaches the limit is still stored in full.
            if num_proj and len(collected_projects) >= num_proj:
                break
            if num_method and total >= num_method:
                break
            project_function = [
                {
                    "code": sample["code"],
                    "code_tokens": sample["code_tokens"],
                    "docstring": sample["docstring"],
                    "project": sample["repo"],
                    "name": sample["identifier"]
                }
                for sample in samples
            ]
            # "At least" min_num_functions; the last project of the stream is
            # handled here exactly like every other one.
            if len(project_function) >= min_num_functions:
                data.append({"project_name": project, "functions": project_function})
                collected_projects.add(project)
                total += len(project_function)

        print(f"{lang} - collected {total} samples across {len(collected_projects)} projects")

        if write_to_file:
            output_path = os.path.join(output_dir, f"{lang}_{split_key}.jsonl")
            with open(output_path, "w") as f:
                for item in data:
                    f.write(json.dumps(item) + "\n")
            print(f"Saved to {output_path}")
        data_lang[lang] = data
    return data_lang

In [3]:
from datasets import load_dataset
from collections import defaultdict
import json
import os

def the_vault_class_to_json(
    languages=["java"],
    streaming=False,
    num_proj=None,
    num_class=None,
    min_num_classes=5, #keep only project with at least min_num_classes classes
    write_to_file=False,
    output_dir="../data"
):
    """
    Loads and processes classes from The Vault dataset, grouping them by project.

    This function fetches code samples from the `Fsoft-AIC/the-vault-class` dataset
    using the Hugging Face `datasets` library. It groups classes by project and filters 
    them based on a minimum number of classes per project. The output can optionally 
    be saved to disk as JSONL files.

    Args:
        split_set (str): Dataset split to load (e.g., "train", "validation", "test").
        languages (List[str]): List of programming languages to load (e.g., ["java"]).
        streaming (bool): Whether to stream the dataset instead of loading fully into memory.
        num_proj (int, optional): Maximum number of unique projects to collect. Cannot be used with `num_class`.
        num_class (int, optional): Maximum number of individual classes to collect. Cannot be used with `num_proj`.
        min_num_classes (int): Minimum number of classes a project must have to be included.
        write_to_file (bool): Whether to write the processed data to JSONL files.
        output_dir (str): Directory where JSONL files will be saved if `write_to_file` is True.

    Returns:
        dict: A dictionary mapping each language to a list of project entries. 
              Each entry is a dict with keys `"project_name"` and `"classes"`, 
              where `"classes"` is a list of class metadata.
              
    Raises:
        AssertionError: If both `num_proj` and `num_class` are provided.
    """
    
    assert not (num_proj and num_class), "Cannot use both num_proj and num_class"

    if write_to_file:
        os.makedirs(output_dir, exist_ok=True)

    data_lang = {}
    for lang in languages:
        dataset = load_dataset("/home/quanvo/Documents/van-vo-projects/llm-code-comment-gen/the-vault-class", 
                               languages=[lang], 
                               streaming=streaming,
                               trust_remote_code=True)
        dataset = dataset['train']
        print(f"Loaded {lang}, streaming={streaming}")

        data = []
        project_class = []
        project = ""
        proj_counts = defaultdict(int)
        seen_projects = set()
        i = 0
        total = 0
        for sample in dataset:
            i += 1
            #print(f"{i}: {len(seen_projects)}")
            if num_proj and len(seen_projects) >= num_proj:
                break
            if num_class and len(data) >= num_class:
                break
            item = {
                "code": sample["code"].replace(sample["original_docstring"], ""),
                "code_tokens": sample["code_tokens"],
                "docstring": sample["docstring"],
                "project": sample["repo"],
                "name": sample["identifier"]
            }
            # Encounter new project: append project_class to data, refresh project_class
            if project != item["project"] and project != "":
                if len(project_class) > min_num_classes:
                    data.append({"project_name": project, "classes": project_class})
                    total += len(project_class)
                    if num_proj:
                        seen_projects.add(project)
                project_class = []
            project = item["project"]
            project_class.append(item)
            proj_counts[project] += 1

        print(f"{lang} - collected {total} samples across {len(seen_projects)} projects")

        if write_to_file:
            output_path = os.path.join(output_dir, f"{lang}_{split_set}.jsonl")
            with open(output_path, "w") as f:
                for item in data:
                    f.write(json.dumps(item) + "\n")
            print(f"Saved to {output_path}")
        data_lang[lang] = data
    return data_lang

In [4]:
# Run configuration for the whole notebook.
# option 0: inline (not implemented yet), 1: functions, 2: classes
option = 2
lang = "java"

if option == 1:
    data_lang = the_vault_function_to_json(split_set="test",
                                           languages=[lang],
                                           num_proj=200,
                                           write_to_file=False)
elif option == 2:
    data_lang = the_vault_class_to_json(languages=[lang],
                                    num_proj=200,
                                    streaming=True,
                                    write_to_file=False)
else:
    # Fail here with a clear message instead of a NameError (fresh kernel) or a
    # silent reuse of a stale `data_lang` (old kernel) on the next line.
    raise NotImplementedError(f"option={option} is not supported; use 1 (functions) or 2 (classes)")

# List of project entries for the selected language; later cells add fields to it in place.
data = data_lang[lang]

Loaded java, streaming=True
java - collected 2170 samples across 200 projects


<b>Step 2: Process code tokens </b>

In [5]:
import re
import keyword

def split_identifier(token):
    """Break an identifier into its word parts.

    The token is first cut at underscores (snake_case); every piece is then cut
    at case boundaries (camelCase / PascalCase). A run of capitals stays together
    and ends before a capital that starts a lower-case word, e.g.
    "parseXMLFile" -> ["parse", "XML", "File"]. Characters that are not ASCII
    letters are not part of any returned word.

    Args:
        token (str): Identifier to split.

    Returns:
        list[str]: The words, in their original order and original casing.
    """
    # Either an optionally capitalised lower-case word, or a run of capitals
    # that is not followed by a lower-case letter.
    word_pattern = re.compile(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])')
    return [
        word
        for chunk in token.split('_')
        for word in word_pattern.findall(chunk)
    ]

# Java keywords and literals; tokens equal to one of these are dropped by is_stopword_java.
JAVA_KEYWORDS = {
    'abstract', 'assert', 'boolean', 'break', 'byte', 'case', 'catch', 'char',
    'class', 'const', 'continue', 'default', 'do', 'double', 'else', 'enum',
    'extends', 'final', 'finally', 'float', 'for', 'goto', 'if', 'implements',
    'import', 'instanceof', 'int', 'interface', 'long', 'native', 'new',
    'package', 'private', 'protected', 'public', 'return', 'short', 'static',
    'strictfp', 'super', 'switch', 'synchronized', 'this', 'throw', 'throws',
    'transient', 'try', 'void', 'volatile', 'while', 'true', 'false', 'null'
}

# Frequently used Java API names, also dropped by is_stopword_java.
# NOTE(review): process_tokens_java filters AFTER identifiers are split, so a
# multi-word entry such as 'ArrayList' is compared against 'Array' and 'List'
# separately and never matches as a whole -- confirm this is intended.
JAVA_API_COMMON = {
    'System', 'out', 'in', 'err', 'print', 'println', 'printf', 'String', 'Integer',
    'List', 'ArrayList', 'Map', 'HashMap', 'Set', 'HashSet', 'Math', 'Object',
}

# Punctuation and operator tokens dropped by both is_stopword_java and is_stopword_python.
# This is a list (membership is a linear scan) and "-" is listed twice.
# "-LRB-", "-RRB-", "-LCB-", "-RCB-" are presumably escaped bracket tokens -- TODO confirm.
PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?", "!", ",", ":", "-", "--", "...", ";", "(", ")", 
               "[", "]", "=", ">", "<", "+", "-", "/", "*", ">=", "<=", "==", "+=", "-=", "/=", "*="]

def is_stopword_java(token):
    """Tell whether `token` should be removed from a Java token stream.

    The comparison is an exact, case-sensitive match against JAVA_KEYWORDS,
    JAVA_API_COMMON and PUNCTUATIONS (so "for" is a stopword but "For" is not).

    Args:
        token (str): A single code token.

    Returns:
        bool: True if the token is a Java keyword, a listed common API name or
        a punctuation/operator token.
    """
    return token in JAVA_KEYWORDS or token in JAVA_API_COMMON or token in PUNCTUATIONS

def process_tokens_java(tokens):
    """Normalise Java code tokens for lexical retrieval.

    Tokens made only of ASCII letters and underscores are expanded into their
    word parts with split_identifier; every other token is kept unchanged.
    Afterwards all entries for which is_stopword_java is true are removed.

    Args:
        tokens (Iterable[str]): Raw code tokens.

    Returns:
        list[str]: Expanded tokens without stopwords, in their original order.
    """
    identifier_like = re.compile(r'^[A-Za-z_]+$')
    expanded = []
    for raw in tokens:
        pieces = split_identifier(raw) if identifier_like.match(raw) else [raw]
        expanded += pieces
    return [piece for piece in expanded if not is_stopword_java(piece)]

#################################PYTHON##############################################
# Python keywords as listed by the standard `keyword` module; dropped by is_stopword_python.
PYTHON_KEYWORDS = set(keyword.kwlist)

# Selected Python built-in function and type names, also dropped by is_stopword_python.
PYTHON_BUILTINS = {
    'print', 'input', 'len', 'range', 'open', 'map', 'filter', 'zip', 'int', 'float', 'str',
    'list', 'dict', 'set', 'tuple', 'type', 'isinstance', 'enumerate', 'sorted', 'reversed',
    'sum', 'min', 'max', 'abs', 'any', 'all', 'bool', 'dir', 'divmod', 'id', 'hex', 'oct',
    'ord', 'chr', 'pow', 'round', 'slice', 'vars', 'format', 'next', 'iter'
}

def is_stopword_python(token):
    """Tell whether `token` should be removed from a Python token stream.

    The comparison is an exact, case-sensitive match against PYTHON_KEYWORDS,
    PYTHON_BUILTINS and PUNCTUATIONS.

    Args:
        token (str): A single code token.

    Returns:
        bool: True if the token is a Python keyword, a listed built-in name or
        a punctuation/operator token.
    """
    return token in PYTHON_KEYWORDS or token in PYTHON_BUILTINS or token in PUNCTUATIONS

def process_tokens_python(tokens):
    """Normalise Python code tokens for lexical retrieval.

    Tokens made only of ASCII letters and underscores are expanded into their
    word parts with split_identifier; every other token is kept unchanged.
    Afterwards all entries for which is_stopword_python is true are removed.

    Args:
        tokens (Iterable[str]): Raw code tokens.

    Returns:
        list[str]: Expanded tokens without stopwords, in their original order.
    """
    identifier_like = re.compile(r'^[A-Za-z_]+$')
    expanded = []
    for raw in tokens:
        pieces = split_identifier(raw) if identifier_like.match(raw) else [raw]
        expanded += pieces
    return [piece for piece in expanded if not is_stopword_python(piece)]

def process_tokens(tokens, lang):
    """Run the token normaliser that belongs to `lang`.

    Args:
        tokens (Iterable[str]): Raw code tokens.
        lang (str): 'java' or 'python' (exact, lower-case).

    Returns:
        list[str] | None: The processed tokens. For any other language the
        message "Invalid language" is printed and None is returned.
    """
    if lang == 'java':
        return process_tokens_java(tokens)
    if lang == 'python':
        return process_tokens_python(tokens)
    print("Invalid language")
    return None

In [6]:
# Key under which every project entry stores its samples, chosen from `option`
# (0 -> 'snippet', 1 -> 'functions', anything else -> 'classes').
data_type = {0: 'snippet', 1: 'functions'}.get(option, 'classes')

# Attach the retrieval-ready token list to every sample, in place.
for project_entry in data:
    for sample_entry in project_entry[data_type]:
        sample_entry['code_tokens_processed'] = process_tokens(sample_entry['code_tokens'], lang)

In [7]:
# Spot check: processed tokens of the first sample in the first collected project.
data[0][data_type][0]['code_tokens_processed']

['First',
 'Page',
 'Loading',
 'Partial',
 'State',
 'Changes',
 '{',
 '@',
 'Override',
 'to',
 '{',
 '"',
 'FirstPageLoadingState{}',
 '"',
 '}',
 '}']

<b>Step 3: run bm25 </b> <br>
In this step, I use bm25s (https://github.com/xhluca/bm25s/tree/main) for faster BM25 retrieval. However, the retrieve() method of bm25s does not return the indices of the retrieved documents, so we cannot find the corresponding docstrings. Therefore, I had to modify the library code. In the bm25s/bm25s/__init__.py file, change "allowed_return_as = ["tuple", "documents"]" on line 685 to "allowed_return_as = ["tuple", "documents", "tuple_custom", "documents_custom"]". On lines 853-858, insert: <br>
elif return_as == "tuple_custom": <br>
    return Results(documents=retrieved_docs, scores=scores), indices <br>
elif return_as == "documents_custom": <br>
    return retrieved_docs, indices <br>
Then, I use indices to get docstring. Below is the code example.

In [None]:
import bm25s
# Load the corpus that retrieved examples are drawn from.
if option == 1:
    dataset = load_dataset("Fsoft-AIC/the-vault-function", 
                           split_set=["train/small"], 
                           languages=[lang], 
                           streaming=False,
                           trust_remote_code=True)
    dataset = dataset["train_small"]
elif option == 2:
    # Not streamed: the retrieval loop below reads rows by position (dataset[idx]),
    # which a streamed (iterable) dataset does not support.
    dataset = load_dataset("/home/quanvo/Documents/van-vo-projects/llm-code-comment-gen/the-vault-class", 
        languages=[lang], 
        streaming=False,
        trust_remote_code=True)
    dataset = dataset["train"]
else:
    raise NotImplementedError(f"BM25 retrieval is not implemented for option={option}")

print("create corpus")
corpus = []
corpus_tokens = []
for position, sample in enumerate(dataset):
    if position % 10000 == 0:
        print(position)
    corpus.append(sample["code"])
    corpus_tokens.append(process_tokens(sample["code_tokens"], lang))

print("indexing...")
retriever = bm25s.BM25(corpus=corpus)
retriever.index(corpus_tokens)

# return_as="tuple_custom" only exists in the locally patched bm25s described in
# the text above; it returns (results, indices).
# NOTE(review): stock bm25s appears to return document indices from retrieve()
# when no corpus is attached to the retriever -- if confirmed, the patch and the
# `corpus` list could both be dropped.
num_examples = 5
# For classes the queries were collected from the same "train" split that forms
# the corpus, so one extra hit is requested and the best one is discarded on the
# expectation that it is the query itself.
# NOTE(review): nothing verifies that the discarded hit really is the query.
num_skipped = 0 if option == 1 else 1

for i in range(len(data)):
    print(f"project {i + 1}/{len(data)}")
    for j in range(len(data[i][data_type])):
        query_tokens = data[i][data_type][j]['code_tokens_processed']
        res, indices = retriever.retrieve([query_tokens],
                                          k=num_examples + num_skipped,
                                          return_as="tuple_custom")
        indices = indices[0, num_skipped:]
        retrieved_examples = []
        for ind in indices:
            row = dataset[ind.item()]  # one row lookup per hit instead of three
            retrieved_examples.append({'code': row['code'],
                                       'docstring': row['docstring'],
                                       'code_tokens': row['code_tokens']})
        data[i][data_type][j]['bm25'] = retrieved_examples

create corpus
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370

In [None]:
# Write the collected projects (now carrying the 'bm25' field) as JSONL, one project per line.
output_dir = '.'
output_names = {
    1: f"{lang}-test-train-small.jsonl",
    2: f"{lang}-class.jsonl",
}
output_path = None
if option in output_names:
    output_path = os.path.join(output_dir, output_names[option])
    with open(output_path, "w") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")
    print(f"Saved to {output_path}")
else:
    # Previously this case printed "Saved to None" although nothing was written.
    print(f"Nothing saved: no output file is defined for option={option}")

<b>Step 4: CodeBERT embedding </b>

In [None]:
# dataset = load_dataset("Fsoft-AIC/the-vault-function", 
#                        split_set=["train/small"], 
#                        languages=[lang], 
#                        streaming=False,
#                        trust_remote_code=True)

In [None]:
import numpy as np
import torch
import time
from transformers import AutoTokenizer, AutoModel

# CodeBERT encoder, moved to the GPU when one is available and put in inference mode.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Source code of every corpus sample, in dataset order (row k of the embedding
# matrix therefore belongs to the k-th sample of `dataset`).
codes = [item["code"] for item in dataset]
batch_size = 32
print(len(codes))

# Run through model
embeddings = []
start = time.time()
for i in range(0, len(codes), batch_size):
    batch_codes = codes[i:i + batch_size]
    if (i // batch_size) % 100 == 0:
        print(i)  # progress: offset of the current batch, every 100 batches
    inputs = tokenizer(batch_codes, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :]  # CLS token
    embeddings.append(embedding.cpu().numpy())
end = time.time()
print(end - start)  # elapsed seconds

# Stack the per-batch matrices directly; the result keeps the model's numeric
# dtype instead of becoming an object array.
embeddings_np = np.vstack(embeddings)
if option == 1:
    # The next cell loads this exact file name, so it has to be written here.
    np.save(f'embedding_func_{lang}_train_small.npy', embeddings_np)
elif option == 2:
    np.save(f'embedding_class_{lang}.npy', embeddings_np)

In [None]:
import faiss

# Reload the corpus embeddings written by the embedding cell.
if option == 1:
    embedding_file = f'embedding_func_{lang}_train_small.npy'
elif option == 2:
    embedding_file = f'embedding_class_{lang}.npy'
else:
    raise NotImplementedError(f"No embedding file is defined for option={option}")

# allow_pickle=True keeps files from earlier runs (saved as object arrays) loadable.
# Unpickling can execute code, so only load files produced by this notebook.
embeddings_np = np.load(embedding_file, allow_pickle=True)
# faiss works on C-contiguous float32 matrices; this also converts object arrays.
embeddings_np = np.ascontiguousarray(embeddings_np, dtype=np.float32)
print(embeddings_np.shape)

In [None]:
# Build an exact inner-product index over the corpus embeddings. The rows are
# L2-normalised first (normalize_L2 modifies embeddings_np itself), so the
# inner product ranks neighbours by cosine similarity.
dimension = embeddings_np.shape[1]
faiss.normalize_L2(embeddings_np)
index = faiss.IndexFlatIP(dimension)

# Use GPU 0 for the search when faiss reports at least one GPU.
gpu_count = faiss.get_num_gpus()
if gpu_count > 0:
    res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(res, 0, index)

print("indexing...")
index.add(embeddings_np)

In [None]:
# Sanity check: retrieve the nearest corpus entries for one sample of the first project.
first_project_samples = data[0][data_type]
# Sample 5 when the project has that many entries, otherwise its last one
# (a fixed index of 5 raises IndexError for smaller projects).
query_code = first_project_samples[min(5, len(first_project_samples) - 1)]['code']
query_input = tokenizer(query_code, padding=True, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():  # inference only, no autograd graph needed
    query_embedding = model(**query_input).last_hidden_state[:, 0, :].cpu().numpy()

k = 5
distances, indices = index.search(np.array(query_embedding), k)

retrieved_examples = [dataset[i.item()]["code"] for i in indices[0]]
print(retrieved_examples[0])

In [None]:
# Attach the nearest CodeBERT neighbours to every collected sample.
num_examples = 5
# For classes one extra neighbour is requested and the closest one is discarded,
# on the expectation that it is the query itself (queries and corpus come from
# the same split).
# NOTE(review): nothing verifies that the discarded hit really is the query.
num_skipped = 0 if option == 1 else 1
k = num_examples + num_skipped

for i in range(len(data)):
    print(f"project {i + 1}/{len(data)}")
    for j in range(len(data[i][data_type])):
        query_code = data[i][data_type][j]['code']
        query_input = tokenizer(query_code, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():  # inference only, no autograd graph needed
            query_embedding = model(**query_input).last_hidden_state[:, 0, :].cpu().numpy()

        distances, indices = index.search(np.array(query_embedding), k)
        # Slice into a new array: writing the k-1 kept ids back into the
        # length-k row (indices[0] = indices[0, 1:]) raises a broadcast error.
        neighbor_ids = indices[0, num_skipped:]

        retrieved_examples = []
        for ind in neighbor_ids:
            if ind < 0:
                # faiss fills missing results with -1; dataset[-1] would silently
                # return the last row instead.
                continue
            row = dataset[ind.item()]  # one row lookup per hit instead of three
            retrieved_examples.append({'code': row['code'],
                                       'docstring': row['docstring'],
                                       'code_tokens': row['code_tokens']})
        data[i][data_type][j]['CodeBERT'] = retrieved_examples

In [None]:
# Write the collected projects (now also carrying the 'CodeBERT' field) as JSONL,
# one project per line.
output_dir = '.'
output_names = {
    1: f"{lang}-test-(train_small).jsonl",
    2: f"{lang}-class.jsonl",
}
if option in output_names:
    output_path = os.path.join(output_dir, output_names[option])
    with open(output_path, "w") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")
    print(f"Saved to {output_path}")
else:
    # Without this guard `output_path` would be undefined here, or left over
    # from an earlier cell.
    print(f"Nothing saved: no output file is defined for option={option}")