Explore The Vault dataset. The Vault has 3 sub-datasets: Function, Class and Inline.

### THE VAULT FUNCTION

In [1]:
from datasets import config

# Show the directory that the Hugging Face `datasets` library uses as its cache.
hf_cache_dir = config.HF_DATASETS_CACHE
print(hf_cache_dir)

/Users/dongvanvo/.cache/huggingface/datasets


In [2]:
from datasets import load_dataset

# Load the Java part of the "train/small" split of The Vault function set
# without streaming, then display the resulting DatasetDict.
dataset = load_dataset(
    "Fsoft-AIC/the-vault-function",
    split_set=["train/small"],
    languages=["java"],
    streaming=False,
)
dataset

README.md:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

the-vault-function.py:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

java-00000-of-00002.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

java-00001-of-00002.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train_small split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train_small: Dataset({
        features: ['hexsha', 'repo', 'path', 'license', 'language', 'identifier', 'return_type', 'original_string', 'original_docstring', 'docstring', 'docstring_tokens', 'code', 'code_tokens', 'short_docstring', 'short_docstring_tokens', 'comment', 'parameters', 'docstring_params'],
        num_rows: 351213
    })
})

In [None]:
# Preview the first 100 function samples: name, owning repo, docstring and code.
for shown, record in enumerate(dataset['train_small'], start=1):
    print(f"Method '{record['identifier']}' of project '{record['repo']}'")
    print(f"comment:\n{record['docstring']}")
    print(record['code'])
    # Stop once 100 samples have been printed.
    if shown >= 100:
        break

Process The Vault function: <br>
The Vault function has these data fields: <br>
    hexsha (string): the unique git hash of file <br>
    repo (string): the owner/repo <br>
    path (string): the full path to the original file <br>
    license (list): licenses in the repo <br>
    language (string): the programming language <br>
    identifier (string): the function or method name <br>
    return_type (string): the type returned by the function <br>
    original_string (string): original version of function/class node <br>
    original_docstring (string): the raw string before tokenization or parsing <br>
    code (string): the part of the original that is code <br>
    code_tokens (list): tokenized version of code <br>
    short_docstring (string): short, brief summarization (first line of the docstring) <br>
    short_docstring_tokens (list): tokenized version of `short_docstring` <br>
    docstring (string): the top-level comment or docstring (docstring version without param’s doc, return, exception fields, etc) <br>
    docstring_tokens (list): tokenized version of docstring <br>
    comment (list): list of comments (line) inside the function/class <br>
    parameters (list): List of parameters and its type (type can be None) <br>
    docstring_params (dict): Dictionary of the parsed information from docstring <br>
The preprocessing function should accept these options: <br>
- split_set: the split_set option of The Vault dataset on Hugging Face <br>
- languages option: also an option of The Vault. When several languages are requested, each language is stored in a separate JSON object <br>
- streaming option: the streaming option of The Vault dataset on Hugging Face <br>
- num_proj option: custom option. Keep at most num_proj projects. Cannot be used together with num_method.
- num_method option: custom option. Keep at most num_method methods. Cannot be used together with num_proj.
- Each JSON object contains code, code_tokens, docstring, repo (stored as `project`), and identifier (stored as `name`).

In [None]:
from datasets import load_dataset
from collections import defaultdict
import json
import os

def the_vault_function_to_json(
    split_set="train",
    languages=["java"],
    streaming=True,
    num_proj=None,
    num_method=None,
    write_to_file=False,
    output_dir="data/vault_function_json"
):
    """Collect samples from The Vault function dataset as plain JSON-ready dicts.

    For every requested language the dataset is loaded with ``load_dataset`` and
    iterated in order. Each sample is reduced to a dict with the keys ``code``,
    ``code_tokens``, ``docstring``, ``project`` (the sample's ``repo``) and
    ``name`` (the sample's ``identifier``).

    Args:
        split_set: Split name passed to the loader, e.g. ``"train/small"``.
        languages: Language name or list of language names; each language is
            loaded and collected separately.
        streaming: Forwarded to ``load_dataset``.
        num_proj: If set, stop at the first sample whose project would be the
            ``num_proj + 1``-th distinct project. All samples seen before that
            point are kept. Cannot be combined with ``num_method``.
        num_method: If set, keep at most this many samples per language.
            Cannot be combined with ``num_proj``.
        write_to_file: If true, write one ``<lang>_<split>.jsonl`` file per
            language into ``output_dir`` (one JSON object per line).
        output_dir: Directory for the JSONL files; created when needed.

    Returns:
        dict mapping each language to the list of collected sample dicts.

    Raises:
        AssertionError: If both ``num_proj`` and ``num_method`` are given.
    """
    assert not (num_proj and num_method), "Cannot use both num_proj and num_method"

    # A bare string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = [languages]

    if write_to_file:
        os.makedirs(output_dir, exist_ok=True)

    # The loader is asked for e.g. "train/small" but the returned mapping is keyed
    # "train_small". Keep the two names apart so `split_set` is never overwritten
    # and every language is requested with the caller's original value.
    split_key = split_set.replace("/", "_")

    data_lang = {}
    for lang in languages:
        dataset = load_dataset(
            "Fsoft-AIC/the-vault-function",
            split_set=[split_set],
            languages=[lang],
            streaming=streaming,
        )[split_key]
        print(f"Loaded {lang}, split={split_key}, streaming={streaming}")

        data = []
        proj_counts = defaultdict(int)  # project -> number of collected samples
        for sample in dataset:
            if num_method and len(data) >= num_method:
                break
            project = sample["repo"]
            # Only stop when a *new* project would exceed the limit, so the last
            # admitted project is not cut off after its first sample.
            if num_proj and project not in proj_counts and len(proj_counts) >= num_proj:
                break
            data.append({
                "code": sample["code"],
                "code_tokens": sample["code_tokens"],
                "docstring": sample["docstring"],
                "project": project,
                "name": sample["identifier"],
            })
            proj_counts[project] += 1

        print(f"{lang} - collected {len(data)} samples across {len(proj_counts)} projects")

        if write_to_file:
            output_path = os.path.join(output_dir, f"{lang}_{split_key}.jsonl")
            with open(output_path, "w", encoding="utf-8") as f:
                for item in data:
                    f.write(json.dumps(item) + "\n")
            print(f"Saved to {output_path}")
        data_lang[lang] = data
    return data_lang

In [None]:
# Collect Java samples from the first 100 projects of the small training split.
data_lang = the_vault_function_to_json(split_set="train/small", languages=["java"], num_proj=100)
# Print a per-language summary instead of dumping every collected record.
for lang, items in data_lang.items():
    print(f"{lang}: {len(items)} samples")

### THE VAULT INLINE

In [None]:
from datasets import load_dataset

# Load the Java part of The Vault inline set without streaming,
# then display the resulting DatasetDict.
dataset = load_dataset(
    "Fsoft-AIC/the-vault-inline",
    languages=["java"],
    streaming=False,
)
dataset

In [None]:
# Preview the first 100 inline-comment samples. The code is printed only when the
# identifier differs from the previous sample's, followed by each sample's comment.
prev_method_name = None
for shown, sample in enumerate(dataset['train'], start=1):
    method_name = sample['identifier']
    if method_name != prev_method_name:
        print("\n")
        print(sample['code'])
    print(sample['comment'])
    prev_method_name = method_name
    # Stop once 100 samples have been printed.
    if shown >= 100:
        break

### THE VAULT CLASS

In [None]:
from datasets import load_dataset

# Open the Java part of The Vault class set in streaming mode,
# then display the returned object.
dataset = load_dataset(
    "Fsoft-AIC/the-vault-class",
    languages=['java'],
    streaming=True,
)
dataset

In [None]:
# Preview the first 100 class samples: name, owning repo, docstring and code.
for shown, record in enumerate(dataset['train'], start=1):
    print(f"Class '{record['identifier']}' of project '{record['repo']}'")
    print(f"comment:\n{record['docstring']}")
    print(record['code'])
    # Stop once 100 samples have been printed.
    if shown >= 100:
        break