## 1. collect requests & responses by file

In [18]:
forget_gap = 9999
lsp_reqs_dir = f"/Users/tannpopo/coding/coding-interfere/strange_identifiers_{forget_gap}_completion_items"
output_file = f"data/strange_identifiers_{forget_gap}_completion_items.json"

import os
import sys
import json

req_map = {}

for lsp_reqs_file in os.listdir(lsp_reqs_dir):
    if lsp_reqs_file.endswith(".json"):
        with open(os.path.join(lsp_reqs_dir, lsp_reqs_file)) as f:
            reqs = json.load(f)
            for req in reqs:
                if req["file_path"] not in req_map:
                    req_map[req["file_path"]] = []
                req_map[req["file_path"]].append(req)

results = {}

for file_path, reqs in req_map.items():
    results[file_path] = []
    for req in reqs:
        trigger_point = (req["strange_identifier"]["end_row"], req["strange_identifier"]["end_col"] - len(req["strange_identifier"]["name"]))
        trigger_byte = req["strange_identifier"]["end_byte"] - len(req["strange_identifier"]["name"])
        completion_items = [('"'+item["detail"]+'"') if "detail" in item else item["completionText"] for item in req["completion_items"]]
        response = "[" + ", ".join(completion_items) + "]"
        results[file_path].append({
            "request_type": "getCompletion",
            "trigger_point": trigger_point,
            "trigger_byte": trigger_byte,
            "response": response
        })

with open(output_file, "w") as f:
    json.dump(results, f, indent=4)

print(f"Output to {output_file}, {len(results)} files")

Output to data/strange_identifiers_9999_completion_items.json, 2808 files


## 2. generate context and completion

In [12]:
def point_in_range(point, start, end):
    return start[0] < point[0] < end[0] or ( start[0] < end[0] and ((start[0] == point[0] and start[1] < point[1]) or (point[0] == end[0] and point[1] < end[1]))) or (start[0] == end[0] and start[1] <= point[1] and point[1] <= end[1])

In [34]:
train_file = f"data/strange_identifiers_{forget_gap}_dot_training_instances.jsonl"
valid_file = f"data/strange_identifiers_{forget_gap}_dot_validation_instances.jsonl"
repo_root = "/Users/tannpopo/coding/coding-interfere/repo_to_mine/"

from tree_sitter import Language, Parser

JA_LANGUAGE = Language("tree_sitter_build/language_set.so", "java")
parser = Parser()
parser.set_language(JA_LANGUAGE)

comp_file = "data/strange_identifiers_9999_completion_items.json"
with open(comp_file) as f:
    comp = json.load(f)

method_decl_query = JA_LANGUAGE.query(
"""
(method_declaration) @method-declaration
""")
valid_cnt = 0
train_cnt = 0
with open(train_file, "w") as f:
    pass
with open(valid_file, "w") as f:
    pass
# file_set = list(comp.keys())
repo_set = set()
for file_path, reqs in comp.items():
    relative_file_path = file_path[len(repo_root):]
    repo_name = relative_file_path.split("/")[0]
    repo_set.add(repo_name)
repo_set = list(repo_set)

import random
random.seed(16)
# valid_file_set = random.sample(file_set, int(len(file_set)*0.1))
valid_repo_set = random.sample(repo_set, int(len(repo_set)*0.1))
for file_path, reqs in comp.items():
    # if file_path in valid_file_set:
    #     output_file = valid_file
    # else:
    #     output_file = train_file
    relative_file_path = file_path[len(repo_root):]
    repo_name = relative_file_path.split("/")[0]
    relative_file = relative_file_path[len(repo_name)+1:]
    print(f"Checking {file_path}")

    if repo_name in valid_repo_set:
        output_file = valid_file
    else:
        output_file = train_file

    with open(file_path) as f:
        code = f.read()
    byte_str = bytes(code, "utf8")
    tree = parser.parse(byte_str)
    method_decls = method_decl_query.captures(tree.root_node)
    _method_decls = []

    for idx, method_decl in enumerate(method_decls):
        if idx == 0:
            _method_decls.append(method_decl)
            continue
        if method_decl[0].end_byte > method_decls[idx-1][0].end_byte:
            _method_decls.append(method_decl)
        
    method_decls = _method_decls

    for method_decl in method_decls:
        method_decl = method_decl[0]
        body = method_decl.child_by_field_name("body")
        if body is None:
            continue

        start_idx = -1
        for idx, req in enumerate(reqs):
            trigger_point = req["trigger_point"]
            if point_in_range(trigger_point, method_decl.start_point, method_decl.end_point):
                start_idx = idx
                break
        
        if start_idx == -1:
            with open(output_file, "a") as f:
                f.write(json.dumps({
                    "repo": repo_name,
                    "file": relative_file,
                    "context": byte_str[:body.start_byte+1].decode("utf8"),
                    "completion": byte_str[body.start_byte+1:method_decl.end_byte].decode("utf8"),
                    "type": "no_api_call",
                })+"\n")
            if output_file == valid_file:
                valid_cnt += 1
            else:
                train_cnt += 1
            continue

        context = byte_str[:body.start_byte+1].decode("utf8")
        last_end_byte = body.start_byte+1
        for idx, req in enumerate(reqs[start_idx:]):
            trigger_point = req["trigger_point"]
            trigger_byte = req["trigger_byte"]
            if point_in_range(trigger_point, method_decl.start_point, method_decl.end_point):
                completion = byte_str[last_end_byte:trigger_byte].decode("utf8")
                completion += f"""<request>LSP::getCompletion()"""
                with open(output_file, "a") as f:
                        f.write(json.dumps({
                        "repo": repo_name,
                        "file": relative_file,
                        "context": context,
                        "completion": completion,
                        "type": "getCompletion:request" if idx == 0 else "getCompletion:generationAndRequest"
                    })+"\n")
                if output_file == valid_file:
                    valid_cnt += 1
                else:
                    train_cnt += 1
                last_end_byte = trigger_byte
                context = byte_str[:trigger_byte].decode("utf8")
                context += completion+f"""<response>{req["response"]}</response></request>"""
            else:
                # completion = byte_str[last_end_byte:method_decl.end_byte].decode("utf8")
                # with open(output_file, "a") as f:
                #     f.write(json.dumps({
                #         "repo": repo_name,
                #         "file": relative_file,
                #         "context": context,
                #         "completion": completion,
                #         "type": "getCompletion:generation"
                #     })+"\n")
                # instance_cnt += 1
                break
    
        completion = byte_str[last_end_byte:method_decl.end_byte].decode("utf8")
        with open(output_file, "a") as f:
            f.write(json.dumps({
                "repo": repo_name,
                "file": relative_file,
                "context": context,
                "completion": completion,
                "type": "getCompletion:generation"
            })+"\n")
            if output_file == valid_file:
                valid_cnt += 1
            else:
                train_cnt += 1

print(f"Output to {output_file}, {train_cnt} train instances, {valid_cnt} valid instances")

TypeError: Population must be a sequence.  For dicts or sets, use sorted(d).