In [3]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset
import sys
import numpy as np

# Make the parent directory importable so the local `formatter` package resolves.
# The entry goes at the FRONT of sys.path: Python <= 3.9 ships a deprecated
# stdlib module that is also named `formatter`, and an appended entry would be
# searched after it, letting the stdlib module shadow the project package.
# The membership check keeps re-running this cell from adding duplicates.
if '../' not in sys.path:
    sys.path.insert(0, '../')
from formatter.utils.parsing import (
    parse_prompt,
    get_code_edit_patches,
    get_file_path,
    get_file_paths_parallel,
    process_dataset_to_jsonl,
)

# Training split of the NextCoder dataset; every later cell reads from `dataset`.
dataset = load_dataset("microsoft/NextCoderDataset", split="train")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Split one training prompt into its labelled parts (see the dict shown below).
# The row is indexed first (`dataset[100]`) so only that record is read;
# depending on the installed `datasets` version, `dataset['prompt']` selects the
# whole column before a single element is picked from it.
parsed = parse_prompt(dataset[100]['prompt'])
parsed

{'initial_part': 'Rewrite the given c program as per the following instruction.',
 'instruction': 'Improve packet sniffer efficiency with a callback function, add thread safety with a mutex, and update protocol checks in the `packet_callback` function. Modify `capture_packets` to use the callback and manage the mutex.\nWrite the entire code and no other text in the response.',
 'code_block': '#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <stdint.h>\n#include <pcap.h>\n#include <arpa/inet.h>\n\n// Structure to hold packet capture statistics\ntypedef struct {\n    int total_packets;\n    int filtered_packets;\n    int unfiltered_packets;\n} packet_stats;\n\n// Function to capture and display packets\nvoid capture_packets(char *interface, char *filter) {\n    pcap_t *handle;\n    char errbuf[PCAP_ERRBUF_SIZE];\n    struct bpf_program fp;\n    bpf_u_int32 mask;\n    bpf_u_int32 net;\n    int num_packets = 0;\n    packet_stats stats = {0, 0, 0};\n\n    // Open the n

In [5]:
# Compare the original code with the completion of the SAME example.
# `parsed` was built from row 100, so the completion is read from row 100 too;
# taking it from any other row would pair two unrelated programs.
# The first and last lines of the completion are dropped before the comparison
# (presumably the Markdown code-fence lines -- TODO confirm against the data).
tuples = get_code_edit_patches(
    parsed['code_block'],
    '\n'.join(dataset[100]['completion'].split('\n')[1:-1]),
)
tuples

[('#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <stdint.h>\n#include <pcap.h>\n#include <arpa/inet.h>\n\n// Structure to hold packet capture statistics\ntypedef struct {\n    int total_packets;\n    int filtered_packets;\n    int unfiltered_packets;\n} packet_stats;\n\n// Function to capture and display packets\nvoid capture_packets(char *interface, char *filter) {\n    pcap_t *handle;\n    char errbuf[PCAP_ERRBUF_SIZE];\n    struct bpf_program fp;\n    bpf_u_int32 mask;\n    bpf_u_int32 net;\n    int num_packets = 0;\n    packet_stats stats = {0, 0, 0};\n\n    // Open the network interface for packet capture\n    if ((handle = pcap_open_live(interface, BUFSIZ, 1, 1000, errbuf)) == NULL) {\n        printf("Error opening interface: %s\\n", errbuf);\n        return;\n    }\n\n    // Get the network address and mask\n    if (pcap_lookupnet(interface, &net, &mask, errbuf) == -1) {\n        printf("Error getting network address and mask: %s\\n", errbuf);\n        p

In [6]:
# Request a file path for the code block of example 2 from the model served at
# `base_url`; `print_assistant_message=True` echoes the assistant reply
# (the `<file_path>...</file_path>` line in the output below).
# The row is indexed first so only that record is read instead of the whole
# 'prompt' column.
# NOTE(review): the ngrok URL is hard-coded and probably short-lived -- update
# it before re-running this cell.
parsed = parse_prompt(dataset[2]['prompt'])
file_path = get_file_path(parsed['code_block'], parsed['language'],
        base_url='https://ea0383f40cbb.ngrok-free.app',
        model="cpatonn/Qwen3-Coder-30B-A3B-Instruct-AWQ",
        print_assistant_message=True)


<file_path>src/banking/account_manager.c</file_path>


In [7]:
def example_parallel_usage(n=4, seed=None,
                           base_url='https://ea0383f40cbb.ngrok-free.app',
                           model="cpatonn/Qwen3-Coder-30B-A3B-Instruct-AWQ"):
    """Sample ``n`` random dataset rows and print the result obtained for each.

    Relies on the notebook-level names ``dataset``, ``parse_prompt`` and
    ``get_file_paths_parallel``.

    Args:
        n: Number of rows to sample. Sampling is with replacement, so the same
            row can appear more than once. Also passed on as ``max_workers``.
        seed: Optional seed for the random generator. ``None`` (the default)
            draws a fresh sample on every call; an integer makes the sample
            reproducible.
        base_url: Endpoint forwarded to ``get_file_paths_parallel``.
        model: Model name forwarded to ``get_file_paths_parallel``.

    Returns:
        None. One line per sampled row is printed.
    """
    rng = np.random.default_rng(seed)
    ids = rng.integers(0, len(dataset), size=n)

    code_blocks = []
    for i in ids:
        row_id = int(i)  # plain int instead of a numpy scalar
        # Index the row first so only this record is read, instead of
        # selecting the whole 'prompt' column on every iteration.
        parsed = parse_prompt(dataset[row_id]['prompt'])
        code_blocks.append((parsed['code_block'], parsed['language'], row_id))

    results = get_file_paths_parallel(code_blocks, max_workers=n,
                                      base_url=base_url, model=model)

    # Assumes `results` is index-aligned with `code_blocks` -- TODO confirm
    # against get_file_paths_parallel.
    for i, (_, language, _) in enumerate(code_blocks):
        print(f"Code block {i+1} ({language}): {results[i]}")
# Demo run on 8 randomly sampled rows; the printed lines below show the
# language and the returned result for each sampled code block.
example_parallel_usage(8)

Code block 1 (kotlin): (339440, 'src/main/kotlin/pcb/uwu/unit/conversion/LengthConverter.kt')
Code block 2 (java): (258884, 'src/main/java/data/lab/elasticsearch/operation/sql/AdvancedEsIndexSqlImp.java')
Code block 3 (java): (272695, 'app/src/main/java/com/example/app/MainActivity.java')
Code block 4 (go): (48930, 'cmd/task_manager/main.go')
Code block 5 (python): (147581, 'src/news_aggregator/news_feed_reader.py')
Code block 6 (go): (54090, 'pkg/monitoring/server_metrics.go')
Code block 7 (rust): (170019, 'src/shapes/collisions.rs')
Code block 8 (rust): (316570, 'src/lib.rs')


In [16]:
# Run the first 240 rows through the pipeline and write them to test.jsonl.
# According to the run log below, file ids already present in test.jsonl are
# reported as skipped rather than added again.
process_dataset_to_jsonl(
    dataset.select(range(240)),
    'test.jsonl',
    base_url='https://ea0383f40cbb.ngrok-free.app',
    model="cpatonn/Qwen3-Coder-30B-A3B-Instruct-AWQ",
)

Found 120 existing file ids in test.jsonl
Processed 128/240 entries. Added: 8, Skipped: 120, Errors: 0
Processed 144/240 entries. Added: 24, Skipped: 120, Errors: 0
Processed 160/240 entries. Added: 40, Skipped: 120, Errors: 0
Processed 176/240 entries. Added: 56, Skipped: 120, Errors: 0
Processed 192/240 entries. Added: 72, Skipped: 120, Errors: 0
Processed 208/240 entries. Added: 88, Skipped: 120, Errors: 0
Processed 224/240 entries. Added: 104, Skipped: 120, Errors: 0
Processed 240/240 entries. Added: 120, Skipped: 120, Errors: 0

Processing complete!
Total entries processed: 240
New entries added: 120
Entries skipped (duplicates): 120
Errors encountered: 0


Dataset({
    features: ['prompt', 'completion'],
    num_rows: 100
})