In [1]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset, Dataset
import sys
import numpy as np
import pandas as pd
import json

sys.path.append('../')
from formatter.utils.parsing import parse_prompt

dataset = load_dataset("microsoft/NextCoderDataset", split="train")
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm


{'prompt': 'Rewrite the given c program as per the following instruction.\nTo improve the code:\n1. Replace float with double for the account balance to handle precision issues.\n2. Use a mutex lock for thread safety in `create_account`, `deposit`, `withdraw`, and `check_balance`.\n3. Add a `find_account` function to locate accounts and reduce code duplication.\n4. Include validation in `deposit` and `withdraw` to ensure positive amounts and sufficient balance.\nWrite the entire code and no other text in the response.\n```c\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <pthread.h>\n\n#define MAX_ACCOUNTS 100\n\ntypedef struct {\n    int account_number;\n    char account_holder[100];\n    float balance;\n} Account;\n\nAccount accounts[MAX_ACCOUNTS];\nint account_count = 0;\npthread_mutex_t lock;\n\nvoid create_account(int account_number, const char* holder) {\n    accounts[account_count].account_number = account_number;\n    strcpy(accounts[account_count].accou

In [2]:
# read jsonl file
with open('../data/filenames.jsonl', 'r') as f:
    filenames = [json.loads(line) for line in f]

In [3]:
def parse_dataset_prompt(example):
    return parse_prompt(example["prompt"])
dataset_processed = dataset.map(parse_dataset_prompt, num_proc=10)

In [10]:
filenames_df = pd.DataFrame(filenames).set_index('id').assign(file_lang = lambda x: x.file_path.fillna('').apply(lambda y: y.split('.')[-1]))
filenames_df

Unnamed: 0_level_0,file_path,file_lang
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,src/banking/account_manager.c,c
1,src/banking/account_manager.c,c
2,src/banking/account_manager.c,c
3,src/ln_script.c,c
4,src/ln_script.c,c
...,...,...
321121,src/main/cpp/engine/AudioBuffer.hpp,hpp
323894,src/utils/file_reader.rs,rs
328559,src/main/resources/application.properties,properties
328637,src/audio/buffer.rs,rs


In [11]:
filenames_df.value_counts('file_lang').tail(20)

file_lang
mel                                                 2
bazel                                               1
cla                                                 1
cadence                                             1
f90                                                 1
dist-info                                           1
cfm                                                 1
asn                                                 1
log                                                 1
inf                                                 1
path/to/file                                        1
m4                                                  1
quantum_spin_systems/hyperfin<|fim_suffix|>         1
pm                                                  1
rsx                                                 1
src/main/java/com/example/MaximumXOROfTripletps     1
src/backends/cuda/kernels/nnfusion/kernels/cuda/    1
tpp                                                 1
thrift            

In [None]:
df = (
    dataset_processed.to_pandas()
    .join(filenames_df)
    # some code blocks are not real code blocks, but rather comments
    .pipe(lambda x : x[~x.code_block.str.contains(' Overall, ')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Score":')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Quality":')])
    .pipe(lambda x : x[~x.code_block.str.contains('Provide the code')])
    .pipe(lambda x : x[x.language != ''])
    .assign(
        file_lang = lambda x : x.file_lang.str
                .replace('rs', 'rust')
                .replace('py', 'python')
                .replace('kt', 'kotlin')
                .replace('js', 'javascript')
                .replace('jsx', 'javascript')
                .replace('cc', 'cpp')
                .replace('cu', 'cpp')
                .replace('hpp', 'cpp')
                .replace('hh', 'cpp') 
                # some js code is html in reality
                .replace('html', 'javascript')
                .replace('vue', 'javascript')
                .replace('')
        )
    )
df.head()

  .replace('')


Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,file_path,file_lang
0,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,To improve the code:\n1. Replace float with do...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
1,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,Convert account balance from float to double. ...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
2,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,"Switch balance to double, add mutex locks, cre...",#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
3,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,"To improve the code, focus on adding null poin...",#include <inttypes.h>\n#include <stdbool.h>\n#...,c,src/ln_script.c,c
4,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,Add null pointer checks to all functions. Modi...,#include <inttypes.h>\n#include <stdbool.h>\n#...,c,src/ln_script.c,c


In [13]:
# many inconsistencies between completion language and real code block language
# We avoid flagging c/cpp and java/kotlin as different languages
failed_lang = df[(df.language != df.file_lang) & ((df.language != 'c') & (df.file_lang != 'cpp')) & ((df.language != 'kotlin') & (df.file_lang != 'java')) ]
print(failed_lang.shape)
failed_lang.value_counts('language')

(3293, 8)


language
cpp           2074
javascript     614
rust           205
java           161
go             149
python          75
objc            10
bash             2
html             2
typescript       1
Name: count, dtype: int64

In [14]:
failed_lang.sample(10)

Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,file_path,file_lang
355871,Rewrite the given cpp program as per the follo...,"```cpp\nclass Step:\n def __init__(self, no...",Rewrite the given cpp program as per the follo...,Update the `play_chain` method in `PatternChai...,"class Step:\n def __init__(self, note, leng...",cpp,src/music/pattern_chainer.py,python
357171,Rewrite the given cpp program as per the follo...,```cpp\nimport random\nimport time\nimport log...,Rewrite the given cpp program as per the follo...,1. **Integrate Logging:**\n a. Import the `...,import random\nimport time\n\nclass NetworkExc...,cpp,src/network/simulator.py,python
368061,Rewrite the given go program as per the follow...,"```go\nclass Faction:\n def __init__(self, ...",Rewrite the given go program as per the follow...,1. Begin by identifying the areas in the `Fact...,"class Faction:\n def __init__(self, faction...",go,src/game/faction_war.py,python
356561,Rewrite the given cpp program as per the follo...,```cpp\nimport math\n\nclass Element3D:\n d...,Rewrite the given cpp program as per the follo...,Add a caching mechanism for neighbors in the `...,import math\n\nclass Element3D:\n def __ini...,cpp,src/physics/ElementManager.py,python
360288,Rewrite the given cpp program as per the follo...,```cpp\nclass MediaKeys:\n def __init__(sel...,Rewrite the given cpp program as per the follo...,1. Identify the classes in the original code t...,"class MediaKeys:\n def __init__(self, sessi...",cpp,src/media_encryption/decryption_manager.py,python
354690,Rewrite the given cpp program as per the follo...,```cpp\nimport numpy as np\nimport matplotlib....,Rewrite the given cpp program as per the follo...,Step-by-Step Detailed Plan for Code Transforma...,import numpy as np\nimport matplotlib.pyplot a...,cpp,audio_processing/spectrogram_analysis.py,python
362434,Rewrite the given javascript program as per th...,```javascript\n// Define the Book interface\ni...,Rewrite the given javascript program as per th...,Concise Plan:\n1. Initialize the Library class...,// Define the Book interface\ninterface Book {...,javascript,src/library/Library.ts,ts
361608,Rewrite the given javascript program as per th...,"```javascript\nimport { Component, Input, Outp...",Rewrite the given javascript program as per th...,1. Import the FormsModule from @angular/forms ...,"import { Component, Input, Output, EventEmitte...",javascript,src/app/components/task-list/task-list.compone...,ts
355155,Rewrite the given cpp program as per the follo...,```cpp\nimport threading\nimport time\nfrom ty...,Rewrite the given cpp program as per the follo...,1. Identify that the primary improvement area ...,import threading\nimport time\nfrom typing imp...,cpp,src/hardware/device_manager.py,python
375312,Rewrite the given java program as per the foll...,```java\nimport csv\nimport re\nimport os\n\nc...,Rewrite the given java program as per the foll...,Step-by-Step Plan for Code Improvement:\n\n1. ...,import csv\nimport re\n\nclass Role:\n def ...,java,src/main/python/user_management.py,python


In [15]:
final_df = (df[
    (df.language == df.file_lang) |
    ((df.language == 'c') & (df.file_lang == 'cpp')) |
    ((df.language == 'kotlin') & (df.file_lang == 'java'))]
    # filter out languages with less than 10 samples
    .pipe(lambda x : x[x.language.isin(x.value_counts('language').pipe(lambda y : y[y > 10]).index)])
    # replace c with cpp when file_lang is cpp
    .assign(language = lambda x : np.where((x.file_lang == 'cpp') & (x.language == 'c'), 'cpp', x.language))
    # replace kotlin with java when file_lang is java
    .assign(language = lambda x : np.where((x.file_lang == 'java') & (x.language == 'kotlin'), 'java', x.language))
    .assign(initial_part = lambda x : 'Rewrite the given ' + x.language + ' program as per the following instruction.')
    .drop(columns = ['file_lang', 'prompt'])
    .rename(columns={'initial_part' : 'system_prompt'})
)
print(final_df.shape)
print("Removed rows: ", dataset.num_rows - len(final_df))
final_df.to_parquet('../data/cleaned_dataset.parquet', index=False)
final_df.drop_duplicates(subset='code_block').value_counts('language')

(366639, 6)
Removed rows:  14484


language
cpp           18667
javascript    15549
go            15048
python        14805
rust          13925
java          13522
kotlin        12733
c             11713
Name: count, dtype: int64

In [42]:
import re

def get_most_inner_code_block(text: str) -> str | None:
    """
    Extracts the content of the "most inner" code block from a string.
    This is interpreted as the content of the last code block found in the string,
    which effectively captures the deepest nested block if nesting occurs.

    Args:
        text (str): The input string potentially containing code blocks.

    Returns:
        str | None: The content of the most inner code block, or None if no block is found.
    """
    # Regex to find any content between ``` and ```.
    # (?:[a-zA-Z0-9]+\n)? optionally matches a language identifier (e.g., 'python\n').
    # ([\s\S]*?) non-greedily matches any character (including newline)
    # until the next ```.
    pattern = r"```(?:[a-zA-Z0-9]+\n)?([\s\S]*?)```"
    
    # re.DOTALL is crucial to allow '.' to match newline characters.
    matches = re.findall(pattern, text, re.DOTALL)
    
    if matches:
        # The last match in a non-greedy search will correspond to the
        # most deeply nested or the last occurring block.
        return matches[-1].strip()
    return None


In [43]:
get_most_inner_code_block(final_df['completion'].iat[0])

'#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <pthread.h>\n\n#define MAX_ACCOUNTS 100\n\ntypedef struct {\n    int account_number;\n    char account_holder[100];\n    double balance;  // Changed to double to handle float precision issues\n} Account;\n\nAccount accounts[MAX_ACCOUNTS];\nint account_count = 0;\npthread_mutex_t lock;\n\nvoid create_account(int account_number, const char* holder) {\n    pthread_mutex_lock(&lock);  // Locking for thread safety\n    if (account_count < MAX_ACCOUNTS) {\n        accounts[account_count].account_number = account_number;\n        strcpy(accounts[account_count].account_holder, holder);\n        accounts[account_count].balance = 0.0;\n        account_count++;\n    }\n    pthread_mutex_unlock(&lock);  // Unlocking after operation\n}\n\nAccount* find_account(int account_number) {\n    for (int i = 0; i < account_count; i++) {\n        if (accounts[i].account_number == account_number) {\n            return &accounts[i];\n     

In [None]:
d = final_df.copy()
d['completion'] = d['completion'].str.replace('```c\n# All imports here\n', '')
# Remove all content after the last ```
d['completion'] = d['completion'].str.replace(r'(\n```)(?!.*\n```).*', r'\1', regex=True)
d.pipe(lambda x: x[x['completion'].str.count('```') == 3]).iloc[0]

completion       ```c\n#include <stdio.h>\n#include <stdlib.h>\...
system_prompt    Rewrite the given c program as per the followi...
instruction      To improve the temperature monitoring system, ...
code_block       #include <stdio.h>\n#include <stdlib.h>\n\n// ...
language                                                         c
file_path                        src/sensors/temperature_monitor.c
Name: 948, dtype: object

In [62]:
print(d.loc[5184, 'completion'])

```c
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>

// Structure to hold thread data
typedef struct {
  bool* belongs;
  int* counts;
  int* color;
  pthread_mutex_t* mutex;
  int size;
  float xmin;
  float xmax;
  float ymin;
  float ymax;
  int maxIterations;
  int i_begin;
  int i_end;
  int j_begin;
  int j_end;
} threadData;

// Global barrier
pthread_barrier_t barrier;

// Function to generate the Buddhabrot image
void* generate_set(void* data) {
  threadData* td = (threadData*) data;
  printf("Thread sub-image block: cols (%d, %d) to rows (%d, %d)\n", td->j_begin, td->j_end, td->i_begin, td->i_end);

  // Initialize variables
  bool* belongs = td->belongs;
  int* counts = td->counts;
  int* color = td->color;
  pthread_mutex_t* mutex = td->mutex;
  int size = td->size;
  float xmin = td->xmin;
  float xmax = td->xmax;
  float ymin = td->ymin;
  floa

In [48]:
get_most_inner_code_block(d.pipe(lambda x: x[x['completion'].str.count('```') == 3]).iloc[0]['completion'])

'# All imports here'

In [21]:
dataset_updated = load_dataset('parquet', data_files='../data/cleaned_dataset.parquet')
dataset_updated.push_to_hub("Vokturz/NextCoderDataset-parsed")

Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 139.10ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 139.32ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 129.79ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 134.19ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 125.20ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [01:05<00:00, 13.20s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed/commit/0ee169672628594be8d32e69e7211c6b8add8923', commit_message='Upload dataset', commit_description='', oid='0ee169672628594be8d32e69e7211c6b8add8923', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Vokturz/NextCoderDataset-parsed'), pr_revision=None, pr_num=None)