In [1]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset, Dataset
import sys
import numpy as np
import pandas as pd
import json

sys.path.append('../')
from formatter.utils.parsing import parse_prompt, parse_code_block

dataset = load_dataset("microsoft/NextCoderDataset", split="train")
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm


{'prompt': 'Rewrite the given c program as per the following instruction.\nTo improve the code:\n1. Replace float with double for the account balance to handle precision issues.\n2. Use a mutex lock for thread safety in `create_account`, `deposit`, `withdraw`, and `check_balance`.\n3. Add a `find_account` function to locate accounts and reduce code duplication.\n4. Include validation in `deposit` and `withdraw` to ensure positive amounts and sufficient balance.\nWrite the entire code and no other text in the response.\n```c\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <pthread.h>\n\n#define MAX_ACCOUNTS 100\n\ntypedef struct {\n    int account_number;\n    char account_holder[100];\n    float balance;\n} Account;\n\nAccount accounts[MAX_ACCOUNTS];\nint account_count = 0;\npthread_mutex_t lock;\n\nvoid create_account(int account_number, const char* holder) {\n    accounts[account_count].account_number = account_number;\n    strcpy(accounts[account_count].accou

In [2]:
# read jsonl file
with open('../data/filenames.jsonl', 'r') as f:
    filenames = [json.loads(line) for line in f]

In [3]:
def parse_dataset_prompt(example):
    return parse_prompt(example["prompt"])
dataset_processed = dataset.map(parse_dataset_prompt, num_proc=10)

In [4]:
filenames_df = pd.DataFrame(filenames).set_index('id').assign(file_lang = lambda x: x.file_path.fillna('').apply(lambda y: y.split('.')[-1]))
filenames_df

Unnamed: 0_level_0,file_path,file_lang
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,src/banking/account_manager.c,c
1,src/banking/account_manager.c,c
2,src/banking/account_manager.c,c
3,src/ln_script.c,c
4,src/ln_script.c,c
...,...,...
321121,src/main/cpp/engine/AudioBuffer.hpp,hpp
323894,src/utils/file_reader.rs,rs
328559,src/main/resources/application.properties,properties
328637,src/audio/buffer.rs,rs


In [5]:
filenames_df.value_counts('file_lang').tail(20)

file_lang
mel                                                 2
bazel                                               1
cla                                                 1
cadence                                             1
f90                                                 1
dist-info                                           1
cfm                                                 1
asn                                                 1
log                                                 1
inf                                                 1
path/to/file                                        1
m4                                                  1
quantum_spin_systems/hyperfin<|fim_suffix|>         1
pm                                                  1
rsx                                                 1
src/main/java/com/example/MaximumXOROfTripletps     1
src/backends/cuda/kernels/nnfusion/kernels/cuda/    1
tpp                                                 1
thrift            

In [6]:
dataset_df = dataset_processed.to_pandas()
dataset_df['completion'] = dataset_df['completion'].str.replace('```c\n# All imports here\n', '', regex=False)
dataset_df.completion.str.count('```').value_counts()

completion
2     368325
3       9360
4       1989
1        744
5        624
6         42
8         18
7          6
10         6
27         3
9          3
12         3
Name: count, dtype: int64

In [7]:
dataset_df.pipe(lambda x :
    x[x.code_block.str.contains(' Overall, ') 
    | x.code_block.str.contains(' Score":')
    | x.code_block.str.contains(' Quality":')
    | x.code_block.str.contains('Provide the code')]
    ).shape

(4108, 6)

In [8]:
df = (
    dataset_df
    .join(filenames_df)
    # some code blocks are not real code blocks, but rather comments
    .pipe(lambda x : x[~x.code_block.str.contains(' Overall, ')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Score":')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Quality":')])
    .pipe(lambda x : x[~x.code_block.str.contains('Provide the code')])
    .pipe(lambda x : x[x.language != ''])
    # some completions have more than one code block
    .pipe(lambda x : x[x.completion.str.count('```') == 2])
    .assign(
        file_lang = lambda x : x.file_lang.str
                .replace('rs', 'rust')
                .replace('py', 'python')
                .replace('kt', 'kotlin')
                .replace('js', 'javascript')
                .replace('jsx', 'javascript')
                .replace('cc', 'cpp')
                .replace('cu', 'cpp')
                .replace('hpp', 'cpp')
                .replace('hh', 'cpp') 
                # some js code is html in reality
                .replace('html', 'javascript')
                .replace('vue', 'javascript')
                .replace('')
        )
    )
df.head()

  .replace('')


Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,file_path,file_lang
0,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,To improve the code:\n1. Replace float with do...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
1,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,Convert account balance from float to double. ...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
2,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,"Switch balance to double, add mutex locks, cre...",#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
3,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,"To improve the code, focus on adding null poin...",#include <inttypes.h>\n#include <stdbool.h>\n#...,c,src/ln_script.c,c
4,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,Add null pointer checks to all functions. Modi...,#include <inttypes.h>\n#include <stdbool.h>\n#...,c,src/ln_script.c,c


In [9]:
# many inconsistencies between completion language and real code block language
# We avoid flagging c/cpp and java/kotlin as different languages
failed_lang = df[(df.language != df.file_lang) & ((df.language != 'c') & (df.file_lang != 'cpp')) & ((df.language != 'kotlin') & (df.file_lang != 'java')) ]
print(failed_lang.shape)
failed_lang.value_counts('language')

(3247, 8)


language
cpp           2050
javascript     597
rust           203
java           158
go             149
python          75
objc            10
bash             2
html             2
typescript       1
Name: count, dtype: int64

In [10]:
failed_lang.sample(10)

Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,file_path,file_lang
258684,Rewrite the given cpp program as per the follo...,```cpp\n// RobustOptSO3.h\n#pragma once\n\n#in...,Rewrite the given cpp program as per the follo...,To transform the original code into the edited...,// RobustOptSO3.h\n #pragma once\n\n #includ...,cpp,src/math/RobustOptSO3.h,h
360696,Rewrite the given cpp program as per the follo...,```cpp\n#ifndef FUNCTIONAL_UTILS_H\n#define FU...,Rewrite the given cpp program as per the follo...,1. Begin by including the <type_traits> header...,#ifndef FUNCTIONAL_UTILS_H\n#define FUNCTIONAL...,cpp,src/utils/functional_utils.h,h
355736,Rewrite the given cpp program as per the follo...,"```cpp\nfrom typing import List, Tuple\n\ndef ...",Rewrite the given cpp program as per the follo...,"Human-like prompt:\n""Can you change the `bucke...","from typing import List, Tuple\n\ndef findMaxi...",cpp,algorithms/math/gap_finder.py,python
326283,Rewrite the given rust program as per the foll...,```rust\nuse std::env;\nuse std::path::Path;\n...,Rewrite the given rust program as per the foll...,"To transform the original code, follow these s...",use std::env;\nuse std::path::Path;\nuse cc::B...,rust,Cargo.toml,toml
328094,Rewrite the given rust program as per the foll...,```rust\nrequire 'ripper'\nrequire 'ripper/sex...,Rewrite the given rust program as per the foll...,Make the RubyParser more efficient by using Ri...,# ruby\n\nrequire 'ripper'\nrequire 'ripper/se...,rust,lib/ruby_parser.rb,rb
251551,Rewrite the given cpp program as per the follo...,"```cpp\n#include ""Python.h""\n#include ""datadog...",Rewrite the given cpp program as per the follo...,Transform the code by:\n\n* Adding necessary h...,"#include ""Python.h""\n#include ""datadog_agent.h...",cpp,src/agent/datadog_agent.c,c
273436,Rewrite the given javascript program as per th...,"```javascript\ndescribe('Reese Admin App', fun...",Rewrite the given javascript program as per th...,"To improve the code, follow these concise step...","describe('Reese Admin App', function () {\n\n ...",javascript,e2e/app.e2e-spec.ts,ts
359289,Rewrite the given cpp program as per the follo...,```cpp\nimport sys\nimport os\nimport io\n\ncl...,Rewrite the given cpp program as per the follo...,1. **Initialization and Input Handling**:\n ...,import sys\nimport os\nimport io\n\nclass Inte...,cpp,src/interpreter/Interpreter.py,python
360158,Rewrite the given cpp program as per the follo...,"```cpp\nclass Product:\n def __init__(self,...",Rewrite the given cpp program as per the follo...,"Human-Like Prompt:\n""Convert the `Store` class...","class Product:\n def __init__(self, id, nam...",cpp,src/business/store.py,python
356385,Rewrite the given cpp program as per the follo...,```cpp\nimport librosa\nimport numpy as np\n\n...,Rewrite the given cpp program as per the follo...,1. Load the audio file using `librosa.load`.\n...,import librosa\nimport numpy as np\n\ndef extr...,cpp,audio_analysis/lib/audio_feature_extractor.py,python


In [11]:
final_df = (df[
    (df.language == df.file_lang) |
    ((df.language == 'c') & (df.file_lang == 'cpp')) |
    ((df.language == 'kotlin') & (df.file_lang == 'java'))]
    # filter out languages with less than 10 samples
    .pipe(lambda x : x[x.language.isin(x.value_counts('language').pipe(lambda y : y[y > 10]).index)])
    # replace c with cpp when file_lang is cpp
    .assign(language = lambda x : np.where((x.file_lang == 'cpp') & (x.language == 'c'), 'cpp', x.language))
    # replace kotlin with java when file_lang is java
    .assign(language = lambda x : np.where((x.file_lang == 'java') & (x.language == 'kotlin'), 'java', x.language))
    .assign(initial_part = lambda x : 'Rewrite the given ' + x.language + ' program as per the following instruction.')
    .assign(
        completion_code_block = lambda x : x.completion.apply(lambda y : parse_code_block(y)['code_block'])
    )
    .drop(columns = ['file_lang', 'prompt', 'completion'])
    .rename(columns={'initial_part' : 'system_prompt'})
    [['language', 'code_block', 'file_path', 'system_prompt','instruction', 'completion_code_block']]
)
print(final_df.shape)
print("Removed rows: ", dataset.num_rows - len(final_df))
final_df.to_parquet('../data/cleaned_dataset.parquet', index=False)
final_df.drop_duplicates(subset='code_block').value_counts('language')

(354118, 6)
Removed rows:  27005


language
cpp           17415
javascript    14753
python        14738
go            14359
rust          13552
java          13433
kotlin        12388
c             11345
Name: count, dtype: int64

In [12]:
dataset_updated = load_dataset('parquet', data_files='../data/cleaned_dataset.parquet')
dataset_updated.push_to_hub("Vokturz/NextCoderDataset-parsed")

Generating train split: 354118 examples [00:01, 300080.34 examples/s]
Creating parquet from Arrow format: 100%|██████████| 71/71 [00:00<00:00, 231.94ba/s]
Creating parquet from Arrow format: 100%|██████████| 71/71 [00:00<00:00, 141.75ba/s]
Creating parquet from Arrow format: 100%|██████████| 71/71 [00:00<00:00, 133.55ba/s]
Creating parquet from Arrow format: 100%|██████████| 71/71 [00:00<00:00, 128.52ba/s]
Creating parquet from Arrow format: 100%|██████████| 71/71 [00:00<00:00, 117.93ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [00:49<00:00,  9.89s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed/commit/1c8fdb22772e9d8c71178fbe75c3d69bf09a37e1', commit_message='Upload dataset', commit_description='', oid='1c8fdb22772e9d8c71178fbe75c3d69bf09a37e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Vokturz/NextCoderDataset-parsed'), pr_revision=None, pr_num=None)