In [1]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset, Dataset
import sys
import numpy as np
import pandas as pd
import json

sys.path.append('../')
from formatter.utils.parsing import parse_prompt, parse_code_block
from formatter.utils.code_validator import validate_code

dataset = load_dataset("microsoft/NextCoderDataset", split="train")
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm


{'prompt': 'Rewrite the given c program as per the following instruction.\nTo improve the code:\n1. Replace float with double for the account balance to handle precision issues.\n2. Use a mutex lock for thread safety in `create_account`, `deposit`, `withdraw`, and `check_balance`.\n3. Add a `find_account` function to locate accounts and reduce code duplication.\n4. Include validation in `deposit` and `withdraw` to ensure positive amounts and sufficient balance.\nWrite the entire code and no other text in the response.\n```c\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <pthread.h>\n\n#define MAX_ACCOUNTS 100\n\ntypedef struct {\n    int account_number;\n    char account_holder[100];\n    float balance;\n} Account;\n\nAccount accounts[MAX_ACCOUNTS];\nint account_count = 0;\npthread_mutex_t lock;\n\nvoid create_account(int account_number, const char* holder) {\n    accounts[account_count].account_number = account_number;\n    strcpy(accounts[account_count].accou

In [2]:
# read jsonl file
with open('../data/filenames.jsonl', 'r') as f:
    filenames = [json.loads(line) for line in f]

In [3]:
def parse_dataset_prompt(example):
    return parse_prompt(example["prompt"])

def validate_code_block(example):
    return validate_code(example['language'], example['code_block'])

dataset_processed = dataset.map(parse_dataset_prompt, num_proc=10)
dataset_processed = dataset_processed.map(validate_code_block, num_proc=10)

In [4]:
filenames_df = pd.DataFrame(filenames).set_index('id').assign(file_lang = lambda x: x.file_path.fillna('').apply(lambda y: y.split('.')[-1]))
filenames_df

Unnamed: 0_level_0,file_path,file_lang
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,src/banking/account_manager.c,c
1,src/banking/account_manager.c,c
2,src/banking/account_manager.c,c
3,src/ln_script.c,c
4,src/ln_script.c,c
...,...,...
321121,src/main/cpp/engine/AudioBuffer.hpp,hpp
323894,src/utils/file_reader.rs,rs
328559,src/main/resources/application.properties,properties
328637,src/audio/buffer.rs,rs


In [5]:
filenames_df.value_counts('file_lang').tail(20)

file_lang
mel                                                 2
bazel                                               1
cla                                                 1
cadence                                             1
f90                                                 1
dist-info                                           1
cfm                                                 1
asn                                                 1
log                                                 1
inf                                                 1
path/to/file                                        1
m4                                                  1
quantum_spin_systems/hyperfin<|fim_suffix|>         1
pm                                                  1
rsx                                                 1
src/main/java/com/example/MaximumXOROfTripletps     1
src/backends/cuda/kernels/nnfusion/kernels/cuda/    1
tpp                                                 1
thrift            

In [6]:
dataset_df = dataset_processed.to_pandas()
dataset_df['completion'] = dataset_df['completion'].str.replace('```c\n# All imports here\n', '', regex=False)
dataset_df['completion'] = dataset_df['completion'].str.replace('// Include .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('// Import .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('// All .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('// Importing .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('// Necessary .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('// No .+\n', '', regex=True)

dataset_df['completion'] = dataset_df['completion'].str.replace('# Import .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('# All .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('# Importing .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('# Include .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('# Necessary .+\n', '', regex=True)
dataset_df['completion'] = dataset_df['completion'].str.replace('# No .+\n', '', regex=True)

# Some completions have more than one code block
dataset_df.completion.str.count('```').value_counts()

completion
2     368325
3       9360
4       1989
1        744
5        624
6         42
8         18
7          6
10         6
27         3
9          3
12         3
Name: count, dtype: int64

In [7]:
dataset_df.pipe(lambda x :
    x[x.code_block.str.contains(' Overall, ') 
    | x.code_block.str.contains(' Score":')
    | x.code_block.str.contains(' Quality":')
    | x.code_block.str.contains('Provide the code')]
    ).shape

(4108, 7)

In [8]:
df = (
    dataset_df
    .join(filenames_df)
    # some code blocks are not real code blocks, but rather comments
    .pipe(lambda x : x[~x.code_block.str.contains(' Overall, ')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Score":')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Quality":')])
    .pipe(lambda x : x[~x.code_block.str.contains('Provide the code')])
    .pipe(lambda x : x[x.language != ''])
    # some completions have more than one code block
    .pipe(lambda x : x[x.completion.str.count('```') == 2])
    .assign(
        file_lang = lambda x : x.file_lang.str
                .replace('rs', 'rust')
                .replace('py', 'python')
                .replace('kt', 'kotlin')
                .replace('js', 'javascript')
                .replace('jsx', 'javascript')
                .replace('cc', 'cpp')
                .replace('cu', 'cpp')
                .replace('hpp', 'cpp')
                .replace('hh', 'cpp') 
                # some js code is html in reality
                .replace('html', 'javascript')
                .replace('vue', 'javascript')
                .replace('')
        )
    )
df.head()

  .replace('')


Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,valid,file_path,file_lang
0,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,To improve the code:\n1. Replace float with do...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,True,src/banking/account_manager.c,c
1,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,Convert account balance from float to double. ...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,True,src/banking/account_manager.c,c
2,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,"Switch balance to double, add mutex locks, cre...",#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,True,src/banking/account_manager.c,c
3,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,"To improve the code, focus on adding null poin...",#include <inttypes.h>\n#include <stdbool.h>\n#...,c,False,src/ln_script.c,c
4,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,Add null pointer checks to all functions. Modi...,#include <inttypes.h>\n#include <stdbool.h>\n#...,c,False,src/ln_script.c,c


In [9]:
# many inconsistencies between completion language and real code block language
# We avoid flagging c/cpp and java/kotlin as different languages
failed_lang = df[(df.language != df.file_lang) & ((df.language != 'c') & (df.file_lang != 'cpp')) & ((df.language != 'kotlin') & (df.file_lang != 'java')) ]
print(failed_lang.shape)
failed_lang.value_counts('language')

(3247, 9)


language
cpp           2050
javascript     597
rust           203
java           158
go             149
python          75
objc            10
bash             2
html             2
typescript       1
Name: count, dtype: int64

In [10]:
failed_lang.sample(10)

Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,valid,file_path,file_lang
364303,Rewrite the given javascript program as per th...,```javascript\nimport matplotlib.pyplot as plt...,Rewrite the given javascript program as per th...,1. Remove the unused `time` import.\n2. Replac...,import matplotlib.pyplot as plt\nimport time\n...,javascript,False,algorithms/geometry/convex_hull/gift_wrapping.py,python
213920,Rewrite the given javascript program as per th...,"```javascript\nimport { lt, mul } from 'biggys...",Rewrite the given javascript program as per th...,Make the code more efficient by using Promise....,// Import necessary modules and types\nimport ...,javascript,False,src/plugins/swap/multi-provider.ts,ts
363268,Rewrite the given javascript program as per th...,```javascript\nclass User:\n def __init__(s...,Rewrite the given javascript program as per th...,"To transform the original code, follow these s...","class User:\n def __init__(self, username):...",javascript,False,src/security/rbac.py,python
357409,Rewrite the given cpp program as per the follo...,```cpp\nimport json\nfrom enum import Enum\nfr...,Rewrite the given cpp program as per the follo...,1. Modify the `trigger_event` method in the `N...,import json\nfrom enum import Enum\nfrom datet...,cpp,False,src/network/core.py,python
282355,Rewrite the given go program as per the follow...,"```go\npackage ecc\n\nimport (\n\t""math/big""\n...",Rewrite the given go program as per the follow...,"To improve the code, focus on the following ke...","package ecc\n\nimport (\n\t""math/big""\n\t""cryp...",go,True,ecc/src/ec_point.rs,rust
349845,Rewrite the given c program as per the followi...,```c\n//\n// Note.h\n// NoteTakingApp\n//\n/...,Rewrite the given c program as per the followi...,To transform the original code to the edited c...,- (void)encodeWithCoder:(NSCoder *)coder {\n ...,objc,False,src/models/Article.m,m
275043,Rewrite the given javascript program as per th...,"```javascript\nimport { Directive, TemplateRef...",Rewrite the given javascript program as per th...,"To transform the original code, follow these s...","import { Directive, TemplateRef, ComponentFact...",javascript,False,src/app/directives/dynamic-menu.directive.ts,ts
360243,Rewrite the given cpp program as per the follo...,"```cpp\nimport re\nfrom typing import List, Di...",Rewrite the given cpp program as per the follo...,Step-by-Step Plan for Code Transformation:\n\n...,"import re\nfrom typing import List, Dict\n\n# ...",cpp,False,src/utils/device_filter.py,python
376943,Rewrite the given rust program as per the foll...,```rust\nfrom typing import List\n\ndef transf...,Rewrite the given rust program as per the foll...,"Human-Like Prompt (Ins3):\n""Hey, can you updat...",from typing import List\n\ndef transform_seque...,rust,False,src/algorithms/sequence_transformer.py,python
365864,Rewrite the given javascript program as per th...,```javascript\nimport datetime\n\nclass Diagno...,Rewrite the given javascript program as per th...,"Human-Like Prompt:\n""Combine the repeated code...",import datetime\n\nclass DiagnosticsEventData:...,javascript,False,src/diagnostics/logger.py,python


In [11]:
processed_df = (df[
    (df.language == df.file_lang) |
    ((df.language == 'c') & (df.file_lang == 'cpp')) |
    ((df.language == 'kotlin') & (df.file_lang == 'java'))]
    # filter out languages with less than 10 samples
    .pipe(lambda x : x[x.language.isin(x.value_counts('language').pipe(lambda y : y[y > 10]).index)])
    # replace c with cpp when file_lang is cpp
    .assign(language = lambda x : np.where((x.file_lang == 'cpp') & (x.language == 'c'), 'cpp', x.language))
    # replace kotlin with java when file_lang is java
    .assign(language = lambda x : np.where((x.file_lang == 'java') & (x.language == 'kotlin'), 'java', x.language))
    .assign(initial_part = lambda x : 'Rewrite the given ' + x.language + ' program as per the following instruction.')
    .assign(
        completion_code_block = lambda x : x.completion.apply(lambda y : parse_code_block(y)['code_block'])
    )
    .drop(columns = ['file_lang', 'prompt', 'completion'])
    .rename(columns={'initial_part' : 'system_prompt', 'valid' : 'is_valid'})
    [['language', 'code_block', 'file_path', 'system_prompt','instruction', 'completion_code_block', 'is_valid']]
)
print(processed_df.shape)


(354118, 7)


In [12]:
# Some code_blocks are not valid language according to tree-sitter 
processed_df.groupby('language').is_valid.value_counts().unstack()

is_valid,False,True
language,Unnamed: 1_level_1,Unnamed: 2_level_1
c,3908,30968
cpp,7423,45612
go,8184,34742
java,3144,45291
javascript,2255,42682
kotlin,2896,35076
python,1968,43120
rust,1182,45667


In [13]:
partial_final_df = processed_df.query('is_valid == True')
print("Removed rows: ", dataset.num_rows - len(partial_final_df))
partial_final_df.to_parquet('../data/partially_cleaned_dataset.parquet', index=False)
partial_final_df.drop_duplicates(subset='code_block').value_counts('language')

Removed rows:  57965


language
cpp           14882
python        14070
javascript    13984
rust          13147
java          12377
go            11586
kotlin        11408
c             10027
Name: count, dtype: int64

In [15]:
dataset_updated = load_dataset('parquet', data_files='../data/partially_cleaned_dataset.parquet')

# This is faster using datasets than pandas
def validate_completion_code_block(example):
    return validate_code(example['language'], example['completion_code_block'])

dataset_updated = dataset_updated.map(validate_completion_code_block, num_proc=10)
dataset_updated_filtered = dataset_updated.filter(lambda x: x["valid"] == True).remove_columns(["valid", "is_valid"])

final_df = dataset_updated_filtered['train'].to_pandas()
print("Removed rows: ", dataset.num_rows - len(final_df))
final_df.to_parquet('../data/cleaned_dataset.parquet', index=False)
final_df.drop_duplicates(subset='code_block').value_counts('language')

Removed rows:  67038


language
cpp           14621
python        13969
javascript    13541
rust          13010
java          12129
go            11269
kotlin        10738
c              9711
Name: count, dtype: int64

In [24]:
dataset_updated_filtered.push_to_hub("Vokturz/NextCoderDataset-parsed")

Creating parquet from Arrow format: 100%|██████████| 63/63 [00:01<00:00, 42.20ba/s]
Creating parquet from Arrow format: 100%|██████████| 63/63 [00:01<00:00, 34.89ba/s]
Creating parquet from Arrow format: 100%|██████████| 63/63 [00:01<00:00, 37.98ba/s]
Creating parquet from Arrow format: 100%|██████████| 63/63 [00:01<00:00, 34.93ba/s]
Creating parquet from Arrow format: 100%|██████████| 63/63 [00:01<00:00, 33.26ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [00:46<00:00,  9.36s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed/commit/15102f5d4b75a7b109c54699b4fde154c6158ca8', commit_message='Upload dataset', commit_description='', oid='15102f5d4b75a7b109c54699b4fde154c6158ca8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Vokturz/NextCoderDataset-parsed'), pr_revision=None, pr_num=None)