In [1]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset, Dataset
import sys
import numpy as np
import pandas as pd
import json

sys.path.append('../')
from formatter.utils.parsing import parse_prompt

dataset = load_dataset("microsoft/NextCoderDataset", split="train")
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm


{'prompt': 'Rewrite the given c program as per the following instruction.\nTo improve the code:\n1. Replace float with double for the account balance to handle precision issues.\n2. Use a mutex lock for thread safety in `create_account`, `deposit`, `withdraw`, and `check_balance`.\n3. Add a `find_account` function to locate accounts and reduce code duplication.\n4. Include validation in `deposit` and `withdraw` to ensure positive amounts and sufficient balance.\nWrite the entire code and no other text in the response.\n```c\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <pthread.h>\n\n#define MAX_ACCOUNTS 100\n\ntypedef struct {\n    int account_number;\n    char account_holder[100];\n    float balance;\n} Account;\n\nAccount accounts[MAX_ACCOUNTS];\nint account_count = 0;\npthread_mutex_t lock;\n\nvoid create_account(int account_number, const char* holder) {\n    accounts[account_count].account_number = account_number;\n    strcpy(accounts[account_count].accou

In [2]:
# read jsonl file
with open('../data/filenames.jsonl', 'r') as f:
    filenames = [json.loads(line) for line in f]

In [3]:
def parse_dataset_prompt(example):
    return parse_prompt(example["prompt"])
dataset_processed = dataset.map(parse_dataset_prompt, num_proc=10)

In [4]:
filenames_df = pd.DataFrame(filenames).set_index('id').assign(file_lang = lambda x: x.file_path.fillna('').apply(lambda y: y.split('.')[-1]))
filenames_df

Unnamed: 0_level_0,file_path,file_lang
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,src/banking/account_manager.c,c
1,src/banking/account_manager.c,c
2,src/banking/account_manager.c,c
3,src/ln_script.c,c
4,src/ln_script.c,c
...,...,...
321121,src/main/cpp/engine/AudioBuffer.hpp,hpp
323894,src/utils/file_reader.rs,rs
328559,src/main/resources/application.properties,properties
328637,src/audio/buffer.rs,rs


In [5]:
filenames_df.value_counts('file_lang').tail(20)

file_lang
mel                                                 2
bazel                                               1
cla                                                 1
cadence                                             1
f90                                                 1
dist-info                                           1
cfm                                                 1
asn                                                 1
log                                                 1
inf                                                 1
path/to/file                                        1
m4                                                  1
quantum_spin_systems/hyperfin<|fim_suffix|>         1
pm                                                  1
rsx                                                 1
src/main/java/com/example/MaximumXOROfTripletps     1
src/backends/cuda/kernels/nnfusion/kernels/cuda/    1
tpp                                                 1
thrift            

In [6]:
df = (
    dataset_processed.to_pandas()
    .join(filenames_df)
    # some code blocks are not real code blocks, but rather comments
    .pipe(lambda x : x[~x.code_block.str.contains(' Overall, ')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Score":')])
    .pipe(lambda x : x[~x.code_block.str.contains(' Quality":')])
    .pipe(lambda x : x[~x.code_block.str.contains('Provide the code')])
    .pipe(lambda x : x[x.language != ''])
    .assign(
        file_lang = lambda x : x.file_lang.str
                .replace('rs', 'rust')
                .replace('py', 'python')
                .replace('kt', 'kotlin')
                .replace('js', 'javascript')
                .replace('jsx', 'javascript')
                .replace('cc', 'cpp')
                .replace('cu', 'cpp')
                .replace('hpp', 'cpp')
                .replace('hh', 'cpp') 
                # some js code is html in reality
                .replace('html', 'javascript')
                .replace('vue', 'javascript')
                .replace('')
        )
    )
df.head()

  .replace('')


Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,file_path,file_lang
0,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,To improve the code:\n1. Replace float with do...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
1,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,Convert account balance from float to double. ...,#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
2,Rewrite the given c program as per the followi...,```c\n#include <stdio.h>\n#include <stdlib.h>\...,Rewrite the given c program as per the followi...,"Switch balance to double, add mutex locks, cre...",#include <stdio.h>\n#include <stdlib.h>\n#incl...,c,src/banking/account_manager.c,c
3,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,"To improve the code, focus on adding null poin...",#include <inttypes.h>\n#include <stdbool.h>\n#...,c,src/ln_script.c,c
4,Rewrite the given c program as per the followi...,```c\n#include <inttypes.h>\n#include <stdbool...,Rewrite the given c program as per the followi...,Add null pointer checks to all functions. Modi...,#include <inttypes.h>\n#include <stdbool.h>\n#...,c,src/ln_script.c,c


In [7]:
# many inconsistencies between completion language and real code block language
# We avoid flagging c/cpp and java/kotlin as different languages
failed_lang = df[(df.language != df.file_lang) & ((df.language != 'c') & (df.file_lang != 'cpp')) & ((df.language != 'kotlin') & (df.file_lang != 'java')) ]
print(failed_lang.shape)
failed_lang.value_counts('language')

(3293, 8)


language
cpp           2074
javascript     614
rust           205
java           161
go             149
python          75
objc            10
bash             2
html             2
typescript       1
Name: count, dtype: int64

In [8]:
failed_lang.sample(10)

Unnamed: 0,prompt,completion,initial_part,instruction,code_block,language,file_path,file_lang
376169,Rewrite the given java program as per the foll...,```java\nimport struct\n\ndef serialize_object...,Rewrite the given java program as per the foll...,Refactor the serialization and deserialization...,import struct\n\ndef serialize_objects(objects...,java,src/utils/serialization.py,python
236423,Rewrite the given cpp program as per the follo...,"```cpp\n#include ""wdog.h""\n#include <include/d...",Rewrite the given cpp program as per the follo...,Make the watchdog timer initialization and upd...,"#include ""wdog.h""\n#include <include/derivativ...",cpp,drivers/src/wdog/wdog.c,c
181781,Rewrite the given javascript program as per th...,"```javascript\n""use strict"";\n\nvar __extends ...",Rewrite the given javascript program as per th...,Combine the click and keydown event listeners ...,"""use strict"";\n\nvar __extends = this && this....",javascript,src/components/ModalWrapper.tsx,tsx
358494,Rewrite the given cpp program as per the follo...,"```cpp\nimport json\nfrom typing import List, ...",Rewrite the given cpp program as per the follo...,To transform the original code into the edited...,"import json\nfrom typing import List, Dict\n\n...",cpp,src/services/email_service.py,python
179450,Rewrite the given javascript program as per th...,"```javascript\n""use strict"";\nvar __extends = ...",Rewrite the given javascript program as per th...,Make the code more efficient and maintainable ...,"""use strict"";\nvar __extends = (this && this._...",javascript,src/css/elements/textarea-element-container.ts,ts
363503,Rewrite the given javascript program as per th...,```javascript\n-- SQL script to create the req...,Rewrite the given javascript program as per th...,Make sure the `username` and `email` columns i...,-- SQL script to create the required tables an...,javascript,src/database/migrations/001_initial_setup.sql,sql
355972,Rewrite the given cpp program as per the follo...,```cpp\ndef repeated_substring_pattern(s):\n ...,Rewrite the given cpp program as per the follo...,1. Add a condition to handle single character ...,def repeated_substring_pattern(s):\n # Func...,cpp,algorithms/string/repeated_substring_pattern.py,python
358792,Rewrite the given cpp program as per the follo...,```cpp\nimport numpy as np\n\nclass MetricMana...,Rewrite the given cpp program as per the follo...,1. Import numpy at the start of the script.\n2...,class MetricManager:\n def __init__(self):\...,cpp,src/metrics/MetricManager.py,python
359626,Rewrite the given cpp program as per the follo...,```cpp\nclass DNAAnalyzer:\n def __init__(s...,Rewrite the given cpp program as per the follo...,1. Use `Counter` from `collections` to simplif...,"class DNAAnalyzer:\n def __init__(self, seq...",cpp,src/bioinformatics/dna_analyzer.py,python
355295,Rewrite the given cpp program as per the follo...,```cpp\nimport numpy as np\nfrom collections i...,Rewrite the given cpp program as per the follo...,Import `heapq` for efficient polygon reduction...,import numpy as np\nfrom collections import de...,cpp,src/core/mesh_processing.py,python


In [16]:
final_df = (df[
    (df.language == df.file_lang) |
    ((df.language == 'c') & (df.file_lang == 'cpp')) |
    ((df.language == 'kotlin') & (df.file_lang == 'java'))]
    # filter out languages with less than 10 samples
    .pipe(lambda x : x[x.language.isin(x.value_counts('language').pipe(lambda y : y[y > 10]).index)])
    # replace c with cpp when file_lang is cpp
    .assign(language = lambda x : np.where((x.file_lang == 'cpp') & (x.language == 'c'), 'cpp', x.language))
    # replace kotlin with java when file_lang is java
    .assign(language = lambda x : np.where((x.file_lang == 'java') & (x.language == 'kotlin'), 'java', x.language))
    .assign(initial_part = lambda x : 'Rewrite the given ' + x.language + ' program as per the following instruction.')
    .drop(columns = ['file_lang', 'prompt'])
    .rename(columns={'initial_part' : 'system_prompt'})
)
print(final_df.shape)
print("Removed rows: ", dataset.num_rows - len(final_df))
final_df.to_parquet('../data/cleaned_dataset.parquet', index=False)
final_df.drop_duplicates(subset='code_block').value_counts('language')

(366639, 6)
Removed rows:  14484


language
cpp           18667
javascript    15549
go            15048
python        14805
rust          13925
java          13522
kotlin        12733
c             11713
Name: count, dtype: int64

In [21]:
dataset_updated = load_dataset('parquet', data_files='../data/cleaned_dataset.parquet')
dataset_updated.push_to_hub("Vokturz/NextCoderDataset-parsed")

Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 139.10ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 139.32ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 129.79ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 134.19ba/s]
Creating parquet from Arrow format: 100%|██████████| 74/74 [00:00<00:00, 125.20ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [01:05<00:00, 13.20s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed/commit/0ee169672628594be8d32e69e7211c6b8add8923', commit_message='Upload dataset', commit_description='', oid='0ee169672628594be8d32e69e7211c6b8add8923', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Vokturz/NextCoderDataset-parsed', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Vokturz/NextCoderDataset-parsed'), pr_revision=None, pr_num=None)