In [7]:
from datasets import load_dataset
import pandas as pd
 

# Load the "test" split of the Rtian/DebugBench dataset.
dataset = load_dataset("Rtian/DebugBench", split="test")

# Bare last expression so the notebook displays the dataset summary.
dataset

Dataset({
    features: ['language', 'constraints', 'solution_explanation', 'solution', 'level', 'slug', 'examples', 'buggy_code', 'bug_explanation', 'release_time', 'subtype', 'category', 'question'],
    num_rows: 4253
})

In [18]:
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings; lengths are measured with len().
chunk_size = 300
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

# Try the splitter on the first record: the question text is split and the
# record's solution is attached to each resulting chunk as metadata.
sample_section = next(iter(dataset))
sample_question = sample_section["question"]
sample_metadata = {"solution": sample_section["solution"]}
chunks = text_splitter.create_documents(
    texts=[sample_question],
    metadatas=[sample_metadata],
)

num_chunks = len(chunks)
print(f"{num_chunks} chunks")
print(chunks)


4 chunks
[Document(page_content='An image is represented by an m x n integer grid image where image[i][j] represents the pixel value of the image.\nYou are also given three integers sr, sc, and color. You should perform a flood fill on the image starting from the pixel image[sr][sc].', metadata={'solution': "class Solution {\npublic:\n    void bfs(int prevcolor,int row,int col,int color,vector<vector<int>>&visited,vector<vector<int>>&image)\n    {\n        // it'll ease the work\n        queue<pair<int,int>> q;\n        q.push({row,col});\n        visited[row][col]=1;\n\n        while(!q.empty())\n        {\n            row = q.front().first;\n            col = q.front().second;\n            q.pop();\n\n            // if img is of the starting color.\n            if(image[row][col]==prevcolor) image[row][col]=color;\n\n            // up\n            if(row-1>=0 && !visited[row-1][col] && image[row-1][col]==prevcolor) q.push({row-1,col}),visited[row-1][col]=1, image[row-1][col]=color;\n

In [28]:
from functools import partial
def chunk_section(section, chunk_size, chunk_overlap):
    """Split one record's question text into chunks.

    Args:
        section: Mapping with a "question" entry (the text to split) and a
            "solution" entry (attached to every chunk as metadata).
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_size``; lengths are measured with ``len``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``.

    Returns:
        A dict with two parallel lists: "question_chunk" holds the text of
        each chunk, and "solution_chunk" holds the solution stored in each
        chunk's metadata (one entry per chunk).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len)
    # Chunk the record that was passed in. Reading the notebook-global
    # `sample_section` here would give every mapped row the chunks of the
    # first record (or a NameError on a kernel where it was never defined).
    chunks = text_splitter.create_documents(
        texts=[section["question"]],
        metadatas=[{"solution": section["solution"]}])
    return {
        "question_chunk": [chunk.page_content for chunk in chunks],
        "solution_chunk": [chunk.metadata["solution"] for chunk in chunks]
    }
# Scale chunking: apply chunk_section to every record with fixed settings.
chunk_every_section = partial(chunk_section, chunk_size=300, chunk_overlap=40)
chunks_ds = dataset.map(chunk_every_section)

# NOTE(review): len() of the mapped dataset counts rows; the printed 4253
# equals num_rows of the source dataset, so this is not a total chunk count.
num_chunks = len(chunks_ds)


def _keep_question_and_solution(batch):
    """Return the batch's "question" and "solution" columns unchanged."""
    return {"question": batch["question"], "solution": batch["solution"]}


splitted = chunks_ds.map(_keep_question_and_solution, batched=True)

# Keep only the "question" and "solution" columns.
# NOTE(review): this also drops "question_chunk"/"solution_chunk", so the
# chunking result is not carried into `splitted` - confirm that is intended.
wanted_columns = {"question", "solution"}
extra_columns = [name for name in splitted.column_names if name not in wanted_columns]
splitted = splitted.remove_columns(extra_columns)

size = len(splitted)
print(f"{size} chunks")
print(splitted[0])


4253 chunks
{'solution': "class Solution {\npublic:\n    void bfs(int prevcolor,int row,int col,int color,vector<vector<int>>&visited,vector<vector<int>>&image)\n    {\n        // it'll ease the work\n        queue<pair<int,int>> q;\n        q.push({row,col});\n        visited[row][col]=1;\n\n        while(!q.empty())\n        {\n            row = q.front().first;\n            col = q.front().second;\n            q.pop();\n\n            // if img is of the starting color.\n            if(image[row][col]==prevcolor) image[row][col]=color;\n\n            // up\n            if(row-1>=0 && !visited[row-1][col] && image[row-1][col]==prevcolor) q.push({row-1,col}),visited[row-1][col]=1, image[row-1][col]=color;\n\n            // right\n            if(col+1<image[0].size() &&  !visited[row][col+1] && image[row][col+1]==prevcolor) q.push({row,col+1}),visited[row][col+1]=1, image[row][col+1]=color;\n\n            //down\n            if(row+1<image.size() && !visited[row+1][col] && image[row+1][

In [30]:
# Build a pandas DataFrame from the `splitted` dataset.
df = pd.DataFrame(data=splitted)


In [32]:


# Write the DataFrame to data.jsonl in JSON Lines form: one record per line.
df.to_json(path_or_buf="data.jsonl", lines=True, orient="records")