In [11]:
import json
import os

In [3]:
def build_dataset(file_path):
    """Flatten a nested question-answering JSON file into a list of records.

    The file is expected to hold an object with a top-level ``"data"`` list.
    Only the first entry of that list is read. Each item of its
    ``"paragraphs"`` list supplies a ``"context"`` string and a ``"qas"``
    list; every question carries an ``"answers"`` list whose items expose
    ``"text"``. One record is emitted per answer, so a question with several
    answers yields several records and a question with no answers yields none.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of ``{"context": ..., "question": ..., "answer": ...}`` dicts
        in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError, IndexError: If the expected structure is missing.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding="utf-8") as file:
        json_data = json.load(file)

    # Only the first element of "data" is used; later entries are ignored.
    paragraphs = json_data["data"][0]["paragraphs"]

    dataset = []
    for page in paragraphs:
        context = page["context"]
        for qa in page["qas"]:
            question = qa["question"]
            # One output record per answer text.
            dataset.extend(
                {"context": context, "question": question, "answer": ans["text"]}
                for ans in qa["answers"]
            )
    return dataset

In [7]:
# Raw dataset files, one per split.
train_dataset_file_path = "/home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/train_dataset.json"
validation_dataset_file_path = "/home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/validation_dataset.json"
test_dataset_file_path = "/home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/test_dataset.json"

# Flatten every split into context/question/answer records.
train_dataset = build_dataset(file_path=train_dataset_file_path)
validation_dataset = build_dataset(file_path=validation_dataset_file_path)
test_dataset = build_dataset(file_path=test_dataset_file_path)

# Report the number of records obtained for each split.
for split_label, split_records in (
    ("train_dataset", train_dataset),
    ("validation_dataset", validation_dataset),
    ("test_dataset", test_dataset),
):
    print(f"len({split_label}): {len(split_records)}")

len(train_dataset): 854
len(validation_dataset): 107
len(test_dataset): 107


In [8]:
# writing to files

def save_dataset(file_path, dataset):
    """Write ``dataset`` as indented JSON to a path derived from ``file_path``.

    The output path is ``file_path`` with ``_processed`` inserted before the
    final extension, e.g. ``dir/train.json`` -> ``dir/train_processed.json``.
    A path without an extension simply gets ``_processed`` appended.

    On success the destination is printed. Any exception raised while
    deriving the path, serialising, or writing is caught and printed as
    ``Error: ...`` rather than propagated, so the function always returns
    ``None``.

    Args:
        file_path: Path used to derive the output file name.
        dataset: JSON-serialisable object to write.
    """
    try:
        # splitext only splits at the last dot of the final path component, so
        # dots in directory names ("v1.0/train.json") or extra dots in the file
        # name no longer corrupt the derived path.
        root, extension = os.path.splitext(file_path)
        file_path = root + "_processed" + extension
        with open(file_path, "w", encoding="utf-8") as outfile:
            outfile.write(json.dumps(dataset, indent=4))
        print(f"file saved to: {file_path}")
    except Exception as e:
        print(f"Error: {e}")
        
# Persist each flattened split next to the file it was built from.
for split_path, split_records in (
    (train_dataset_file_path, train_dataset),
    (validation_dataset_file_path, validation_dataset),
    (test_dataset_file_path, test_dataset),
):
    save_dataset(file_path=split_path, dataset=split_records)

file saved to: /home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/train_dataset_processed.json
file saved to: /home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/validation_dataset_processed.json
file saved to: /home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/test_dataset_processed.json


In [9]:
def process_dataset(file_path):
    """Convert flat QA records from a JSON file into chat-style message lists.

    The file must contain a JSON list of dicts with ``"context"``,
    ``"question"`` and ``"answer"`` keys. Each record becomes
    ``{"messages": [system, user, assistant]}`` where the system message
    embeds the context, the user message carries the question, and the
    assistant message is the answer.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list with one ``{"messages": [...]}`` dict per input record, in
        input order.
    """
    with open(file_path) as handle:
        records = json.load(handle)

    def to_chat(record):
        # The system prompt literally contains a newline followed by four
        # spaces before "Context:"; that whitespace is part of the emitted text.
        system_prompt = """You are an expert of the Rust language. Please answer question based on the context below.
    Context: {context}""".format(context=record["context"])
        user_prompt = """Question: {question}""".format(question=record["question"])
        return {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": record["answer"]},
            ]
        }

    return [to_chat(record) for record in records]

In [12]:
# Convert the flattened splits to chat-message format and save them.
# NOTE: the path variables are rebound to the "*_processed.json" files here,
# so save_dataset writes "*_processed_processed.json".

train_dataset_file_path = "/home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/train_dataset_processed.json"
validation_dataset_file_path = "/home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/validation_dataset_processed.json"
test_dataset_file_path = "/home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/test_dataset_processed.json"

# Build the chat-style records for every split before anything is written.
train_dataset = process_dataset(file_path=train_dataset_file_path)
validation_dataset = process_dataset(file_path=validation_dataset_file_path)
test_dataset = process_dataset(file_path=test_dataset_file_path)

# Write the converted splits out in train / validation / test order.
for split_path, split_records in (
    (train_dataset_file_path, train_dataset),
    (validation_dataset_file_path, validation_dataset),
    (test_dataset_file_path, test_dataset),
):
    save_dataset(file_path=split_path, dataset=split_records)

file saved to: /home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/train_dataset_processed_processed.json
file saved to: /home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/validation_dataset_processed_processed.json
file saved to: /home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset/test_dataset_processed_processed.json
