In [2]:
from datasets import load_dataset
import pandas as pd

# Hub identifier of the code-instruction corpus used throughout this notebook.
dataset_name = "iamtarun/code_instructions_120k_alpaca"

# Download (or reuse the local cache of) the dataset from the Hugging Face Hub.
dataset = load_dataset(dataset_name)

# Summarise what was loaded: splits, column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 121959
    })
})


In [3]:
# Inspect one raw record (index 0 of the train split) to see every field in full.
print(dataset["train"][0])

{'instruction': 'Create a function to calculate the sum of a sequence of integers.', 'input': '[1, 2, 3, 4, 5]', 'output': '# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum', 'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a function to calculate the sum of a sequence of integers.\n\n### Input:\n[1, 2, 3, 4, 5]\n\n### Response:\n# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum'}


In [4]:
# Materialise the train split as a pandas DataFrame for tabular inspection.
df = dataset["train"].to_pandas()

# Preview the top five rows (rendered as the cell's rich output).
df.head(5)

Unnamed: 0,instruction,input,output,prompt
0,Create a function to calculate the sum of a se...,"[1, 2, 3, 4, 5]",# Python code\ndef sum_sequence(sequence):\n ...,Below is an instruction that describes a task....
1,Develop a function that will add two strings,"str1 = ""Hello ""\nstr2 = ""world""","def add_strings(str1, str2):\n """"""This func...",Below is an instruction that describes a task....
2,Design a data structure in C++ to store inform...,,#include <map>\n#include <string>\n\nclass Gro...,Below is an instruction that describes a task....
3,Implement a sorting algorithm to sort a given ...,"[3, 1, 4, 5, 9, 0]",def bubble_sort(arr):\n n = len(arr)\n \n ...,Below is an instruction that describes a task....
4,Design a Swift application for tracking expens...,Not applicable,import UIKit\n\nclass ExpenseViewController: U...,Below is an instruction that describes a task....


In [6]:
# Number of examples to keep for this experiment.
SUBSET_SIZE = 2000

# Drop rows whose response is empty. Truthiness is used instead of len() so a
# missing (None) output is treated as empty rather than raising a TypeError.
non_empty = dataset['train'].filter(lambda example: bool(example['output']))

# Take the first SUBSET_SIZE surviving rows, capped at how many are actually
# available so select() is never handed out-of-range indices.
subset = non_empty.select(range(min(SUBSET_SIZE, len(non_empty))))

print(f"Using a subset of {len(subset)} examples.")

Filter:   0%|          | 0/121959 [00:00<?, ? examples/s]

Using a subset of 2000 examples.


In [7]:
# Split the subset into 90% training and 10% validation.
# train_test_split shuffles before splitting, so a fixed seed is passed to make
# the partition identical on every Restart & Run All.
processed_dataset = subset.train_test_split(test_size=0.1, seed=42)

print(processed_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 200
    })
})


In [8]:
# Save the processed dataset to the sibling 'data' directory. The path is
# relative ('../data/...'), not the absolute '/data', so it is defined once and
# reused in the message to keep the log truthful about where the files went.
output_dir = "../data/code-explainer-dataset"
processed_dataset.save_to_disk(output_dir)

print(f"Processed dataset saved locally to '{output_dir}'")

Saving the dataset (0/1 shards):   0%|          | 0/1800 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

Processed dataset saved locally to '/data/code-explainer-dataset'
