## Load Dataset

In [None]:
import torch
from datasets import load_dataset
import tiktoken
import re
from datasets import load_dataset

In [None]:
# Load the dataset by its Hugging Face identifier. The result is a DatasetDict
# keyed by split name (the displayed summary below shows a single "train" split).
ds = load_dataset("jtatman/python-code-dataset-500k")

In [33]:
ds

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'system'],
        num_rows: 559515
    })
})

In [34]:
print(ds["train"][0])

{'output': 'Here is an example of a nested loop in Python to print every combination of numbers between 0-9, excluding any combination that contains the number 5 or repeating digits:\n\n```python\nfor i in range(10):  # First digit\n    for j in range(10):  # Second digit\n        for k in range(10):  # Third digit\n            # Checking for the conditions\n            if i != 5 and j != 5 and k != 5 and i != j and i != k and j != k:\n                print(i, j, k)\n```\n\nThis code will generate and print every combination of three digits between 0-9 that do not contain the number 5 and do not have any repeating digits.', 'instruction': 'Create a nested loop to print every combination of numbers between 0-9, excluding any combination that contains the number 5. Additionally, exclude any combination that contains a repeating digit. Implement the solution without using any built-in functions or libraries to check for repeating digits.', 'system': 'You are a Python code analyst, evaluat

## Checking the Number of Unique Values

In [35]:
len(ds["train"].unique("output"))

493723

In [36]:
len(ds["train"].unique("instruction"))

494396

In [37]:
len(ds["train"].unique("system"))

27720

## Checking Missing Values (Null/None)

In [38]:
# For each column, count the rows whose value is None and print the tally.
train_split = ds["train"]
for col in train_split.column_names:
    missing = train_split.filter(lambda row: row[col] is None)
    print(col, missing.num_rows)

output 0
instruction 0
system 0


## Checking Empty Strings ("")

In [39]:
# For each column, count the rows whose value is exactly the empty string.
train_split = ds["train"]
for col in train_split.column_names:
    missing = train_split.filter(lambda row: row[col] == "")
    print(col, missing.num_rows)

output 0
instruction 0
system 22608


## Checking Duplicate Values

In [40]:
# Convert the train split to a pandas DataFrame and count rows that are exact
# duplicates of an earlier row (all columns equal).
rows = ds["train"].to_pandas()
is_repeat = rows.duplicated()
dup_count = is_repeat.sum()
print(dup_count)

26076


### Filling empty strings in the system prompt

In [41]:
def fill_system(example):
    """Give a row a default system prompt when it has none.

    The ``"system"`` field is replaced when it is ``None`` or a string that is
    empty / whitespace-only. Any other value is left untouched.

    Args:
        example: Mutable mapping for one row; must contain a ``"system"`` key.

    Returns:
        The same ``example`` object, modified in place when a default was needed.
    """
    system_prompt = example["system"]
    is_blank_text = isinstance(system_prompt, str) and not system_prompt.strip()

    if system_prompt is None or is_blank_text:
        example["system"] = "You are a helpful coding assistant."

    return example


# Run fill_system over every row of the train split and store the result back.
ds["train"] = ds["train"].map(fill_system)


## Remove Duplicate Values

In [42]:
# Remove exact duplicate rows by round-tripping the train split through pandas.
# preserve_index=False keeps the DataFrame index from being stored as a column.
df = ds["train"].to_pandas().drop_duplicates()
ds["train"] = ds["train"].from_pandas(df, preserve_index=False)

## Filter out outputs with 5 or fewer characters

In [43]:
def _output_long_enough(example):
    """Return True when the stripped output has more than 5 characters."""
    return len(example["output"].strip()) > 5


# Keep only rows whose output carries some real content.
ds["train"] = ds["train"].filter(_output_long_enough)

Filter: 100%|██████████| 533439/533439 [00:02<00:00, 260250.14 examples/s]


## Strip whitespace

In [44]:
def clean(example):
    """Strip leading and trailing whitespace from every string field of a row.

    The fields are taken from the row itself rather than from the global
    ``ds["train"].column_names``, so the function has no hidden dependency on
    notebook state and behaves the same wherever it is called.

    Args:
        example: Mutable mapping for one row. Non-string values are left as-is.

    Returns:
        The same ``example`` object, with its string values stripped in place.
    """
    # Snapshot the items so the mapping is not iterated while being assigned to.
    for key, value in list(example.items()):
        if isinstance(value, str):
            example[key] = value.strip()
    return example

# Run clean over every row of the train split and store the result back.
ds["train"] = ds["train"].map(clean)

Map: 100%|██████████| 533404/533404 [00:39<00:00, 13498.06 examples/s]


## Filter out samples whose instruction and output together have too many characters

In [45]:
# Upper bound (exclusive) on len(instruction) + len(output) for a kept row.
MAX_CHARS = 8000


def _within_char_budget(example):
    """Return True when instruction and output together are under MAX_CHARS."""
    combined_length = len(example["instruction"]) + len(example["output"])
    return combined_length < MAX_CHARS


ds["train"] = ds["train"].filter(_within_char_budget)

Filter: 100%|██████████| 533404/533404 [00:02<00:00, 258922.57 examples/s]


## Shuffle Dataset

In [29]:
# Shuffle the rows with a fixed seed so the resulting order is repeatable.
ds["train"] = ds["train"].shuffle(seed=42)

## Final Dataset Size

In [30]:
len(ds["train"])

526556

#### CONFIGURATION

In [None]:

# Token budget per sample; passed to tokenize_sample as max_length.
MAX_LENGTH = 512
# Name of the dataset split to tokenize (presumably the key into `ds`).
SPLIT = "train"

#### TOKENIZER

In [None]:
# tiktoken encoding registered under the name "gpt2".
enc = tiktoken.get_encoding("gpt2")
# Token id appended after each sample and also used as padding in tokenize_sample.
eos_token = enc.eot_token

### FORMAT FUNCTION

In [None]:
def format_sample(row):
    """Render one dataset row as a docstring-style prompt followed by code.

    The instruction is wrapped in triple double quotes. The code part is the
    contents of the first ```python fenced block found in the output; when no
    such block exists, the whole output is used. Either way it is stripped.

    Args:
        row: Mapping with optional 'instruction' and 'output' entries; a
            missing entry is treated as an empty string.

    Returns:
        The formatted training text as a single string.
    """
    prompt = row.get('instruction', '')
    answer = row.get('output', '')

    # Non-greedy match, so only the first fenced python block is captured.
    fenced = re.search(r'```python\s*(.*?)\s*```', answer, re.DOTALL)
    body = (fenced.group(1) if fenced else answer).strip()

    return f'"""\n{prompt}\n"""\n{body}'

### Tokenizer Function

In [None]:
def tokenize_sample(text, max_length):
    """Encode text into a fixed-length next-token-prediction pair.

    The text is encoded with the module-level ``enc`` and ``eos_token`` is
    appended. The sequence is then cut to ``max_length`` tokens, or
    right-padded with ``eos_token`` up to ``max_length`` when it is shorter.

    Args:
        text: String to encode.
        max_length: Length of the token sequence before the input/target shift.

    Returns:
        A tuple ``(x, y)`` of ``torch.long`` tensors, where ``x`` is the
        sequence without its last token and ``y`` is the sequence without its
        first token.
    """
    ids = enc.encode(text) + [eos_token]

    # Pad first, then slice: a negative shortfall yields no padding, so one
    # expression handles both the too-long and the too-short case.
    shortfall = max_length - len(ids)
    ids = (ids + [eos_token] * shortfall)[:max_length]

    # Shift by one position so y[i] is the token that follows x[i].
    model_input = torch.tensor(ids[:-1], dtype=torch.long)
    model_target = torch.tensor(ids[1:], dtype=torch.long)
    return model_input, model_target

###  BUILD TENSORS

In [None]:
# Tokenize every row of the selected split and stack the per-row tensors into
# X (inputs) and Y (next-token targets), one row per sample.
# `dataset` was previously never defined; bind it to the configured split.
dataset = ds[SPLIT]

inputs = []
targets = []

for row in dataset:
    text = format_sample(row)
    x, y = tokenize_sample(text, MAX_LENGTH)

    inputs.append(x)
    targets.append(y)

# Stack into tensors
X = torch.stack(inputs)
Y = torch.stack(targets)

print("Dataset ready:")
print("X shape:", X.shape)
print("Y shape:", Y.shape)
