# Load dataset

In [17]:
import json

# Location of the ARC Prize 2025 test-challenge file, relative to this notebook.
arc_challenge_file = '../src/arc-prize-2025/arc-agi_test_challenges.json'


# Parse the whole challenge file into a mapping of task id -> task.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open(arc_challenge_file, 'r', encoding='utf-8') as f:
    arc_data = json.load(f)

# Peek at the first (task id, task) pair without copying every item into a list.
next(iter(arc_data.items()))

('00576224',
 {'train': [{'input': [[7, 9], [4, 3]],
    'output': [[7, 9, 7, 9, 7, 9],
     [4, 3, 4, 3, 4, 3],
     [9, 7, 9, 7, 9, 7],
     [3, 4, 3, 4, 3, 4],
     [7, 9, 7, 9, 7, 9],
     [4, 3, 4, 3, 4, 3]]},
   {'input': [[8, 6], [6, 4]],
    'output': [[8, 6, 8, 6, 8, 6],
     [6, 4, 6, 4, 6, 4],
     [6, 8, 6, 8, 6, 8],
     [4, 6, 4, 6, 4, 6],
     [8, 6, 8, 6, 8, 6],
     [6, 4, 6, 4, 6, 4]]}],
  'test': [{'input': [[3, 2], [7, 8]]}]})

# Formatter

In [26]:
class TurnipFormatter:
    """Convert grids of integers to and from a compact text prompt format.

    A grid is a list of rows, each row a list of integers. A grid is rendered
    with the values of a row concatenated without any delimiter and the rows
    joined by ``row_sep``.

    Args:
        inp_prefix: Text placed in front of every rendered input grid.
        out_prefix: Text placed in front of every rendered output grid.
        sep: Separator placed between consecutive training examples.
        row_sep: Separator placed between the rows of a single grid.
    """

    def __init__(self, inp_prefix="Input: ", out_prefix="Output= ", sep="\n\n", row_sep="\n"):
        self.inp_prefix = inp_prefix
        self.out_prefix = out_prefix
        self.sep = sep
        self.row_sep = row_sep

    def array_to_str(self, array):
        """Render a grid as text: one ``row_sep``-separated chunk per row.

        Cell values are concatenated with no delimiter, so the encoding is
        only reversible by ``decode_output`` when every value is a single
        digit.
        """
        return self.row_sep.join("".join(str(el) for el in row) for row in array)

    def fmt_example(self, example):
        """Render one solved example (a dict with 'input' and 'output' grids).

        The output prefix follows the last input row directly; no separator
        is inserted between them.

        Raises:
            KeyError: If ``example`` lacks an 'input' or 'output' entry.
        """
        inp = self.array_to_str(example['input'])
        out = self.array_to_str(example['output'])
        return f"{self.inp_prefix}{inp}{self.out_prefix}{out}"

    def fmt_train(self, train_data):
        """Render every example in ``train_data`` and join them with ``sep``."""
        return self.sep.join(self.fmt_example(ex) for ex in train_data)

    def fmt_query(self, test_input):
        """Render an unsolved input grid, ending with the bare output prefix."""
        inp = self.array_to_str(test_input)
        return f"{self.inp_prefix}{inp}{self.out_prefix}"

    def decode_output(self, output_str):
        """Parse rendered grid text back into a list of rows of ints.

        Whitespace around the whole text and around each individual row is
        ignored, so trailing spaces or carriage returns (CRLF line endings)
        do not break decoding. Rows that are empty after stripping are
        skipped.

        Raises:
            ValueError: If a row contains a character that ``int`` rejects.
        """
        # Strip per row as well: stripping only the whole string would leave
        # stray whitespace inside the text, which int() cannot parse.
        rows = (row.strip() for row in output_str.strip().split(self.row_sep))
        return [[int(char) for char in row] for row in rows if row]

In [27]:
# Build a few-shot prompt for one task: its solved training pairs followed by
# the first test input, which ends with a bare output prefix.
formatter = TurnipFormatter()
task = arc_data["00576224"]

train_str = formatter.fmt_train(task['train'])
query_str = formatter.fmt_query(task['test'][0]['input'])

# The query is attached with the same separator used between training examples.
full_prompt = formatter.sep.join([train_str, query_str])

print(full_prompt)

Input: 79
43Output= 797979
434343
979797
343434
797979
434343

Input: 86
64Output= 868686
646464
686868
464646
868686
646464

Input: 32
78Output= 


In [29]:
# Hard-coded sample text used to exercise the decoder.
model_output = """
326326
748748
263263
874874
326326
748748
"""

decoded = formatter.decode_output(model_output)

# Header first, then one decoded row per line.
print("\n=== Decoded Output ===", *decoded, sep="\n")


=== Decoded Output ===
[3, 2, 6, 3, 2, 6]
[7, 4, 8, 7, 4, 8]
[2, 6, 3, 2, 6, 3]
[8, 7, 4, 8, 7, 4]
[3, 2, 6, 3, 2, 6]
[7, 4, 8, 7, 4, 8]


# Load Model

In [None]:
from unsloth import FastLanguageModel

# Folder that holds the locally stored model weights.
model_weights_folder = "../src/"

# `model` is only bound by this assignment, so the weights folder is what
# has to be passed as `model_name`.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_weights_folder,
    dtype=None,
    load_in_4bit=True,
    local_files_only=True,
    # NOTE(review): `report_to` looks like a trainer option rather than a
    # model-loading option - confirm from_pretrained accepts it.
    report_to=None,
)

To train the model, we need:
1. the model
2. the tokenizer
3. the dataset (as strings)

In [None]:
from unsloth import FastLanguageModel
from unsloth import UnslothTrainingArguments as TrainingArguments
from unsloth import UnslothTrainer as Trainer
from unsloth import is_bfloat16_supported
from unsloth import unsloth_train

FastLanguageModel.for_training(model)

# Mixed-precision flags: bf16 where the hardware supports it, fp16 otherwise.
# They are forwarded to TrainingArguments below.
add_train_args = dict(fp16=not is_bfloat16_supported(), bf16=is_bfloat16_supported())

# NOTE(review): `Dataset`, `dataset` and `max_seq_length` are not defined in
# the visible part of this notebook; they must be provided (e.g. by an
# earlier cell) before this cell can run.
trainer = Trainer(
    model=model,
    # The tokenizer comes from the model-loading cell; TurnipFormatter has no
    # `tokenizer` attribute.
    tokenizer=tokenizer,
    train_dataset=Dataset.from_list(dataset.as_list(formatter)),
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=None,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=60,  # alternative: num_train_epochs = 1
        learning_rate=2e-4,
        **add_train_args,
    ),
)

trainer_stats = unsloth_train(trainer)