In [1]:
%env CUDA_VISIBLE_DEVICES=1
%env TOKENIZERS_PARALLELISM=false

env: CUDA_VISIBLE_DEVICES=1
env: TOKENIZERS_PARALLELISM=false


In [2]:
# Root of the project checkout; every other path in this notebook is derived from it.
BASE_PATH = "/home/stepan/kaggle-arc-agi"
# Local model directory, passed as `model_name` when the model is loaded.
MODEL_ID = f"{BASE_PATH}/models/llama-3_2-3b-it"
# Upper bound on the number of tokens generated per prediction.
MAX_NEW_TOKENS = 2048
# Sequence budget left for the prompt once room for generation is reserved
# (32768 is presumably the context length being targeted -- TODO confirm).
MAX_SEQ_LENGTH = 32768 - MAX_NEW_TOKENS

In [3]:
import sys

sys.path.append(BASE_PATH)
sys.path.append(f"{BASE_PATH}/scripts")

In [4]:
import json

import torch  # type: ignore
import numpy as np  # type: ignore

from datasets import DatasetDict, Dataset  # type: ignore

from unsloth import FastLanguageModel  # type: ignore

from tqdm.auto import tqdm  # type: ignore

from logger import get_logger  # type: ignore
import train_utils  # type: ignore
import data_utils  # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [5]:
log = get_logger(f"{BASE_PATH}/logs/llama-3_2-3b-it", "arc-agi")

In [6]:
def get_model_tokenizer(dtype=None, load_in_4bit=True):
    """Load the model and tokenizer stored at ``MODEL_ID`` through Unsloth.

    Args:
        dtype: Forwarded unchanged as ``dtype`` to ``FastLanguageModel.from_pretrained``.
        load_in_4bit: Forwarded unchanged as ``load_in_4bit``.

    Returns:
        The ``(model, tokenizer)`` pair produced by ``from_pretrained``.
    """
    load_options = {
        "model_name": MODEL_ID,
        "max_seq_length": MAX_SEQ_LENGTH,
        "dtype": dtype,
        "load_in_4bit": load_in_4bit,
        "attn_implementation": "flash_attention_2",
        "device_map": "auto",
        # NOTE(review): key 0 presumably refers to the first GPU visible to this
        # process (see CUDA_VISIBLE_DEVICES) -- confirm against the loader docs.
        "max_memory": {0: "23GiB", "cpu": "16GiB"},
    }
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(**load_options)

    return loaded_model, loaded_tokenizer

In [7]:
def eval(f):
    """Decorator that switches the model to Unsloth inference mode before calling ``f``.

    The decorated function must take ``model`` and ``tokenizer`` as its first two
    positional arguments; any further arguments are passed through untouched.

    Note: this name shadows the builtin ``eval`` for the rest of the notebook.

    Args:
        f: Callable of the form ``f(model, tokenizer, *args, **kwargs)``.

    Returns:
        A wrapper with the same call signature that returns whatever ``f`` returns.
    """
    import functools  # local import so this cell stays self-contained

    @functools.wraps(f)  # keep the wrapped function's __name__/__doc__ for introspection
    def wrapper(model, tokenizer, *args, **kwargs):
        FastLanguageModel.for_inference(model)
        return f(model, tokenizer, *args, **kwargs)

    return wrapper

In [8]:
model, tokenizer = get_model_tokenizer()

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.679 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.9.post3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [9]:
def prepare_inputs(dct, prepare_solution=False):
    """Render a grid, or an input/output example, as tagged plain text.

    Rows are joined with newlines and the cells of a row with single spaces.

    Args:
        dct: When ``prepare_solution`` is true, the grid itself (an iterable of
            rows). Otherwise a mapping with an ``"input"`` grid and an optional
            ``"output"`` grid.
        prepare_solution: Selects the solution-only rendering.

    Returns:
        ``<output>...</output>`` for a solution; otherwise an ``<input>`` block
        followed by an ``<output>`` block, whose body is empty when the mapping
        has no ``"output"`` key.
    """

    def render(grid):
        return "\n".join(" ".join(str(cell) for cell in row) for row in grid)

    if prepare_solution:
        return f"<output>\n{render(dct)}\n</output>"

    input_str = render(dct["input"])
    output_str = render(dct["output"]) if "output" in dct else ""
    return f"<input>\n{input_str}\n</input>\n\n<output>\n{output_str}\n</output>"

In [10]:
def to_augmentation_dataset(data):
    """Expand every challenge into leave-one-out rows, one per training example.

    For each training example of a challenge, a row is emitted in which that
    example's input becomes the test input, its output becomes the solution and
    the remaining training examples stay as demonstrations.

    Args:
        data: Mapping of challenge id to a dict holding a ``"train"`` list of
            ``{"input": ..., "output": ...}`` examples.

    Returns:
        A ``Dataset`` with columns ``id``, ``challenge`` (keys ``train``,
        ``test`` and ``order``, the index of the held-out example) and ``solution``.
    """
    ids, challenges, solutions = [], [], []

    for challenge_id, challenge_data in data.items():
        demonstrations = challenge_data["train"]
        for held_out, task in enumerate(demonstrations):
            remaining = demonstrations[:held_out] + demonstrations[held_out + 1 :]
            ids.append(challenge_id)
            challenges.append({"train": remaining, "test": {"input": task["input"]}, "order": held_out})
            solutions.append(task["output"])

    return Dataset.from_dict({"id": ids, "challenge": challenges, "solution": solutions})

In [11]:
def create_chat(challenge, solution=None):
    """Build the chat message list for one challenge.

    Args:
        challenge: Mapping with a ``"train"`` list of examples and a ``"test"``
            example, each rendered with ``prepare_inputs``.
        solution: Optional solution grid. When truthy, it is appended as the
            assistant turn.

    Returns:
        A list of ``{"role", "content"}`` dicts: system, user and, if a solution
        was given, assistant.
    """
    demonstrations = [prepare_inputs(example) for example in challenge["train"]]
    prompt = data_utils.BASIC_PROMPT.format(
        training_data="\n\n".join(demonstrations),
        input_test_data=prepare_inputs(challenge["test"]),
    )

    system_turn = {"role": "system", "content": data_utils.SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": prompt}

    if not solution:
        return [system_turn, user_turn]

    assistant_turn = {
        "role": "assistant",
        "content": prepare_inputs(solution, prepare_solution=True),
    }
    return [system_turn, user_turn, assistant_turn]

In [12]:
def prepare_dataset(tokenizer, base_path=None, final_training=False, create_chat_func=create_chat):
    """Load the ARC JSON files and build chat-formatted train/test/predict splits.

    Each split is produced by ``to_augmentation_dataset`` and then extended with
    two columns: ``messages`` (the chat built by ``create_chat_func``) and
    ``texts`` (that chat rendered through the tokenizer's chat template).

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        base_path: Directory containing the ``arc-prize-2024`` folder. It is
            interpolated into the file paths as-is, so callers should pass it.
        final_training: When true, the evaluation split is merged into the
            training split (shuffled with seed 42) and no ``"test"`` split is returned.
        create_chat_func: Callable ``(challenge, solution) -> messages``.

    Returns:
        A ``DatasetDict`` with ``train``, ``test`` and ``predict`` splits, or only
        ``train`` and ``predict`` when ``final_training`` is true.
    """
    # Load all datasets
    training_challenges = data_utils.load_data(f"{base_path}/arc-prize-2024/arc-agi_training_challenges.json")
    # NOTE(review): the two *_solutions files are loaded but never used below.
    training_solutions = data_utils.load_data(f"{base_path}/arc-prize-2024/arc-agi_training_solutions.json")
    evaluation_challenges = data_utils.load_data(f"{base_path}/arc-prize-2024/arc-agi_evaluation_challenges.json")
    evaluation_solutions = data_utils.load_data(f"{base_path}/arc-prize-2024/arc-agi_evaluation_solutions.json")
    test_challenges = data_utils.load_data(f"{base_path}/arc-prize-2024/arc-agi_test_challenges.json")

    train_dataset = to_augmentation_dataset(training_challenges)
    eval_dataset = to_augmentation_dataset(evaluation_challenges)
    pred_dataset = to_augmentation_dataset(test_challenges)

    def process_dataset(examples, solutions=None):
        # Create messages for each challenge-solution pair
        challenges = examples["challenge"]
        if solutions is None:
            solutions = [None] * len(challenges)
        chats = [create_chat_func(challenge, solution) for challenge, solution in zip(challenges, solutions)]

        # Apply chat template to each message
        texts = [tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) for chat in chats]

        return {"texts": texts, "messages": chats}

    def process_with_solutions(examples):
        # With batched=True the mapper receives one batch at a time, so the
        # solutions must come from the same batch. Zipping against the whole
        # dataset column pairs every batch after the first with the solutions
        # of the first rows instead of its own.
        return process_dataset(examples, examples["solution"])

    pred_dataset = pred_dataset.map(lambda x: process_dataset(x), batched=True)
    train_dataset = train_dataset.map(process_with_solutions, batched=True)
    eval_dataset = eval_dataset.map(process_with_solutions, batched=True)

    if final_training:  # if final training, we need to add the validation dataset to the training dataset
        train_dataset = data_utils.concatenate_datasets([train_dataset, eval_dataset]).shuffle(seed=42)
        return DatasetDict(
            {
                "train": train_dataset,
                "predict": pred_dataset,
            }
        )

    dataset = DatasetDict(
        {
            "train": train_dataset,
            "test": eval_dataset,
            "predict": pred_dataset,
        }
    )

    return dataset

In [13]:
dataset = prepare_dataset(tokenizer, base_path=BASE_PATH, create_chat_func=create_chat)
dataset

Map: 100%|██████████| 323/323 [00:00<00:00, 1864.40 examples/s]
Map: 100%|██████████| 1302/1302 [00:00<00:00, 1424.12 examples/s]
Map: 100%|██████████| 1363/1363 [00:01<00:00, 1030.76 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 1302
    })
    test: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 1363
    })
    predict: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 323
    })
})

In [14]:
def check_challenge_order_consistency(dataset):
    """Report challenge ids whose ``order`` values are not exactly ``0..n-1``.

    Args:
        dataset: Iterable of rows with an ``"id"`` and a ``"challenge"`` mapping
            that carries an ``"order"`` value.

    Prints one line per inconsistent challenge id (listing the missing order
    values), or a single confirmation line when every id is consistent.
    Returns nothing.
    """
    seen_orders = {}
    for row in dataset:
        seen_orders.setdefault(row["id"], set()).add(row["challenge"]["order"])

    # For n distinct order values, the expected set is {0, ..., n-1}.
    problems = [
        f"Challenge ID {challenge_id}: Missing orders {set(range(len(found))) - found}"
        for challenge_id, found in seen_orders.items()
        if found != set(range(len(found)))
    ]

    if not problems:
        print("All challenge IDs have consistent order values.")
        return

    print("Inconsistencies found:")
    for problem in problems:
        print(problem)

In [15]:
for split in ['train', 'test', 'predict']:
    check_challenge_order_consistency(dataset[split])

All challenge IDs have consistent order values.
All challenge IDs have consistent order values.
All challenge IDs have consistent order values.


In [16]:
def generate_with_temp(model, inputs, temperature):
    """Sample a continuation from ``model`` at the given temperature.

    Args:
        model: Object exposing ``generate``.
        inputs: Mapping of keyword arguments expanded into ``model.generate``.
        temperature: Sampling temperature.

    Returns:
        Whatever ``model.generate`` returns.
    """
    sampling_options = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": True,
        "temperature": temperature,
        "top_k": 50,
        "use_cache": True,
    }
    return model.generate(**inputs, **sampling_options)


def evaluate_batch(model, tokenizer, batch, temperature=0.5):
    """Generate and decode the model's continuation for one collated batch.

    Args:
        model: Model handed to ``generate_with_temp``.
        tokenizer: Object exposing ``batch_decode``.
        batch: Mapping providing ``"input_ids"`` and ``"attention_mask"``.
        temperature: Sampling temperature. Defaults to 0.5, the value that was
            previously hard-coded.

    Returns:
        The decoded generated texts, one per batch row, with special tokens skipped.
    """
    inputs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
    }

    with torch.no_grad():
        outputs = generate_with_temp(model, inputs, temperature)

    # Drop the first `prompt_length` positions so only newly generated tokens are
    # decoded (assumes generate returns the prompt followed by the continuation).
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_length:]

    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

In [17]:
def error_correction(pred, label):
    """Compare a predicted grid with the expected one and describe the mismatch.

    Args:
        pred: Predicted grid (list of rows), or ``None`` when parsing failed.
        label: Expected grid (list of rows).

    Returns:
        A ``(grid, message)`` pair. ``grid`` is ``None`` when ``pred`` is ``None``;
        otherwise it is ``pred`` with every cell converted to ``str``. When the
        shapes match but cells differ, each differing cell is replaced by
        ``"(predicted)->(expected)"``. ``message`` is one of four fixed status strings.
    """
    if pred is None:
        return None, "Output is not in the correct format"

    rendered = [list(map(str, row)) for row in pred]

    if pred == label:
        return rendered, "Output is correct"

    same_shape = len(pred) == len(label) and all(
        len(pred_row) == len(label_row) for pred_row, label_row in zip(pred, label)
    )
    if not same_shape:
        return rendered, "Output shape is wrong"

    # Shapes agree, so annotate every cell that differs from the label.
    for row_idx, (pred_row, label_row) in enumerate(zip(pred, label)):
        for col_idx, (got, want) in enumerate(zip(pred_row, label_row)):
            if got != want:
                rendered[row_idx][col_idx] = f"({got})->({want})"

    return rendered, "Output has errors"

In [18]:
@eval
def predict(model, tokenizer, dataset, batch_size):
    """Generate predictions for every row of ``dataset`` and score them.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer used for collation and decoding.
        dataset: Rows providing ``id``, ``challenge`` and ``solution``.
        batch_size: Number of rows per generation batch.

    Returns:
        A dict with ``"ids"`` (a list of ``(challenge_id, order)`` tuples) and
        ``"preds"`` (the matching ``error_correction`` results), in dataset order.
    """
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=train_utils.collate(mode="test", tokenizer=tokenizer),
    )

    results = {"ids": [], "preds": []}
    for batch in tqdm(loader, total=len(loader)):
        generated_texts = evaluate_batch(model, tokenizer, batch)

        rows = zip(generated_texts, batch["id"], batch["challenge"], batch["solution"])
        for gen_text, challenge_id, challenge, solution in rows:
            parsed_output = train_utils.parse_output(gen_text)
            results["ids"].append((challenge_id, challenge["order"]))
            results["preds"].append(error_correction(parsed_output, solution))

    return results

In [19]:
# Full pass over the train split, one row per batch. The recorded progress bar
# shows 1302 batches in about 3h23m, so avoid re-running this cell casually.
train_results = predict(model, tokenizer, dataset["train"], batch_size=1)
train_results

100%|██████████| 1302/1302 [3:23:16<00:00,  9.37s/it]  


{'ids': [('007bbfb7', 0),
  ('007bbfb7', 1),
  ('007bbfb7', 2),
  ('007bbfb7', 3),
  ('007bbfb7', 4),
  ('00d62c1b', 0),
  ('00d62c1b', 1),
  ('00d62c1b', 2),
  ('00d62c1b', 3),
  ('00d62c1b', 4),
  ('017c7c7b', 0),
  ('017c7c7b', 1),
  ('017c7c7b', 2),
  ('025d127b', 0),
  ('025d127b', 1),
  ('045e512c', 0),
  ('045e512c', 1),
  ('045e512c', 2),
  ('0520fde7', 0),
  ('0520fde7', 1),
  ('0520fde7', 2),
  ('05269061', 0),
  ('05269061', 1),
  ('05269061', 2),
  ('05f2a901', 0),
  ('05f2a901', 1),
  ('05f2a901', 2),
  ('06df4c85', 0),
  ('06df4c85', 1),
  ('06df4c85', 2),
  ('08ed6ac7', 0),
  ('08ed6ac7', 1),
  ('09629e4f', 0),
  ('09629e4f', 1),
  ('09629e4f', 2),
  ('09629e4f', 3),
  ('0962bcdd', 0),
  ('0962bcdd', 1),
  ('0a938d79', 0),
  ('0a938d79', 1),
  ('0a938d79', 2),
  ('0a938d79', 3),
  ('0b148d64', 0),
  ('0b148d64', 1),
  ('0b148d64', 2),
  ('0ca9ddb6', 0),
  ('0ca9ddb6', 1),
  ('0ca9ddb6', 2),
  ('0d3d703e', 0),
  ('0d3d703e', 1),
  ('0d3d703e', 2),
  ('0d3d703e', 3),
  ('0

In [None]:
test_results = predict(model, tokenizer, dataset["test"], batch_size=1)

In [20]:
def transform_results(results):
    """Regroup flat prediction results into a nested ``{challenge_id: {order: pred}}`` dict.

    Args:
        results: Mapping with parallel sequences ``"ids"`` (pairs of
            ``(challenge_id, order)``) and ``"preds"``.

    Returns:
        A dict keyed by challenge id, whose values map each order to its
        prediction. A later entry with the same id and order overwrites an earlier one.
    """
    grouped = {}
    for (challenge_id, order), pred in zip(results["ids"], results["preds"]):
        grouped.setdefault(challenge_id, {})[order] = pred

    return grouped

In [21]:
# Save to file (disabled; uncomment to regenerate the JSON files that the next cell loads)
# with open(f"{BASE_PATH}/data/train_corrections.json", "w") as f:
#     json.dump(transform_results(train_results), f, indent=4)

# with open(f"{BASE_PATH}/data/test_corrections.json", "w") as f:
#     json.dump(transform_results(test_results), f, indent=4)

In [25]:
with open(f"{BASE_PATH}/data/train_corrections.json", "r") as f:
    train_corrections = json.load(f)

with open(f"{BASE_PATH}/data/test_corrections.json", "r") as f:
    test_corrections = json.load(f)

In [26]:
# count types of responses
def count_correction_messages(corrections_by_challenge):
    """Tally how often each status message occurs across all corrections.

    Args:
        corrections_by_challenge: Mapping of challenge id to a mapping of order
            to a correction, where the correction's element at index 1 is its
            status message.

    Returns:
        A dict mapping each status message to its number of occurrences.
    """
    counts = {}
    for corrections in corrections_by_challenge.values():
        for correction in corrections.values():
            message = correction[1]
            counts[message] = counts.get(message, 0) + 1
    return counts


train_correction_counts = count_correction_messages(train_corrections)
test_correction_counts = count_correction_messages(test_corrections)

In [27]:
train_correction_counts, test_correction_counts

({'Output has errors': 816,
  'Output shape is wrong': 349,
  'Output is correct': 84,
  'Output is not in the correct format': 53},
 {'Output has errors': 806,
  'Output shape is wrong': 440,
  'Output is not in the correct format': 93,
  'Output is correct': 24})

In [None]:
# find train instance with 30x30 input size:
for i, train_instance in enumerate(dataset["train"]):
    # The held-out test entry is built with only an "input" grid (its expected
    # output lives in the row's "solution" column), so indexing "output" here
    # would raise KeyError.
    test_input = train_instance['challenge']['test']['input']
    if len(test_input) == 30 and len(test_input[0]) == 30:
        print(i)
        break

In [54]:
def build_augmented_dataset(dataset, train_corrections, test_corrections):
    """Attach the stored correction to every challenge of the train and test splits.

    Args:
        dataset: Mapping with ``"train"`` and ``"test"`` splits whose rows carry
            ``id`` and ``challenge`` (including an ``order`` value).
        train_corrections: Nested mapping ``{challenge_id: {str(order): correction}}``
            for the train split; ``correction[0]`` is the annotated output and
            ``correction[1]`` the status message.
        test_corrections: Same structure for the test split.

    Returns:
        A ``DatasetDict`` with ``train`` and ``test`` splits whose rows hold ``id``
        and a ``challenge`` extended with a ``correction`` dict (``output``, ``message``).
    """

    def attach_corrections(rows, corrections):
        augmented = []
        for row in rows:
            # Shallow copy, so adding a key does not touch the row's own dict.
            challenge = row['challenge'].copy()
            challenge_id = row['id']

            # Orders are looked up as strings in the corrections mapping.
            correction = corrections[challenge_id][str(challenge['order'])]
            challenge['correction'] = {'output': correction[0], 'message': correction[1]}

            augmented.append({'id': challenge_id, 'challenge': challenge})
        return augmented

    train_augmented_dataset = attach_corrections(dataset["train"], train_corrections)
    test_augmented_dataset = attach_corrections(dataset["test"], test_corrections)

    return DatasetDict({'train': Dataset.from_list(train_augmented_dataset), 'test': Dataset.from_list(test_augmented_dataset)})

In [None]:
augmented_dataset = build_augmented_dataset(dataset, train_corrections, test_corrections)
augmented_dataset

In [None]:
print(tokenizer.apply_chat_template(
    dataset['test'][0]['messages'][:-1],
    tokenize=False,
    add_generation_prompt=True
))

In [None]:
def create_chat_with_correction(challenge, solution=None):
    """Build the chat message list for a challenge that carries a correction.

    Args:
        challenge: Mapping with ``"train"`` examples, a ``"test"`` example and a
            ``"correction"`` dict providing a ``"message"``.
        solution: Optional solution grid. When truthy, it is appended as the
            assistant turn.

    Returns:
        A list of ``{"role", "content"}`` dicts: system, user and, if a solution
        was given, assistant.
    """
    demonstrations = [prepare_inputs(example) for example in challenge["train"]]
    # NOTE(review): str.format silently ignores keyword arguments that the
    # template does not reference, so the correction message only reaches the
    # prompt if BASIC_PROMPT contains an {errors_corrections} placeholder -- confirm.
    prompt = data_utils.BASIC_PROMPT.format(
        training_data="\n\n".join(demonstrations),
        errors_corrections=challenge["correction"]["message"], # TODO: add this to the prompt
        input_test_data=prepare_inputs(challenge["test"]),
    )

    system_turn = {"role": "system", "content": data_utils.SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": prompt}

    if not solution:
        return [system_turn, user_turn]

    assistant_turn = {
        "role": "assistant",
        "content": prepare_inputs(solution, prepare_solution=True),
    }
    return [system_turn, user_turn, assistant_turn]