In [None]:
# Set the Hugging Face offline flags for datasets, hub and transformers, and turn off
# tokenizers parallelism, before any of those libraries are imported further down.
%env HF_DATASETS_OFFLINE=1
%env HF_HUB_OFFLINE=1
%env TRANSFORMERS_OFFLINE=1
%env TOKENIZERS_PARALLELISM=false

In [None]:
# Local directory handed to pip's --find-links in the install cell below.
deps_path = "/kaggle/input/unsloth-library-install-v2"

In [None]:
%%capture
! pip install --no-index --find-links {deps_path} pip3-autoremove -y
! pip-autoremove torch -y
! pip install --no-index --find-links {deps_path} torch
! pip install --no-index --find-links {deps_path} triton
! pip install --no-index --find-links {deps_path} "unsloth[kaggle-new]"

In [None]:
%%capture
# Second offline install: every package listed in requirements.txt is resolved from the
# same local directory (--no-index + --find-links), with output hidden by %%capture.
deps_path_2 = '/kaggle/input/llama-3-arc-deps'
! pip install --no-index --find-links {deps_path_2} --requirement {deps_path_2}/requirements.txt

In [None]:
# Root directory under which the competition JSON files are looked up.
BASE_PATH = "/kaggle/input"
# Local directory the tokenizer, config and model weights are loaded from.
MODEL_ID = "/kaggle/input/gemma-2-2b-it-baseline/pytorch/default/3/home/stepan/kaggle-arc-agi/models/gemma-2-2b-it/baseline"
# Upper bound on the number of tokens generated per sample (passed to model.generate).
MAX_NEW_TOKENS = 2048
# Token budget left for the prompt; not referenced by any other cell visible here.
# NOTE(review): 8192 is presumably the model's context window — confirm.
MAX_SEQ_LENGTH = 8192 - MAX_NEW_TOKENS

In [None]:
import json
import re

import torch  # type: ignore
import numpy as np  # type: ignore

from datasets import DatasetDict, Dataset  # type: ignore

from tqdm.auto import tqdm  # type: ignore

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig  # type: ignore

In [None]:
def get_model_tokenizer():
    """
    Load the 4-bit quantized causal language model and its tokenizer from MODEL_ID.

    Every artifact (tokenizer, config, weights) is loaded with local_files_only=True so
    the function behaves the same whether or not the offline environment flags are set.

    Returns:
    tuple: (model, tokenizer). The model is switched to eval mode; the tokenizer pads
    on the left, which keeps generated tokens contiguous after the prompt in a batch.
    """
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    # local_files_only matches the config/model loads below so no load path touches the network.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left", local_files_only=True)
    config = AutoConfig.from_pretrained(MODEL_ID, local_files_only=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        local_files_only=True,
        config=config,
    )

    # Inference only: put the model in evaluation mode.
    model.eval()

    return model, tokenizer

In [None]:
# Load the model and tokenizer once; later cells use these module-level names.
model, tokenizer = get_model_tokenizer()

In [None]:
# Load data from JSON files
def load_data(file_path):
    """
    Read a JSON file and return the decoded object.

    Parameters:
    file_path (str): Path of the JSON file to read.

    Returns:
    object: Whatever json.load produces for the file content (dict, list, ...).
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


# Function to calculate the number of tokens in a text
def count_tokens(text):
    """
    Count the tokens the module-level tokenizer produces for a string.

    Parameters:
    text (str): The text to encode.

    Returns:
    int: Length of the sequence returned by tokenizer.encode(text).
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def split_train_examples(train_examples, max_size=4096 - 32):
    """
    Split a challenge's training examples into consecutive chunks that fit a size budget.

    The size of an example is the number of cells in its input grid plus the number of
    cells in its output grid. Examples are packed greedily, in order, so that each chunk
    stays within max_size; an example that alone exceeds max_size gets a chunk of its own.

    Parameters:
    train_examples (list): Examples, each a dict with "input" and "output" grids (lists of rows).
    max_size (int): Maximum total number of grid cells per chunk.

    Returns:
    list: A list of chunks (lists of examples). When everything fits, the single chunk
    is the original train_examples list itself.
    """

    def count_cells(grid):
        # Summing row lengths also copes with an empty grid (no grid[0] lookup).
        return sum(len(row) for row in grid)

    sizes = [count_cells(example["input"]) + count_cells(example["output"]) for example in train_examples]
    if sum(sizes) <= max_size:
        return [train_examples]

    chunks = []
    current_chunk = []
    current_size = 0
    for example, size in zip(train_examples, sizes):
        # Close the running chunk as soon as the next example would overflow the budget.
        if current_chunk and current_size + size > max_size:
            chunks.append(current_chunk)
            current_chunk = []
            current_size = 0
        current_chunk.append(example)
        current_size += size
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def to_dataset(data, solutions=None, fit_dataset=False):
    """
    Flatten challenges into a Dataset with one record per (test task, train split).

    Parameters:
    data (dict): Maps challenge id to a dict holding "train" and "test" lists.
    solutions (dict, optional): Maps challenge id to a list of solutions indexed by
        test position. When given, a "solution" column is added.
    fit_dataset (bool): When True, the train examples of a challenge are passed through
        split_train_examples and every resulting split becomes its own record.

    Returns:
    Dataset: Columns "id", "challenge" (dict with "train", "test", "order") and,
    if solutions were supplied, "solution".
    """
    with_solutions = solutions is not None
    id_column = []
    challenge_column = []
    solution_column = []

    for challenge_id, challenge_data in data.items():
        # Each test task of a challenge is expanded into its own record(s).
        for test_id, task in enumerate(challenge_data["test"]):
            if fit_dataset:
                train_variants = split_train_examples(challenge_data["train"])
            else:
                train_variants = [challenge_data["train"]]

            for train_subset in train_variants:
                id_column.append(challenge_id)
                challenge_column.append({"train": train_subset, "test": task, "order": test_id})
                if with_solutions:
                    solution_column.append(solutions[challenge_id][test_id])

    columns = {"id": id_column, "challenge": challenge_column}
    if with_solutions:
        columns["solution"] = solution_column

    return Dataset.from_dict(columns)

In [None]:
def prepare_inputs(dct):
    """
    Render one example as text wrapped in <input> and <output> tags.

    Each grid row becomes a line made of its cells concatenated without separators.
    When the dict has no "output" key, the <output> section is left empty.

    Parameters:
    dct (dict): Holds an "input" grid and optionally an "output" grid (lists of rows).

    Returns:
    str: The tagged text block.
    """

    def render(grid):
        lines = ["".join(str(cell) for cell in row) for row in grid]
        return "\n".join(lines)

    input_str = render(dct["input"])
    if "output" in dct:
        output_str = render(dct["output"])
    else:
        output_str = ""
    return f"<input>\n{input_str}\n</input>\n\n<output>\n{output_str}\n</output>"

In [None]:
def prepare_dataset(tokenizer, use_system_prompt=False, fit_dataset=False, split_seed=None):
    """
    Load the ARC JSON files and build chat-formatted datasets for every split.

    Parameters:
    tokenizer: Tokenizer whose apply_chat_template renders the chats to text.
    use_system_prompt (bool): When True the system prompt is sent as a separate
        "system" message; otherwise it is prepended to the user message.
    fit_dataset (bool): Forwarded to to_dataset (splits train examples into several records).
    split_seed (int, optional): Seed for the evaluation split; None keeps it random.

    Returns:
    DatasetDict: "train" (training challenges), "test" and "val" (70% / 30% of the
    evaluation challenges) and "predict" (test challenges, no solutions). Each record
    gains a "texts" column (rendered chat) and a "messages" column (list of chat messages).
    """
    # The system_prompt defines the initial instructions for the model, setting the context for solving ARC tasks.
    system_prompt = (
        """You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet."""
    )

    # User message template is a template for creating user prompts. It includes placeholders for training data and test input data, guiding the model to learn the rule and apply it to solve the given puzzle.
    user_message_template = """Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
-----------------
{training_data}
-----------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data.:
-----------------
{input_test_data}
-----------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:"""

    # Load all datasets
    training_challenges = load_data(f"{BASE_PATH}/arc-prize-2024/arc-agi_training_challenges.json")
    training_solutions = load_data(f"{BASE_PATH}/arc-prize-2024/arc-agi_training_solutions.json")
    evaluation_challenges = load_data(f"{BASE_PATH}/arc-prize-2024/arc-agi_evaluation_challenges.json")
    evaluation_solutions = load_data(f"{BASE_PATH}/arc-prize-2024/arc-agi_evaluation_solutions.json")
    test_challenges = load_data(f"{BASE_PATH}/arc-prize-2024/arc-agi_test_challenges.json")

    train_dataset = to_dataset(training_challenges, training_solutions, fit_dataset=fit_dataset)
    eval_dataset = to_dataset(evaluation_challenges, evaluation_solutions, fit_dataset=fit_dataset)
    pred_dataset = to_dataset(test_challenges, fit_dataset=fit_dataset)

    def create_chat(challenge, solution=None):
        """Build the message list for one challenge, adding the assistant answer if a solution is given."""
        user_content = user_message_template.format(
            training_data="\n\n".join([prepare_inputs(ex) for ex in challenge["train"]]),
            input_test_data=prepare_inputs(challenge["test"]),
        )

        if use_system_prompt:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ]
        else:
            messages = [{"role": "user", "content": f"{system_prompt}\n\n{user_content}"}]

        if solution:
            messages.append(
                {
                    "role": "assistant",
                    "content": "<output>\n" + "\n".join("".join(map(str, row)) for row in solution) + "\n</output>",
                }
            )

        return messages

    def process_dataset(examples, solutions=None):
        """Turn one batch of records into rendered chat texts plus the raw message lists."""
        # Create messages for each challenge-solution pair
        chats = []
        for challenge, solution in zip(examples["challenge"], solutions or [None] * len(examples["challenge"])):
            chat = create_chat(challenge, solution)
            chats.append(chat)

        # Apply chat template to each message
        texts = [tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) for chat in chats]

        return {"texts": texts, "messages": chats}

    # The solutions must come from the batch being mapped. Passing the whole dataset's
    # "solution" column would zip every batch against the first rows of that column,
    # pairing all batches after the first with the wrong solutions.
    train_dataset = train_dataset.map(lambda x: process_dataset(x, x["solution"]), batched=True)
    pred_dataset = pred_dataset.map(lambda x: process_dataset(x), batched=True)

    eval_dataset = eval_dataset.map(lambda x: process_dataset(x, x["solution"]), batched=True)
    # seed=None leaves the split random; pass split_seed for a reproducible split.
    test_dataset = eval_dataset.train_test_split(test_size=0.3, seed=split_seed)

    dataset = DatasetDict(
        {
            "train": train_dataset,
            "test": test_dataset["train"],
            "val": test_dataset["test"],
            "predict": pred_dataset,
        }
    )

    return dataset

In [None]:
# Build all splits; fit_dataset=True routes each challenge's train examples through split_train_examples.
dataset = prepare_dataset(tokenizer, fit_dataset=True)
# Show the resulting DatasetDict as the cell output.
dataset

In [None]:
def gpu_stats(device_id=0):
    """
    Report the name and memory figures of one CUDA device.

    Parameters:
    device_id (int): Index of the CUDA device to inspect.

    Returns:
    dict: "gpu" (device name), "max_memory" (total device memory in GiB) and
    "start_gpu_memory" (peak memory reserved by the caching allocator on that
    device, in GiB), both rounded to three decimals.
    """
    bytes_per_gib = 1024 ** 3
    properties = torch.cuda.get_device_properties(device_id)
    # Query the same device as the properties above rather than the current default device.
    reserved_gib = round(torch.cuda.max_memory_reserved(device_id) / bytes_per_gib, 3)
    total_gib = round(properties.total_memory / bytes_per_gib, 3)
    return {
        "gpu": properties.name,
        "max_memory": total_gib,
        "start_gpu_memory": reserved_gib,
    }

In [None]:
def parse_output(text):
    """
    Extract a rectangular grid of digits from the first <output>...</output> section.

    Non-digit characters on a line are ignored and lines without any digit are skipped.

    Parameters:
    text (str): Generated text that may contain an <output> block.

    Returns:
    list or None: A list of rows (lists of ints) when a non-empty grid whose rows all
    have the same length is found; None otherwise.
    """
    match = re.search(r"<output>(.*?)</output>", text, re.DOTALL)
    if match is None:
        return None

    body = match.group(1).strip()

    rows = []
    try:
        for raw_line in body.split("\n"):
            digits = [int(symbol) for symbol in raw_line.strip() if symbol.isdigit()]
            if digits:
                rows.append(digits)
    except ValueError:
        # isdigit() accepts some characters that int() rejects; treat that as unparsable.
        return None

    if not rows:
        return None

    # Reject ragged grids: every row must be as wide as the first one.
    width = len(rows[0])
    if any(len(row) != width for row in rows):
        return None
    return rows


def tensor_to_int(value):
    """
    Replace tensors by the Python scalar from .item(), recursing into lists.

    Parameters:
    value: A torch.Tensor, a (possibly nested) list, or any other object.

    Returns:
    The scalar for a tensor, a new list with every element converted for a list, and
    the value itself unchanged for anything else.
    """
    if isinstance(value, list):
        return [tensor_to_int(element) for element in value]
    if isinstance(value, torch.Tensor):
        return tensor_to_int(value.item())
    return value


def calculate_partial_match(pred, label):
    """
    Compute the fraction of cells on which two equally shaped grids agree.

    Parameters:
    pred: Predicted grid (list of rows).
    label: Reference grid (list of rows).

    Returns:
    float or int: Matching cells divided by total cells. 0 when either argument or any
    row is not a list, when the shapes differ, or when the grids contain no cells.
    """
    both_are_lists = isinstance(pred, list) and isinstance(label, list)
    if not both_are_lists or len(pred) != len(label):
        return 0

    matched_cells = 0
    total_cells = 0
    for row_index in range(len(label)):
        pred_row = pred[row_index]
        label_row = label[row_index]
        comparable = isinstance(pred_row, list) and isinstance(label_row, list) and len(pred_row) == len(label_row)
        if not comparable:
            # A single malformed or mis-sized row voids the whole score.
            return 0
        total_cells += len(label_row)
        matched_cells += sum(a == b for a, b in zip(pred_row, label_row))

    if total_cells == 0:
        return 0
    return matched_cells / total_cells


def calculate_metrics(preds, labels):
    """
    Compute exact-match accuracy and the average partial-match score.

    Parameters:
    preds (list): Predicted grids; an entry may be None when nothing could be parsed.
    labels (list): Reference grids, paired with preds by position.

    Returns:
    tuple: (accuracy, avg_partial_match), both divided by len(labels). Returns
    (0.0, 0.0) when labels is empty instead of dividing by zero.
    """
    total_samples = len(labels)
    if total_samples == 0:
        # Nothing to score; avoid ZeroDivisionError on an empty evaluation set.
        return 0.0, 0.0

    exact_matches = sum(1 for p, l in zip(preds, labels) if p == l)
    accuracy = exact_matches / total_samples

    # A missing prediction scores 0 without consulting calculate_partial_match.
    partial_match_scores = [calculate_partial_match(p, l) if p is not None else 0 for p, l in zip(preds, labels)]
    avg_partial_match = sum(partial_match_scores) / total_samples

    return accuracy, avg_partial_match

In [None]:
def collate(mode, tokenizer, device="cuda"):
    """
    Build a DataLoader collate function that tokenizes chat messages for generation.

    Parameters:
    mode (str): With "test", the last message of every record is dropped before
        tokenizing; any other value keeps the messages unchanged.
    tokenizer: Tokenizer providing apply_chat_template.
    device (str or torch.device): Device the token tensors are moved to.

    Returns:
    callable: collate_fn(batch) returning a dict with "id", "challenge", optionally
    "solution" (when the first record has one), "input_ids" and "attention_mask".
    """

    def collate_fn(batch):
        ids = [item["id"] for item in batch]
        challenges = [item["challenge"] for item in batch]

        if mode == "test":
            # The last message is the assistant's solution; it must not be part of the prompt.
            messages = [item["messages"][:-1] for item in batch]
        else:
            messages = [item["messages"] for item in batch]

        # Render and tokenize the chats, ending each with the generation prompt.
        encodings = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            padding=True,
        )

        collated = {"id": ids, "challenge": challenges}
        # Solutions exist only for records built from labelled data.
        if "solution" in batch[0]:
            collated["solution"] = [item["solution"] for item in batch]
        collated["input_ids"] = encodings["input_ids"].to(device)
        collated["attention_mask"] = encodings["attention_mask"].to(device)
        return collated

    return collate_fn

In [None]:
def generate_with_temp(model, inputs, temperature):
    """
    Sample a continuation from the model at the given temperature.

    Parameters:
    model: Object exposing generate(**kwargs).
    inputs (dict): Keyword arguments forwarded to generate (e.g. input_ids, attention_mask).
    temperature (float): Sampling temperature.

    Returns:
    Whatever model.generate returns for top-k (k=50) sampling limited to MAX_NEW_TOKENS new tokens.
    """
    sampling_options = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": True,
        "temperature": temperature,
        "top_k": 50,
        "use_cache": True,
    }
    return model.generate(**inputs, **sampling_options)


def evaluate_batch(model, tokenizer, batch):
    """
    Generate two sampled answers per prompt in the batch and decode only the new tokens.

    Parameters:
    model: Model handed to generate_with_temp.
    tokenizer: Tokenizer providing batch_decode.
    batch (dict): Must contain "input_ids" and "attention_mask".

    Returns:
    tuple: (texts at temperature 0.3, texts at temperature 0.7), one string per prompt.
    """
    inputs = {key: batch[key] for key in ("input_ids", "attention_mask")}

    # Gradients are not needed for generation.
    with torch.no_grad():
        low_temp_sequences = generate_with_temp(model, inputs, 0.3)
        high_temp_sequences = generate_with_temp(model, inputs, 0.7)

    # Columns up to this index hold the prompt; everything after it is newly generated.
    prompt_length = inputs["input_ids"].shape[1]

    def decode_continuation(sequences):
        return tokenizer.batch_decode(sequences[:, prompt_length:], skip_special_tokens=True)

    return decode_continuation(low_temp_sequences), decode_continuation(high_temp_sequences)

In [None]:
def predict(model, tokenizer, dataset, batch_size):
    """
    Run generation over a dataset and collect two parsed attempts per record.

    Parameters:
    model: Model used for generation.
    tokenizer: Tokenizer used for collation and decoding.
    dataset: Records with "id", "challenge" and "messages".
    batch_size (int): Number of records per generation batch.

    Returns:
    dict: "ids" is a list of (challenge id, test order) tuples and "preds" the matching
    list of {"attempt_1": grid, "attempt_2": grid}. An attempt that cannot be parsed is
    replaced by [[0]].
    """
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate(mode="predict", tokenizer=tokenizer),
    )

    challenge_ids, preds = [], []
    for batch in tqdm(loader, total=len(loader)):
        generated_texts1, generated_texts2 = evaluate_batch(model, tokenizer, batch)

        records = zip(generated_texts1, generated_texts2, batch["id"], batch["challenge"])
        for gen_text1, gen_text2, challenge_id, challenge in records:
            first_grid = parse_output(gen_text1)
            second_grid = parse_output(gen_text2)

            if first_grid is None and second_grid is None:
                print(f"Failed to parse both outputs: {gen_text1} and {gen_text2}")

            # Any attempt that could not be parsed falls back to a 1x1 zero grid.
            attempt_1 = [[0]] if first_grid is None else first_grid
            attempt_2 = [[0]] if second_grid is None else second_grid
            preds.append({"attempt_1": attempt_1, "attempt_2": attempt_2})
            challenge_ids.append((challenge_id, challenge["order"]))

    return {"ids": challenge_ids, "preds": preds}

In [None]:
def group_preds_by_challenge_id(challenge_ids, preds):
    """
    Group predictions by challenge id, ordered by test position.

    Parameters:
    challenge_ids (list): (challenge id, order) tuples, one per prediction.
    preds (list): Predictions aligned with challenge_ids.

    Returns:
    dict: Maps each challenge id (in order of first appearance) to its predictions
    sorted by order. When the same (id, order) occurs more than once, only the first
    prediction is kept.
    """
    first_by_order = {}
    for (challenge_id, order), pred in zip(challenge_ids, preds):
        per_challenge = first_by_order.setdefault(challenge_id, {})
        # setdefault leaves an existing entry untouched, so duplicates are ignored.
        per_challenge.setdefault(order, pred)

    return {
        challenge_id: [per_challenge[order] for order in sorted(per_challenge)]
        for challenge_id, per_challenge in first_by_order.items()
    }

In [None]:
# Generate two sampled attempts for every record of the "predict" split, one record per batch.
pred_results = predict(model, tokenizer, dataset["predict"], batch_size=1)
# Collapse the per-record results into {challenge_id: [prediction per test input, ...]}.
grouped_preds = group_preds_by_challenge_id(pred_results["ids"], pred_results["preds"])

In [None]:
# Number of distinct challenge ids that received predictions.
len(grouped_preds)

In [None]:
# Compare grouped_preds with sample_submission.json and report every discrepancy.
with open(f"{BASE_PATH}/arc-prize-2024/sample_submission.json", "r") as json_file:
    sample_submission = json.load(json_file)

# Check that all challenge ids in sample_submission are in grouped_preds with the same number
# of predictions, and that every attempt is a 2d grid of at least 1x1 size.
for challenge_id in sample_submission:
    if challenge_id not in grouped_preds:
        print(f"Challenge ID {challenge_id} in sample_submission is not in grouped_preds.")
        # Nothing to inspect for a missing challenge; indexing grouped_preds would raise KeyError.
        continue
    if len(grouped_preds[challenge_id]) != len(sample_submission[challenge_id]):
        print(
            f"Challenge ID {challenge_id} in sample_submission has {len(sample_submission[challenge_id])} predictions, but grouped_preds has {len(grouped_preds[challenge_id])}."
        )

    for pred in grouped_preds[challenge_id]:
        if not isinstance(pred, dict):
            print(f"Challenge ID {challenge_id} in sample_submission has invalid predictions: {pred}")
            continue

        # .get() turns a missing attempt into None, which is reported like any other non-list.
        attempts = (pred.get("attempt_1"), pred.get("attempt_2"))
        if not all(isinstance(attempt, list) for attempt in attempts):
            print(f"Challenge ID {challenge_id} in sample_submission has invalid predictions: {pred}")
            # Report once and skip the size checks, which would fail on a non-list.
            continue

        for attempt in attempts:
            if len(attempt) < 1 or not isinstance(attempt[0], list) or len(attempt[0]) < 1:
                print(f"Challenge ID {challenge_id} in sample_submission has invalid predictions: {attempt}")
                # Only the first offending attempt of a prediction is reported.
                break

In [None]:
# Write the grouped predictions as JSON to submission.json in the current working directory.
with open("submission.json", "w") as json_file:
    json.dump(grouped_preds, json_file)