In [1]:
! nvidia-smi

Thu Sep 19 18:15:48 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A5000               On  | 00000000:2B:00.0 Off |                  Off |
| 30%   33C    P8              20W / 230W |    244MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A5000               On  | 00000000:41:00.0 Off |  

In [1]:
BASE_PATH = '/home/stepan/kaggle-arc-agi'

In [2]:
# %env CUDA_VISIBLE_DEVICES=0,1

In [3]:
import os

print(os.getenv('CUDA_VISIBLE_DEVICES'))
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

None


In [4]:
import sys
sys.path.append(BASE_PATH)
sys.path.append(f'{BASE_PATH}/scripts')

In [5]:
import json
import re

from datasets import DatasetDict, Dataset # type: ignore

from tqdm.auto import tqdm # type: ignore

import transformers # type: ignore
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig # type: ignore

import torch # type: ignore

from logger import get_logger # type: ignore

In [6]:
log = get_logger(f'{BASE_PATH}/logs/gemma-2-2b', 'arc-agi')

In [7]:
MODEL_ID = f"{BASE_PATH}/models/gemma-2-2b-it/checkpoint-500"
# MODEL_ID = "unsloth/gemma-2-2b-it"

In [8]:
def get_model_tokenizer(max_seq_length, load_in_4bit=True):
    quantization_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side='left')
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, 
        # quantization_config=quantization_config,
        # attn_implementation='flash_attention_2',
        torch_dtype="auto",
        device_map="cuda:0",
    )
    
    # model.generation_config.cache_implementation = "static"

    # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

    return model, tokenizer

# def get_model_tokenizer(max_seq_length):
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         # model_name="unsloth/gemma-2-2b-it",
#         model_name=f"{BASE_PATH}/models/gemma-2-2b-it/checkpoint-500",
#         max_seq_length=max_seq_length,
#         dtype=torch.bfloat16,
#         load_in_4bit=True,
#         device_map={'': 0},
#         # attn_implementation='flash_attention_2',
#         # token = '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', # use one if using gated models like meta-llama/Llama-2-7b-hf
#     )
#     return model, tokenizer

In [9]:
MAX_NEW_TOKENS = 2048
MAX_SEQ_LENGTH = 8192 - MAX_NEW_TOKENS
MAX_SEQ_LENGTH

6144

In [10]:
model, tokenizer = get_model_tokenizer(MAX_SEQ_LENGTH)
model

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2SdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2304, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2304, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
   

In [11]:
# Load data from JSON files
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

def split_train_examples(train_examples, max_size=4096-32):
    total_size = sum(len(example['input']) * len(example['input'][0]) + len(example['output']) * len(example['output'][0]) for example in train_examples)
    if total_size <= max_size:
        return [train_examples]
    
    split_size = max(1, max_size // total_size)
    return [train_examples[i:i+split_size] for i in range(0, len(train_examples), split_size)]

def to_dataset(data, solutions=None, fit_dataset=False):
    restructured_data = {
        'id': [],
        'challenge': [],
    }
    if solutions is not None:
        restructured_data['solution'] = []

    for challenge_id, challenge_data in data.items(): # for all challenges
        for test_id, task in enumerate(challenge_data['test']): # for all test tasks in this challenge we want to expand dataset so that each test task is separate dataset record
            if fit_dataset:
                for split_id, split_train in enumerate(split_train_examples(challenge_data['train'])): # if fit_dataset is true, we split each training example into multiple records so that each record has less than MAX_SEQ_LENGTH tokens
                    restructured_data['id'].append(challenge_id)
                    restructured_data['challenge'].append({'train': split_train, 'test': task, 'order': test_id})
                    if solutions is not None:
                        restructured_data['solution'].append(solutions[challenge_id][test_id])
            else:
                restructured_data['id'].append(challenge_id)
                restructured_data['challenge'].append({'train': challenge_data['train'], 'test': task, 'order': test_id})
                if solutions is not None:
                    restructured_data['solution'].append(solutions[challenge_id][test_id])

    return Dataset.from_dict(restructured_data)

In [12]:
def prepare_inputs(dct):
    input_str = '\n'.join(''.join(map(str, row)) for row in dct["input"])
    output_str = '\n'.join(''.join(map(str, row)) for row in dct["output"]) if "output" in dct else ""
    text = f'<input>\n{input_str}\n</input>\n\n<output>\n{output_str}\n</output>'
    return text

In [13]:
def prepare_dataset(tokenizer, use_system_prompt=False, fit_dataset=False):
    # The system_prompt defines the initial instructions for the model, setting the context for solving ARC tasks.
    system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''

    # User message template is a template for creating user prompts. It includes placeholders for training data and test input data, guiding the model to learn the rule and apply it to solve the given puzzle.
    user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
-----------------
{training_data}
-----------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data.:
-----------------
{input_test_data}
-----------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''
    
    # Load all datasets
    training_challenges = load_data(f'{BASE_PATH}/arc-prize-2024/arc-agi_training_challenges.json')
    training_solutions = load_data(f'{BASE_PATH}/arc-prize-2024/arc-agi_training_solutions.json')
    evaluation_challenges = load_data(f'{BASE_PATH}/arc-prize-2024/arc-agi_evaluation_challenges.json')
    evaluation_solutions = load_data(f'{BASE_PATH}/arc-prize-2024/arc-agi_evaluation_solutions.json')
    test_challenges = load_data(f'{BASE_PATH}/arc-prize-2024/arc-agi_test_challenges.json')
    
    train_dataset = to_dataset(training_challenges, training_solutions, fit_dataset=fit_dataset)
    eval_dataset = to_dataset(evaluation_challenges, evaluation_solutions, fit_dataset=fit_dataset)
    pred_dataset = to_dataset(test_challenges, fit_dataset=fit_dataset)

    def create_chat(challenge, solution=None):
        user_content = user_message_template.format(
            training_data='\n\n'.join([prepare_inputs(ex) for ex in challenge['train']]),
            input_test_data=prepare_inputs(challenge['test'])
        )
        
        if use_system_prompt:
            messages = [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_content}
            ]
        else:
            messages = [{'role': 'user', 'content': f"{system_prompt}\n\n{user_content}"}]
        
        if solution:
            messages.append({'role': 'assistant', 'content': "<output>\n" + '\n'.join(''.join(map(str, row)) for row in solution) + "\n</output>"})
        
        return messages

    def process_dataset(examples, solutions=None):
        # Create messages for each challenge-solution pair
        chats = []
        for challenge, solution in zip(examples['challenge'], solutions or [None] * len(examples['challenge'])):
            chat = create_chat(challenge, solution)
            chats.append(chat)
        
        # Apply chat template to each message
        texts = [tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) for chat in chats]
        
        return {
            'texts': texts,
            'messages': chats
        }

    train_dataset = train_dataset.map(lambda x: process_dataset(x, train_dataset['solution']), batched=True)
    pred_dataset = pred_dataset.map(lambda x: process_dataset(x), batched=True)

    eval_dataset = eval_dataset.map(lambda x: process_dataset(x, eval_dataset['solution']), batched=True)
    test_dataset = eval_dataset.train_test_split(test_size=0.3)

    dataset = DatasetDict({
        'train': train_dataset,
        'test': test_dataset['train'],
        'val': test_dataset['test'],
        'predict': pred_dataset
    })

    return dataset


In [14]:
dataset = prepare_dataset(tokenizer, fit_dataset=True)
dataset

Map:   0%|          | 0/430 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/459 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 430
    })
    test: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 321
    })
    val: Dataset({
        features: ['id', 'challenge', 'solution', 'texts', 'messages'],
        num_rows: 138
    })
    predict: Dataset({
        features: ['id', 'challenge', 'texts', 'messages'],
        num_rows: 112
    })
})

In [15]:
def gpu_stats(device_id=0):
    #@title Show current memory stats
    gpu_stats = torch.cuda.get_device_properties(device_id)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    return {'gpu': gpu_stats.name, 'max_memory': max_memory, 'start_gpu_memory': start_gpu_memory}

In [16]:
def parse_output(text):
    # Extract the content inside <output></output> tags
    output_match = re.search(r'<output>(.*?)</output>', text, re.DOTALL)
    if not output_match:
        return None
    
    output_content = output_match.group(1).strip()
    
    # Split the content into lines and convert each line to a list of single-digit integers
    try:
        grid = []
        for line in output_content.split('\n'):
            row = [int(char) for char in line.strip() if char.isdigit()]
            if row:
                grid.append(row)
        
        # Ensure all rows have the same length
        if grid and all(len(row) == len(grid[0]) for row in grid):
            return grid
        else:
            return None
    except ValueError:
        return None
    
def tensor_to_int(value):
    if isinstance(value, torch.Tensor):
        return tensor_to_int(value.item())
    elif isinstance(value, list):
        return [tensor_to_int(item) for item in value]
    else:
        return value
    
def calculate_partial_match(pred, label):
    if not isinstance(pred, list) or not isinstance(label, list):
        return 0  # No match if either is not a list

    if len(pred) != len(label):
        return 0  # No match if outer dimensions differ

    total_elements = 0
    correct_elements = 0

    for p_row, l_row in zip(pred, label):
        if not isinstance(p_row, list) or not isinstance(l_row, list) or len(p_row) != len(l_row):
            return 0  # No match if any row is not a list or dimensions differ

        total_elements += len(l_row)
        correct_elements += sum(p == l for p, l in zip(p_row, l_row))

    return correct_elements / total_elements if total_elements > 0 else 0

def calculate_metrics(preds, labels):
    total_samples = len(labels)
    
    correct = sum(1 for p, l in zip(preds, labels) if p == l)
    accuracy = correct / total_samples
    
    partial_match_scores = [calculate_partial_match(p, l) if p is not None else 0 for p, l in zip(preds, labels)]
    
    avg_partial_match = sum(partial_match_scores) / total_samples
    
    return accuracy, avg_partial_match

In [17]:
def collate(mode, tokenizer):
    def collate_fn(batch):
        # Separate the different components of the batch
        ids = [item['id'] for item in batch]
        challenges = [item['challenge'] for item in batch]
        
        # For 'test' mode, remove the last assistant message from each entry
        if mode == 'test':
            messages = [item['messages'][:-1] for item in batch] # last message is always assistant message - solution, we don't need it for evaluation
        else:
            messages = [item['messages'] for item in batch]
        
        # Tokenize the texts
        encodings = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            padding=True, 
            # truncation=True
        )
        
        # If 'solution' is present (for training/validation data)
        if 'solution' in batch[0]:
            solutions = [item['solution'] for item in batch]
            return {
                'id': ids,
                'challenge': challenges,
                'solution': solutions,
                'input_ids': encodings['input_ids'].to("cuda"),
                'attention_mask': encodings['attention_mask'].to("cuda")
            }
        else:
            return {
                'id': ids,
                'challenge': challenges,
                'input_ids': encodings['input_ids'].to("cuda"),
                'attention_mask': encodings['attention_mask'].to("cuda")
            }
    return collate_fn

In [18]:
def evaluate_batch(model, tokenizer, batch, num_seq=5):
    inputs = {
        'input_ids': batch['input_ids'],
        'attention_mask': batch['attention_mask']
    }

    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=MAX_NEW_TOKENS, 
            do_sample=True, 
            use_cache=True, 
            num_beams=5, 
            num_return_sequences=num_seq, 
            temperature=0.5, 
            top_k=50
        )

    input_ids_length = inputs['input_ids'].shape[1] # sequence length without new tokens
    new_tokens = outputs[:, input_ids_length:]
    
    generated_texts = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    
    return generated_texts

In [19]:
def process_sequences(generated_texts, num_seq):
    parsed_outputs = [parse_output(text) for text in generated_texts]
    res = []
    for i in range(0, len(parsed_outputs), num_seq):
        options = [opt for opt in parsed_outputs[i:i + num_seq] if opt is not None]
        if not options:
            res.append((None, None))
            continue
        
        # Group options by their structure (rows x columns)
        structure_groups = {}
        for option in options:
            rows = len(option)
            cols = len(option[0]) if rows > 0 else 0
            structure = (rows, cols)
            if structure not in structure_groups:
                structure_groups[structure] = []
            structure_groups[structure].append(option)
        
        # Select the group with the most options
        most_common_structure = max(structure_groups, key=lambda x: len(structure_groups[x]))
        selected_options = structure_groups[most_common_structure]
        
        # Get dimensions of the most common structure
        rows, cols = most_common_structure
        
        # Perform element-wise voting
        voted_option = [[None for _ in range(cols)] for _ in range(rows)]
        for row in range(rows):
            for col in range(cols):
                elements = [option[row][col] for option in selected_options]
                voted_option[row][col] = max(set(elements), key=elements.count)
        
        # Select the top 2 options based on similarity to the voted option
        def similarity_score(option):
            return sum(option[r][c] == voted_option[r][c] for r in range(rows) for c in range(cols))
        
        top_2_options = sorted(selected_options, key=similarity_score, reverse=True)[:2]
        res.append(tuple(top_2_options)) # TODO this or top2 + voted
    return res

In [20]:
def evaluate(model, tokenizer, dataset, batch_size, num_seq=5):
    eval_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate(mode='test', tokenizer=tokenizer))

    challenge_ids = []
    preds = []
    labels = []
    for i, batch in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader)):
        generated_texts = evaluate_batch(model, tokenizer, batch, num_seq=num_seq) # (batch_size * num_return_sequences, seq_len)

        # Ensure solutions is always a list
        ids = batch["id"]
        challenges = batch["challenge"]
        solutions = batch["solution"]
        
        processed_outputs = process_sequences(generated_texts, num_seq)

        # I don't like how complicated this is, but I don't see an easier way to do it right now
        for (parsed_output1, parsed_output2), label, challenge_id, challenge in zip(processed_outputs, solutions, ids, challenges):
            
            if parsed_output1 is None or parsed_output2 is None:
                preds.append(None)
            else:
                # Choose the best prediction based on partial match score
                score1 = calculate_partial_match(parsed_output1, tensor_to_int(label)) if parsed_output1 is not None else 0
                score2 = calculate_partial_match(parsed_output2, tensor_to_int(label)) if parsed_output2 is not None else 0
                best_pred = parsed_output1 if score1 >= score2 else parsed_output2
                preds.append(best_pred)
            
            labels.append(tensor_to_int(label))
            challenge_ids.append((challenge_id, challenge['order']))
        
        if i % 2 == 0 and i > 0:
            break

    return {
        'ids': challenge_ids,
        'preds': preds,
        'labels': labels,
    }

In [21]:
# results = evaluate(model, tokenizer, dataset['test'], batch_size=1)
# # Calculate metrics
# accuracy, avg_partial_match = calculate_metrics(results['preds'], results['labels'])

# log.info(f"Exact match accuracy: {accuracy:.4f}")
# log.info(f"Average partial match score: {avg_partial_match:.4f}")

In [22]:
inputs = tokenizer.apply_chat_template(dataset['test'][0]['messages'], tokenize=True, add_generation_prompt=True, return_tensors='pt', padding=False, return_dict=True)

In [23]:
# FastLanguageModel.for_inference(model)

In [24]:
with torch.no_grad():
    outputs = model.generate(
        **{
            'input_ids': inputs['input_ids'].to("cuda"),
            'attention_mask': inputs['attention_mask'].to("cuda")
        }, 
        max_new_tokens=MAX_NEW_TOKENS, 
        do_sample=False, 
        use_cache=True, 
        num_beams=5, 
        num_return_sequences=5,
        temperature=0.5, 
        top_k=50,
        top_p=0.2
    )



In [25]:
for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))
    print('-' * 100)

user
You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.

Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
-----------------
<input>
0000000000000000000000
0000002000000000000000
0020002320000000000000
0272000020000000000000
0020000000000000000000
0000020000111111111110
0000242000111111111110
0000022000111111113110
0000000000111111111110
0222000000111111111110
0282000000111141111110
0202000000111111111110
0000000000111111111110
0000000000111111111110
0000000000118111171110
0000000000111111111110
0000000000111111111110
0000000000000000000000
</input>

<output>
11111111111
11111112111
11111112321
11111111121
11112111111
11124211111
11112211111
11111111111
12221112111
12821127211
12121112111
11111111111
</output>

<input>
00000000000000000000
00000000000000000000
00000000000200000000
00000000002120000000
0000000000200