In [None]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q transformers datasets accelerate bitsandbytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from tqdm import tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m129.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os

# --- Run configuration: adjust these per experiment ---

# Directory holding the fine-tuned Gemma artefacts.
GEMMA_DIR = "/content/drive/MyDrive/NLP_ZOFIA/gemma_fine_tuned/full"
# Directory holding the fine-tuned Llama artefacts.
MODEL_DIR = "/content/drive/MyDrive/NLP_ZOFIA/tmp/llama_finetunned"
# The tokenizer is assumed to be stored next to the fine-tuned model;
# point this at MODEL_DIR instead when evaluating the Llama run.
TOKENIZER_DIR = GEMMA_DIR
# Evaluation logs are written here; the directory is created on first run.
LOG_DIR = "/content/drive/MyDrive/NLP_ZOFIA/eval_logs"
os.makedirs(LOG_DIR, exist_ok=True)

# Never commit an access token to the notebook: read it from the environment.
# When the variable is unset this is None and the Hugging Face libraries fall
# back to the credentials stored by `huggingface_hub.login()`.
HF_TOKEN = os.getenv("HF_TOKEN")
EVAL_SPLIT = "test"
# Generation budget (new tokens) per evaluated example.
MAX_NEW_TOKENS = 200
LLAMA_MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
GEMMA_MODEL_NAME = "google/gemma-2b"
# True: load tokenizer/model from the fine-tuned directories; False: use the base Llama model.
IS_FINE_TUNED = True
LOAD_IN_8BIT = True
# Positional row range [START_INDEX, END_INDEX) of the evaluation DataFrame used in this run.
START_INDEX = 300
END_INDEX = 700

In [None]:
def remove_special_characters(text):
    """Return `text` with every underscore and every newline turned into a space."""
    to_space = str.maketrans({"_": " ", "\n": " "})
    return text.translate(to_space)

def format_example_initial(example):
    """Convert a raw record into the {"text", "out"} pair used for evaluation.

    The record's "question" becomes "text" and its "answer" becomes "out";
    both values are passed through `remove_special_characters` first.
    """
    cleaned_question = remove_special_characters(example["question"])
    cleaned_answer = remove_special_characters(example["answer"])
    return {"text": cleaned_question, "out": cleaned_answer}

def get_CLRS_dataset(split="train"):
    """Load one split of CLRS-Text through `load_dataset`.

    "train" is requested from the CLRS-Text-train repository; any other
    split name is forwarded unchanged to the CLRS-Text-test repository.
    """
    if split != "train":
        return load_dataset("tomg-group-umd/CLRS-Text-test", split=split)
    return load_dataset("tomg-group-umd/CLRS-Text-train", split="train")

In [None]:
def load_tokenizer():
    """Load and configure the tokenizer used for evaluation.

    The tokenizer comes from TOKENIZER_DIR when IS_FINE_TUNED is set, and
    from the LLAMA_MODEL_NAME checkpoint (authenticated with HF_TOKEN)
    otherwise. The pad token is then aliased to the EOS token, padding is
    set to the right side, and a header-style chat template is installed
    if the tokenizer does not already define one.

    Returns:
        The configured tokenizer instance.
    """
    if IS_FINE_TUNED:
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
    else:
        # `token` is the supported keyword; `use_auth_token` is deprecated.
        tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_NAME, token=HF_TOKEN)

    # Reuse EOS as the padding token so padded batches can be built.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Fallback template, used only when the tokenizer ships without one.
    # It renders user/assistant turns and does not reference
    # `add_generation_prompt`, so that flag has no effect with this template.
    if tokenizer.chat_template is None:
        tokenizer.chat_template = (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}"
            "{{ '<|start_header_id|>user<|end_header_id|>\n' + message['content'] + '<|eot_id|>' }}"
            "{% elif message['role'] == 'assistant' %}"
            "{{ '<|start_header_id|>assistant<|end_header_id|>\n' + message['content'] + '<|eot_id|>' }}"
            "{% endif %}"
            "{% endfor %}"
        )
    return tokenizer

def load_model():
    """Load the causal language model selected by the run configuration.

    The weights come from GEMMA_DIR when IS_FINE_TUNED is set and from the
    LLAMA_MODEL_NAME checkpoint otherwise. When LOAD_IN_8BIT is set the
    model is loaded with an 8-bit bitsandbytes quantization config.

    Returns:
        The loaded model, placed by `device_map="auto"`.
    """
    model_source = GEMMA_DIR if IS_FINE_TUNED else LLAMA_MODEL_NAME
    print(model_source)

    quant_config = BitsAndBytesConfig(load_in_8bit=True) if LOAD_IN_8BIT else None

    model = AutoModelForCausalLM.from_pretrained(
        model_source,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        quantization_config=quant_config,
        # `token` is the supported keyword; `use_auth_token` is deprecated.
        # No token is sent for the local fine-tuned directory.
        token=None if IS_FINE_TUNED else HF_TOKEN,
    )
    return model


In [None]:
@torch.no_grad()
def evaluate(model, tokenizer, dataset, max_new_tokens=200, log_file_path=None):
    """Generate greedily for every example and report exact-match accuracy.

    Args:
        model: causal LM exposing `eval()`, `parameters()` and `generate()`.
        tokenizer: tokenizer exposing `apply_chat_template`, `__call__` and `decode`.
        dataset: iterable of dicts with "text" (the question) and "out"
            (the expected answer).
        max_new_tokens: generation budget per example.
        log_file_path: optional path of a text log. When given, each entry is
            written and flushed as soon as it is produced, so a long run that
            is interrupted still leaves a usable log behind.

    Returns:
        The exact-match accuracy (case-insensitive, surrounding whitespace
        ignored) as a float; 0.0 for an empty dataset.
    """
    model.eval()
    correct = 0
    total = 0

    # Inputs are sent to the device holding the model's first parameter.
    first_device = next(model.parameters()).device

    log_file = open(log_file_path, "w", encoding="utf-8") if log_file_path else None

    def write_log(text):
        """Append `text` to the log file (if any) and flush it to disk."""
        if log_file is not None:
            log_file.write(text)
            log_file.flush()

    try:
        for example in tqdm(dataset):
            question = example["text"]
            expected_answer = example["out"]

            # NOTE(review): the prompt ends with an empty assistant turn; confirm
            # this matches the prompt format the model was fine-tuned on.
            messages = [
                {"role": "user", "content": question},
                {"role": "assistant", "content": ""}
            ]

            prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            input_tokens = tokenizer(prompt, return_tensors="pt", padding=True)
            input_tokens = {k: v.to(first_device) for k, v in input_tokens.items()}

            output_ids = model.generate(
                input_ids=input_tokens["input_ids"],
                attention_mask=input_tokens["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

            # Keep only the newly generated tokens. Slicing by token count is
            # robust, whereas slicing the decoded text by the decoded prompt's
            # character length breaks whenever decoding is not prefix-stable.
            prompt_length = input_tokens["input_ids"].shape[1]
            generated_ids = output_ids[0][prompt_length:]
            predicted_answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            log_entry = f"\n---\nUser: {question}\nPredicted: {predicted_answer}\nExpected: {expected_answer}\n---\n"
            write_log(" ")
            write_log(log_entry)

            if expected_answer.strip().lower() == predicted_answer.strip().lower():
                correct += 1
            total += 1

            if total % 50 == 0:
                checkpoint_log = f"-------------------CHECKPOINT-------------------\nLocal accuracy: {correct / total:.4f}, total = {total}\n"
                print(checkpoint_log)
                write_log(checkpoint_log)

        accuracy = correct / total if total > 0 else 0.0
        final_log = f"\nFinal accuracy: {accuracy:.4f}"
        print(final_log)
        write_log(final_log)
    finally:
        if log_file is not None:
            log_file.close()

    if log_file_path:
        print(f"Logs saved to {log_file_path}")

    return accuracy


In [None]:
# Authenticate against the Hugging Face Hub; in a notebook this renders an interactive login widget.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from peft import PeftModel

# === Load tokenizer ===
print("Loading tokenizer...")
tokenizer = load_tokenizer()

# === Load base model ===
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(GEMMA_MODEL_NAME)

# === Resize embeddings to match tokenizer size (fixes vocab mismatch) ===
# Must happen before the adapter is attached so the embedding shapes line up.
model.resize_token_embeddings(len(tokenizer))

# === Load LoRA adapter ===
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(model, GEMMA_DIR)

# === Merge LoRA weights into the base model (optional for inference) ===
model = model.merge_and_unload()

# === Move the merged model to the GPU when one is available ===
# `from_pretrained` without a `device_map` leaves the weights on the CPU, and
# `evaluate` sends its inputs to wherever the model lives, so without this
# step generation runs entirely on the CPU.
model_device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(model_device)

print("Model + LoRA loaded successfully.")
print("Tokenizer vocab size:", len(tokenizer))
print("Model vocab size:    ", model.get_input_embeddings().weight.shape[0])

Loading tokenizer...
Loading base model...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Loading LoRA adapter...




Model + LoRA loaded successfully.
Tokenizer vocab size: 256132
Model vocab size:     256132


In [None]:
# Dataset cache directory on the mounted Drive. It sits under MyDrive/NLP_ZOFIA
# like every other project path (the previous value omitted "MyDrive" and used
# a different capitalisation, so it pointed outside the project folder).
DATASET_CACHE_DIR = "/content/drive/MyDrive/NLP_ZOFIA/cached_dataset"

In [None]:
import os

# Export the dataset cache location through the HF_DATASETS_CACHE variable.
# NOTE(review): `datasets` has already been imported by this point; confirm the
# variable is still honoured when it is set after the import.
os.environ.update({"HF_DATASETS_CACHE": DATASET_CACHE_DIR})


In [None]:
import pandas as pd

# Read the cached evaluation parquet file from Drive into a DataFrame.
file_path = '/content/drive/MyDrive/NLP_ZOFIA/cached_dataset/test_2-00000-of-00001.parquet'
df = pd.read_parquet(file_path)

# Preview the first rows to sanity-check the columns (question, answer, algo_name, length).
df.head()

Unnamed: 0,question,answer,algo_name,length
0,dfs:\nA: [[0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1...,[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 1...,dfs,25
1,naive_string_matcher:\nstring: [0 0 0 0 0 0 0 ...,"1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3 | 3\n\n",naive_string_matcher,40
2,find_maximum_subarray_kadane:\nkey: [0.123 0.1...,"(0, 1), (0, 1), (0, 1), (0, 1), (5, 5), (5, 5)...",find_maximum_subarray_kadane,30
3,kmp_matcher:\nstring: [0 0 0 0 0 0 0 0 0 0 0 0...,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",kmp_matcher,49
4,task_scheduling:\nd: [9 4 2 22 10 18 8 24 6 21...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0...,task_scheduling,26


In [None]:
# Step 1: Take the positional rows START_INDEX .. END_INDEX - 1 (the end bound is exclusive)
subset_df = df.iloc[START_INDEX:END_INDEX]

# Step 2: Turn each row into a plain dict keyed by column name
raw_examples = subset_df.to_dict(orient="records")

# Step 3: Clean every record and rename its fields to the "text"/"out" schema
formatted_dataset = list(map(format_example_initial, raw_examples))



In [None]:
# Log file for this run; the evaluated row range is embedded in the name so runs over different slices do not overwrite each other.
log_file_path = os.path.join(LOG_DIR, f"eval_logs_gemma_default_tokenizer_{START_INDEX}_{END_INDEX}.txt")


In [None]:
# Step 4: Evaluate using the formatted data
evaluate(model, tokenizer, formatted_dataset, max_new_tokens=MAX_NEW_TOKENS, log_file_path=log_file_path)

 33%|███▎      | 33/100 [30:10<59:48, 53.56s/it]  This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
 50%|█████     | 50/100 [47:09<43:33, 52.27s/it]

-------------------CHECKPOINT-------------------
Local accuracy: 0.0000, total = 50



100%|██████████| 100/100 [1:43:15<00:00, 61.95s/it]

-------------------CHECKPOINT-------------------
Local accuracy: 0.0000, total = 100


Final accuracy: 0.0000
Logs saved to /content/drive/MyDrive/NLP_ZOFIA/eval_logs/eval_logs_gemma_default_tokenizer_300_400.txt



