In [None]:
v!pip install bitsandbytes==0.45.4 boto3==1.37.0 datasets==3.3.2

In [None]:
from datetime import datetime
import getpass
import json
import os
from os import path
import random
import re
import shutil
import string
import typing

import boto3
import datasets
import huggingface_hub
import torch
import transformers
from transformers import logging, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


# Local scratch directory for temporary files; `exist_ok=True` keeps this cell
# safe to re-run and avoids the check-then-create race of exists() + mkdir().
os.makedirs("data", exist_ok=True)


# NOTE: `logging` is the object imported from `transformers` above, not the stdlib
# `logging` module. Presumably this limits transformers' log output to errors only.
logging.set_verbosity_error()

In [None]:
# Ask for the AWS credentials interactively so they are never stored in the notebook.
AWS_ACCESS_KEY_ID = getpass.getpass("Enter your AWS_ACCESS_KEY_ID: ")
AWS_SECRET_ACCESS_KEY = getpass.getpass("Enter your AWS_SECRET_ACCESS_KEY: ")

# Export both values through the process environment before the client is created
# (boto3 is expected to read its credentials from these variables).
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
        "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
    }
)

# Bucket name and S3 client shared by the helper functions defined below.
S3_BUCKET_NAME = "data-science-talks"
S3_CLIENT = boto3.client("s3")

In [None]:
# Authenticate with the Hugging Face Hub (presumably prompts for an access token).
# NOTE(review): likely required because the meta-llama checkpoint loaded later is gated — confirm.
huggingface_hub.login()

In [None]:
def _random_three_alphanumeric():
    # 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    chars = string.ascii_letters + string.digits
    return ''.join(random.choices(chars, k=3))
    

def _get_last_processed_record(
    prefix: str = "gsm8k_llama_7b_100_record_test_",
    bucket_name: str = S3_BUCKET_NAME
) -> int | None:
    """
    Find the highest record number already stored in S3 for a given key prefix.

    The bucket listing is restricted to keys that start with `prefix`. Among those,
    only keys that end in `prefix + <number>.json` contribute a number.

    Args:
        prefix (str): Key prefix to filter by, e.g. "gsm8k_llama_7b_100_record_test_".
        bucket_name (str): The name of the S3 bucket to list.

    Returns:
        int | None: The largest <number> found, or None when no key matches.
    """
    # "<prefix><digits>.json" anchored at the END of the key only; the digits are captured.
    key_regex = re.compile(rf"{re.escape(prefix)}(\d+)\.json$")

    highest = None
    next_token = None

    while True:
        # The continuation token is only sent once a previous page has supplied one.
        request = {"Bucket": bucket_name, "Prefix": prefix}
        if next_token:
            request["ContinuationToken"] = next_token
        page = S3_CLIENT.list_objects_v2(**request)

        for entry in page.get("Contents", []):
            found = key_regex.search(entry["Key"])  # e.g. gsm8k_llama_7b_100_record_test_13.json
            if found is None:
                continue
            number = int(found.group(1))
            if highest is None or number > highest:
                highest = number

        # Stop as soon as S3 reports that the listing is complete.
        if not page.get("IsTruncated"):
            break
        next_token = page.get("NextContinuationToken")

    return highest

In [None]:
def download_object_from_s3(object_key: str, bucket_name: str = S3_BUCKET_NAME) -> typing.Any:
    """
    Fetch `<object_key>.json` from S3 and return its parsed JSON content.

    Args:
        object_key (str): S3 key WITHOUT the ".json" suffix; the suffix is appended here.
        bucket_name (str): The name of the S3 bucket to read from.

    Returns:
        typing.Any: Whatever `json.loads` produces for the object's body.
    """
    object_name = f"{object_key}.json"
    obj = S3_CLIENT.get_object(Bucket=bucket_name, Key=object_name)
    print(f"Reading {object_name} into Python from S3")
    data_bytes = obj["Body"].read()  # Read the raw bytes from S3
    return json.loads(data_bytes)  # Parse directly as JSON


def download_and_display_object_from_s3(object_key: str, bucket_name: str = S3_BUCKET_NAME) -> typing.Any:
    """
    Download `<object_key>.json` via `download_object_from_s3`, pretty-print it, and return it.

    Args:
        object_key (str): S3 key WITHOUT the ".json" suffix.
        bucket_name (str): The name of the S3 bucket to read from.

    Returns:
        typing.Any: The parsed JSON object that was printed.
    """
    json_object = download_object_from_s3(object_key, bucket_name)
    print("-" * 40)  # visual separator between the download message and the JSON dump
    print(json.dumps(json_object, indent=2))
    return json_object


def download_and_unzip_checkpoint(
    s3_prefix: str = "gsm8k_lora_checkpoints",
    step: int = 1950,
    local_dir: str = "resume_checkpoint",
    bucket_name: str = S3_BUCKET_NAME,
) -> str:
    """
    Download `checkpoint_<step>.zip` from s3://<bucket_name>/<s3_prefix>/, unpack it into
    `<local_dir>/checkpoint-<step>`, and return that directory path.

    Any existing `<local_dir>/checkpoint-<step>` directory is deleted first. The archive is
    downloaded to `latest_checkpoint.zip` in the current working directory and is removed
    again afterwards, even if preparing the target directory or unpacking fails.

    Args:
        s3_prefix (str): Key prefix ("folder") under which the checkpoint zips live.
        step (int): Training step of the checkpoint to fetch.
        local_dir (str): Local parent directory for the unpacked checkpoint.
        bucket_name (str): The name of the S3 bucket.

    Returns:
        str: Path of the directory the checkpoint was unpacked into.
    """
    # 1) Build the key of the requested archive,
    #    e.g. "gsm8k_lora_checkpoints/checkpoint_1950.zip"
    s3_key = f"{s3_prefix}/checkpoint_{step}.zip"

    # 2) Download it
    local_zip = "latest_checkpoint.zip"
    print(f"Downloading s3://{bucket_name}/{s3_key} to {local_zip} ...")
    S3_CLIENT.download_file(Bucket=bucket_name, Key=s3_key, Filename=local_zip)

    # 3) Unzip into local_dir/checkpoint-<step>, starting from an empty directory
    ckpt_dir = path.join(local_dir, f"checkpoint-{step}")
    try:
        if path.exists(ckpt_dir):
            shutil.rmtree(ckpt_dir)
        os.makedirs(ckpt_dir, exist_ok=True)

        print(f"Unzipping {local_zip} into {ckpt_dir}...")
        shutil.unpack_archive(local_zip, ckpt_dir)
    finally:
        # Never leave the downloaded archive behind, even when unpacking raises.
        os.remove(local_zip)

    print(f"Done. Checkpoint is in {ckpt_dir}")
    return ckpt_dir


In [None]:
# S3 "folder" that holds the zipped LoRA checkpoints of this training run.
S3_PREFIX_DIR = 'gsm8k_lora_reward_fixes_3_lora_64_checkpoints'

# Fetch the step-800 checkpoint and unpack it under ./resume_checkpoint.
resume_ckpt_dir = download_and_unzip_checkpoint(
    s3_prefix=S3_PREFIX_DIR,
    step=800,
    local_dir="resume_checkpoint",
)

## Generating answers with fine-tuned model

In [None]:
# Hub id shared by the base model and its tokenizer.
BASE_MODEL_ID = "meta-llama/meta-Llama-3.1-8B-Instruct"

# 1) Load the base model with 4-bit weights and bfloat16 compute
quant_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",  # or "cuda:0"
)

# The tokenizer reuses EOS as its pad token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

# 2) Attach the LoRA adapter from the downloaded checkpoint and switch to eval mode
lora_model_trained = PeftModel.from_pretrained(base_model, resume_ckpt_dir)
lora_model_trained.eval()
print("LoRA weights successfully loaded into lora_model_trained!")

In [None]:
def _is_valid_number(candidate: str) -> bool:
    """
    Check if 'candidate' matches one of the patterns:
    - plain integer (e.g. '39')
    - dollar + integer (e.g. '$39')
    - comma-separated integer (e.g. '43,500')
    """
    candidate = candidate.strip()
    # e.g. remove leading/trailing '$', but handle the case if it starts with $
    # We'll use a regex approach to handle commas or $ sign:
    pattern = r"^\$?\d{1,3}(,\d{3})*(\.\d+)?$"
    return bool(re.match(pattern, candidate))


def generate_answers(
    model,
    tokenizer,
    system_prompt: str,
    dataset_subset,
    s3_prefix: str,
    num_attempts: int = 5,
    max_length_prompt: int = 256,
    max_new_tokens: int = 256,
    start_index: int = 0,
    end_index: typing.Optional[int] = None,
):
    """
    Generate multiple answers for each sample in a dataset subset using a specified model,
    and upload the results to S3 (one JSON record per sample). Returns nothing.

    Relies on `_extract_hash_answer`, `extract_xml_answer` and `upload_jsonable_object_to_s3`,
    which are not defined in this part of the notebook and must already exist in the kernel.

    Args:
        model: The model to use for generation (e.g., baseline model or fine-tuned LoRA model).
        tokenizer: The tokenizer corresponding to the model.
        system_prompt: The system prompt to prepend to each question.
        dataset_subset: The dataset subset to process (e.g., 100 shuffled GSM8K test samples).
            Each sample must provide "question" and "answer" entries.
        s3_prefix: Prefix for S3 object keys (e.g., 'gsm8k_llama_7b_100_record_test_').
        num_attempts: Number of answer attempts per sample (default: 5).
        max_length_prompt: Maximum token length for the input prompt (default: 256).
        max_new_tokens: Maximum new tokens to generate (default: 256).
        start_index: Index to start processing from (default: 0, useful for resuming).
            Negative values are clamped to 0.
        end_index: Exclusive index to stop at (default: None = end of the dataset).
            Values beyond the dataset length are clamped to the length.
    """
    # Set model to evaluation mode
    model.eval()

    # Move model to CUDA
    # NOTE(review): the device is hard-coded; the prompt tensors below are moved to "cuda" as well.
    model.to("cuda")

    # Handle index bounds
    dataset_len = len(dataset_subset)
    start_index = max(0, start_index)
    end_index = dataset_len if end_index is None else min(dataset_len, end_index)

    # Process each sample in the specified range
    for i in range(start_index, end_index):
        sample = dataset_subset[i]

        # Extract question and gold answer from the sample
        question_text = sample["question"]
        gold_raw = sample["answer"]
        gold_answer = _extract_hash_answer(gold_raw)

        # Build the chat-formatted prompt (system + user turn) with the tokenizer's chat template
        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question_text},
        ]
        full_prompt_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Tokenize the prompt
        # NOTE(review): the templated text may already contain a BOS token, in which case
        # tokenizing with default settings could prepend a second one — confirm.
        # NOTE(review): truncation to `max_length_prompt` presumably cuts from the end of
        # the prompt, which would drop the generation prompt for long questions — confirm.
        inputs = tokenizer(
            full_prompt_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length_prompt,
        ).to("cuda")

        # Get the length of the prompt tokens
        prompt_len = inputs["input_ids"].shape[1]

        # Generate `num_attempts` independently sampled answers for the sample
        sampled_answers = []
        for attempt in range(num_attempts):
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=0.8,
                    top_p=0.95,
                )

            # Extract newly generated tokens (after the prompt)
            gen_tokens = outputs[0, prompt_len:]
            completion_text = tokenizer.decode(gen_tokens, skip_special_tokens=True)

            # Extract and validate the predicted answer; an answer only counts as correct
            # if it is a valid number AND equals the gold answer once '$' and ',' are removed
            # (string comparison, so e.g. '5.0' and '5' are NOT considered equal).
            predicted_answer = extract_xml_answer(completion_text)
            answer_is_valid = _is_valid_number(predicted_answer)
            if answer_is_valid:
                pred_clean = predicted_answer.replace("$", "").replace(",", "").strip()
                gold_clean = gold_answer.replace("$", "").replace(",", "").strip()
                correct = pred_clean == gold_clean
            else:
                correct = False

            # Store attempt details
            sampled_answers.append(
                {
                    "raw_text": completion_text,
                    "predicted_answer": predicted_answer,
                    "valid_number": answer_is_valid,
                    "correct": correct,
                }
            )

        # Compile the record for this sample
        record = {
            "index": i,
            "question": question_text,
            "gold_answer": gold_answer,
            "model_outputs": sampled_answers,
        }

        now = datetime.now()

        timestamp_str = now.strftime("%b-%d-%Y_%I-%M-%S%p")

        # Local temp file name: random 3-character tag + timestamp
        filename = f"{_random_three_alphanumeric()}_{timestamp_str}.json"
        # Upload to S3 with a 1-based index in the key (the record's "index" field stays 0-based)
        upload_jsonable_object_to_s3(
            record, f'{s3_prefix}{i + 1}', temp_filepath=path.join("data", filename)
        )


In [None]:
# Load GSM8K ('main' config) and shuffle its test split with a fixed seed.
gsm8k_data = datasets.load_dataset('openai/gsm8k', 'main')
test_set = gsm8k_data['test']
test_set_shuffled = test_set.shuffle(seed=250217)

# System prompt: one worked example followed by the <reasoning>/<answer> layout
# the model is asked to reproduce for every question.
SYSTEM_PROMPT = """
### EXAMPLE ###
Q: 3+2
<reasoning>
3 plus 2 is 5
</reasoning>
<answer>
5
</answer>

Now follow the same format EXACTLY for each question:

<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Prefix of the per-record names handed to the upload helper by generate_answers
# (name = prefix + 1-based sample index).
S3_FINE_TUNE_PREFIX = 'gsm8k_lora_reward_fixes_3_lora_64_checkpoints_post_finetune_800_2_'

# Process shuffled-test-set indices 60..79 (end_index is exclusive) with the LoRA model.
# NOTE(review): the range is hard-coded — presumably a manual resume of an earlier run; verify.
generate_answers(
    model=lora_model_trained,
    tokenizer=tokenizer,
    system_prompt=SYSTEM_PROMPT,
    dataset_subset=test_set_shuffled,
    s3_prefix=S3_FINE_TUNE_PREFIX,
    start_index=60,
    end_index=80,
)

## Evaluating results

In [None]:
# Key prefix of the records to compare against — presumably the baseline model's
# outputs generated with the new prompt; verify against the run that produced them.
PREFIX_NEW = "gsm8k_llama_7b_100_record_test_new_prompt_"

In [None]:
# Record number to inspect. generate_answers builds its record names from `i + 1`, so
# for records written by it, number 70 corresponds to dataset index 69.
IND = 70
record_tmp = download_and_display_object_from_s3(f'{PREFIX_NEW}{IND}')

In [None]:
# Same record number from the fine-tuned run. Note: this rebinds `record_tmp`.
record_tmp = download_and_display_object_from_s3(f'{S3_FINE_TUNE_PREFIX}{IND}')