In [1]:
!pip install transformers datasets trl peft bitsandbytes accelerate torch sentencepiece huggingface_hub -U
!pip install --upgrade trl transformers accelerate
!pip install --upgrade deepspeed==0.14.4

# ATTENTION: Be sure to restart the notebook after installing all packages!

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.me

In [1]:
from huggingface_hub import interpreter_login
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import warnings
from IPython.display import display, Markdown

# Suppress warnings if needed
warnings.filterwarnings("ignore")

# --- Configuration ---
base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
# *** IMPORTANT: Update this path to where your LoRA adapter weights are saved ***
adapter_path = "./results_llama3_finetuned"
# Set device (use "cuda" if GPU is available, otherwise "cpu")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Optional: Quantization config for loading base model (if needed for memory)
# Adjust based on your inference hardware and how you trained/saved.
# If you merge first, you might load the base without quantization here.
use_quantization = False # Set to True if you want to load base model in 4-bit
compute_dtype = torch.bfloat16 # Or torch.float16

bnb_config = None
if use_quantization:
    print("Loading base model with 4-bit quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False, # Typically False for inference
    )
else:
    print(f"Loading base model with dtype: {compute_dtype}")


# --- Load Base Model and Tokenizer ---
print(f"Loading base model: {base_model_id}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype if not use_quantization else None, # Set dtype if not quantizing
    device_map="auto", # Automatically handle device placement
    trust_remote_code=True,
)

print(f"Loading tokenizer for: {base_model_id}")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Set pad token if needed
tokenizer.padding_side = "right"

# --- Load LoRA Adapter and Merge ---
print(f"Loading LoRA adapter from: {adapter_path}")
# Load the LoRA adapter weights on top of the base model
# Note: device_map="auto" should handle placing the adapter correctly
model = PeftModel.from_pretrained(model, adapter_path)
print("LoRA adapter loaded.")

print("Merging LoRA adapter into the base model...")
# Merge the adapter weights into the base model weights
# This returns a standard transformers model object
model = model.merge_and_unload()
print("Adapter merged.")

# Set the model to evaluation mode
model.eval()

def get_response(prompt_text):
    # --- Prepare Prompt using the Correct Chat Template ---
    # *** CRITICAL: Use the exact same format as your training data ***
    # Example Llama 3.2 Instruct format (VERIFY THIS STRUCTURE!)
    # prompt_text = "Investigating the effects of manipulation on the gale shapley algorithm."

    # Format the prompt using the chat template structure
    # Ensure this matches the format_dataset_batch function from your training script
    formatted_prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

    {prompt_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

    """ # Note the trailing space and newline for the assistant's turn

    print("\nFormatted Prompt:")
    print(formatted_prompt)

    # --- Tokenize Input ---
    print("Tokenizing prompt...")
    # Encode the formatted prompt into token IDs
    # Ensure tensors are moved to the same device as the model
    inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=False, truncation=True).to(device)

    # --- Generate Output ---
    print("Generating response...")
    # Generate text using the model
    # Adjust generation parameters as needed (max_new_tokens, temperature, top_p, etc.)
    with torch.no_grad(): # Disable gradient calculations for inference
        outputs = model.generate(
            **inputs,
            max_new_tokens=2048,          # Maximum number of new tokens to generate
            do_sample=True,             # Whether to use sampling; set to False for deterministic output
            temperature=0.6,            # Controls randomness (lower = more deterministic)
            top_p=0.9,                  # Nucleus sampling probability
            eos_token_id=tokenizer.eos_token_id # Stop generation when EOS token is encountered
        )

    # --- Decode and Print Output ---
    # Decode the generated token IDs back into text
    # Skip special tokens (like padding or EOS) in the output string
    # We slice the output to only decode the newly generated tokens (after the input prompt)
    generated_token_ids = outputs[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

    return response

# response = get_response("Investigating the effects of manipulation on the gale shapley algorithm.")
# div = "="*80
# print(f"\n{div}\nModel Response:")
# display(Markdown(response))


# print(response)

# print("\nInference complete.")



  warn(
2025-04-23 16:45:30.018161: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-23 16:45:30.018224: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-23 16:45:30.019583: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-23 16:45:30.026526: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2025-04-23 16:45:31,565] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  @autocast_custom_fwd
  @autocast_custom_bwd


Using device: cuda
Loading base model with dtype: torch.bfloat16
Loading base model: meta-llama/Llama-3.2-3B-Instruct


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading tokenizer for: meta-llama/Llama-3.2-3B-Instruct
Loading LoRA adapter from: ./results_llama3_finetuned
LoRA adapter loaded.
Merging LoRA adapter into the base model...
Adapter merged.


In [None]:
import pandas as pd
from tqdm import tqdm

test_data = pd.read_csv('test_questions.csv')
prompts = test_data['prompt'].tolist()
responses = []

for prompt in tqdm(prompts):
    response = get_response(prompt)
    responses.append(response)

# div = "="*80
# print(f"\n{div}\nModel Response:")
# display(Markdown(response))

test_data.insert(1, 'finetune_response', responses)

In [None]:
test_data = test_data[['topic', 'prompt', 'completion', 'finetune_response']]

test_data.to_csv('eval_data.csv', index=False)


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import warnings
from IPython.display import display, Markdown

# Suppress warnings if needed
warnings.filterwarnings("ignore")

# --- Configuration ---
base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
# *** IMPORTANT: Update this path to where your LoRA adapter weights are saved ***
adapter_path = "./results_llama3_finetuned"
# Set device (use "cuda" if GPU is available, otherwise "cpu")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Optional: Quantization config for loading base model (if needed for memory)
# Adjust based on your inference hardware and how you trained/saved.
# If you merge first, you might load the base without quantization here.
use_quantization = False # Set to True if you want to load base model in 4-bit
compute_dtype = torch.bfloat16 # Or torch.float16

bnb_config = None
if use_quantization:
    print("Loading base model with 4-bit quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False, # Typically False for inference
    )
else:
    print(f"Loading base model with dtype: {compute_dtype}")


# --- Load Base Model and Tokenizer ---
print(f"Loading base model: {base_model_id}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype if not use_quantization else None, # Set dtype if not quantizing
    device_map="auto", # Automatically handle device placement
    trust_remote_code=True,
)

print(f"Loading tokenizer for: {base_model_id}")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Set pad token if needed
tokenizer.padding_side = "right"

# --- Load LoRA Adapter and Merge ---
# print(f"Loading LoRA adapter from: {adapter_path}")
# Load the LoRA adapter weights on top of the base model
# Note: device_map="auto" should handle placing the adapter correctly
# model = PeftModel.from_pretrained(model, adapter_path)
# print("LoRA adapter loaded.")

# print("Merging LoRA adapter into the base model...")
# Merge the adapter weights into the base model weights
# This returns a standard transformers model object
# model = model.merge_and_unload()
# print("Adapter merged.")

# Set the model to evaluation mode
model.eval()

def get_response(prompt_text):
    # --- Prepare Prompt using the Correct Chat Template ---
    # *** CRITICAL: Use the exact same format as your training data ***
    # Example Llama 3.2 Instruct format (VERIFY THIS STRUCTURE!)
    # prompt_text = "Investigating the effects of manipulation on the gale shapley algorithm."

    # Format the prompt using the chat template structure
    # Ensure this matches the format_dataset_batch function from your training script
    formatted_prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

    {prompt_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

    """ # Note the trailing space and newline for the assistant's turn

    # print("\nFormatted Prompt:")
    # print(formatted_prompt)

    # --- Tokenize Input ---
    # print("Tokenizing prompt...")
    # Encode the formatted prompt into token IDs
    # Ensure tensors are moved to the same device as the model
    inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=False, truncation=True).to(device)

    # --- Generate Output ---
    # print("Generating response...")
    # Generate text using the model
    # Adjust generation parameters as needed (max_new_tokens, temperature, top_p, etc.)
    with torch.no_grad(): # Disable gradient calculations for inference
        outputs = model.generate(
            **inputs,
            max_new_tokens=2048,          # Maximum number of new tokens to generate
            do_sample=True,             # Whether to use sampling; set to False for deterministic output
            temperature=0.6,            # Controls randomness (lower = more deterministic)
            top_p=0.9,                  # Nucleus sampling probability
            eos_token_id=tokenizer.eos_token_id # Stop generation when EOS token is encountered
        )

    # --- Decode and Print Output ---
    # Decode the generated token IDs back into text
    # Skip special tokens (like padding or EOS) in the output string
    # We slice the output to only decode the newly generated tokens (after the input prompt)
    generated_token_ids = outputs[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

    return response

# response = get_response("Investigating the effects of manipulation on the gale shapley algorithm.")
# div = "="*80
# print(f"\n{div}\nModel Response:")
# display(Markdown(response))


# print(response)

# print("\nInference complete.")



  warn(
2025-04-23 19:16:58.997342: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-23 19:16:58.997395: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-23 19:16:58.998774: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-23 19:16:59.005343: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2025-04-23 19:17:00,557] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  @autocast_custom_fwd
  @autocast_custom_bwd


Using device: cuda
Loading base model with dtype: torch.bfloat16
Loading base model: meta-llama/Llama-3.2-3B-Instruct


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading tokenizer for: meta-llama/Llama-3.2-3B-Instruct


In [None]:
import pandas as pd
from tqdm.notebook import tqdm

test_data = pd.read_csv('eval_data.csv')
prompts = test_data['prompt'].tolist()
base_responses = []

for prompt in tqdm(prompts):
    response = get_response(prompt)
    base_responses.append(response)

# div = "="*80
# print(f"\n{div}\nModel Response:")
# display(Markdown(response))

test_data.insert(-1, 'base_response', base_responses)

  0%|          | 0/122 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for

ValueError: unbounded slice