Load model from Lora configs saved in HF repo

In [1]:
from unsloth import FastLanguageModel

def load_model(model_name, max_seq_length, dtype, load_in_4bit, output_hidden_states=False):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        output_hidden_states=output_hidden_states,
    )
    return model, tokenizer

LLAMA = {
    "src_model": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "finetuned_model": "saisasanky/Llama-3.1-8B-Instruct-bnb-4bit-aish",
    "gguf_model": "saisasanky/Llama-3.1-8B-Instruct-bnb-4bit-aish-gguf",
    "chat_template": "llama-3.1",
    "instruction_template": "<|start_header_id|>user<|end_header_id|>\n\n",
    "response_template": "<|start_header_id|>assistant<|end_header_id|>\n\n",
}
QWEN = {
    "src_model": "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit",
    "finetuned_model": "saisasanky/Qwen2.5-Coder-14B-Instruct-bnb-4bit-aish",
    "gguf_model":  "saisasanky/Qwen2.5-Coder-14B-Instruct-bnb-4bit-aish-gguf",
    "chat_template": "qwen-2.5",
    "instruction_template": "<|im_start|>user\n",
    "response_template": "<|im_start|>assistant\n",
}

model_config = LLAMA

model, tokenizer = load_model(
    model_name=model_config["finetuned_model"],
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Standard import failed for UnslothRewardTrainer: No module named 'UnslothRewardTrainer'. Using tempfile instead!
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA RTX A5000. Num GPUs = 1. Max memory: 23.573 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
model = FastLanguageModel.for_inference(model)

Load test dataset from HF

In [3]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = model_config["chat_template"],
)

from datasets import load_dataset
dataset = load_dataset("westenfelder/NL2SH-ALFA", "test", split = "train")

def formatting_prompts_func(examples):
    nls = examples['nl']
    messages_wo_completion = [
        [
            {"role": "system", "content": "You are an assistant that provides exact bash command for given input"},
            {"role": "user", "content": nl}
        ]
        for nl in nls
    ]
    return {
        'messages_witout_completion': messages_wo_completion,
    }

dataset = dataset.map(formatting_prompts_func, batched=True)
dataset[0]

{'nl': 'list files in the current directory',
 'bash': 'ls',
 'bash2': 'ls -l',
 'difficulty': 0,
 'messages_witout_completion': [{'content': 'You are an assistant that provides exact bash command for given input',
   'role': 'system'},
  {'content': 'list files in the current directory', 'role': 'user'}]}

In [4]:
from tqdm import tqdm

In [5]:
def inference_aish(
    model,
    tokenizer,
    messages
):
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors = "pt",
        padding=True,
        padding_side='left',
    ).to("cuda")
    input_length = inputs.shape[1]
    n_examples = inputs.shape[0]
    batch_size = 32
    completions = []
    for i in tqdm(range(0, len(inputs), batch_size)):
        j = min(i + batch_size, n_examples)
        outputs = model.generate(
            input_ids = inputs[i:j],
            max_new_tokens = 128,
            use_cache = True,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id,
            bos_token_id = tokenizer.bos_token_id,
        )

        # Extract generated part (exclude the prompt)
        generated_tokens = outputs[:, input_length:]

        completions.extend(
            tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True
            )
        )

    return completions


messages = [example['messages_witout_completion'] for example in dataset]
print(len(messages))
completions = inference_aish(model, tokenizer, messages)

300


  0%|          | 0/10 [00:00<?, ?it/s]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10/10 [02:00<00:00, 12.05s/it]


In [6]:
import json

def save_completions_to_jsonl(completions, output_file):
    with open(output_file, 'w') as f:
        for completion in completions:
            json.dump({"completion": completion}, f)
            f.write('\n')

save_completions_to_jsonl(completions, "completions.jsonl")

In [7]:
n_exact_matches = 0
for completion, example in zip(completions, dataset):
  if completion == example['bash'] or completion == example['bash2']:
    n_exact_matches += 1
  else:
    print("LLM: ", completion)
    print("Human ans: ", example['bash'])
    print("------------------")

LLM:  find. -type f -name '*'
Human ans:  ls
------------------
LLM:  find. -name '*' -print
Human ans:  ls -a
------------------
LLM:  sudo mkdir /testbed/test_dir
Human ans:  mkdir /testbed/test_dir
------------------
LLM:  echo 'hello world'
Human ans:  echo hello world
------------------
LLM:  echo $USER
Human ans:  whoami
------------------
LLM:  rm -rf fake_dir
Human ans:  rmdir fake_dir
------------------
LLM:  cat setup_nl2b_fs_1.sh | head -n 5
Human ans:  head -n 5 setup_nl2b_fs_1.sh
------------------
LLM:  tail -5 setup_nl2b_fs_1.sh
Human ans:  tail -n 5 setup_nl2b_fs_1.sh
------------------
LLM:  sed -n '10p' setup_nl2b_fs_1.sh
Human ans:  sed -n 10p setup_nl2b_fs_1.sh
------------------
LLM:  echo $(which bash)
Human ans:  which bash
------------------
LLM:  sar -u
Human ans:  vmstat
------------------
LLM:  sysctl -a | grep'sysctl.net.ipv4.tcp_available_congestion_control'
Human ans:  w
------------------
LLM:  htop
Human ans:  ps
------------------
LLM:  htop
Human ans: 

In [8]:
n_exact_matches

53