In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

Load model from Lora configs saved in HF repo

In [2]:
from unsloth import FastLanguageModel

def load_model(model_name, max_seq_length, dtype, load_in_4bit, output_hidden_states=False):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        output_hidden_states=output_hidden_states,
    )
    return model, tokenizer

model, tokenizer = load_model(
    model_name="saisasanky/Llama-3.1-8B-Instruct-4bit-aish",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

Load test dataset from HF

In [9]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

from datasets import load_dataset
dataset = load_dataset("westenfelder/NL2SH-ALFA", "test", split = "train")

def formatting_prompts_func(examples):
    nls = examples['nl']
    messages_wo_completion = [
        [
            {"role": "system", "content": "You are an assistant that provides exact bash command for given input"},
            {"role": "user", "content": nl}
        ]
        for nl in nls
    ]
    return {
        'messages_witout_completion': messages_wo_completion,
    }

dataset = dataset.map(formatting_prompts_func, batched=True)
# dataset.drop(columns=['nl','bash'])
dataset[0]
# dataset[0]['messages_with_cmd']

{'nl': 'list files in the current directory',
 'bash': 'ls',
 'bash2': 'ls -l',
 'difficulty': 0,
 'messages_witout_completion': [{'content': 'You are an assistant that provides exact bash command for given input',
   'role': 'system'},
  {'content': 'list files in the current directory', 'role': 'user'}],
 'messages_with_completion': [{'content': 'You are an assistant that provides exact bash command for given input',
   'role': 'system'},
  {'content': 'list files in the current directory', 'role': 'user'},
  {'content': 'ls', 'role': 'assistant'}]}

In [10]:
from tqdm import tqdm

In [11]:
def inference_aish(
    model,
    tokenizer,
    messages
):
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors = "pt",
        padding=True,
        padding_side='left',
    ).to("cuda")
    input_length = inputs.shape[1]
    n_examples = inputs.shape[0]
    batch_size = 32
    completions = []
    for i in tqdm(range(0, len(inputs), batch_size)):
        j = min(i + batch_size, n_examples)
        outputs = model.generate(
            input_ids = inputs[i:j],
            max_new_tokens = 128,
            use_cache = True,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id,
            bos_token_id = tokenizer.bos_token_id,
        )

        # Step 3: Extract generated part (exclude the prompt)
        generated_tokens = outputs[:, input_length:]

        completions.extend(
            tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True
            )
        )

    return completions


messages = [example['messages_witout_completion'] for example in dataset]
print(len(messages))
completions = inference_aish(model, tokenizer, messages)

300


100%|██████████| 10/10 [10:09<00:00, 60.99s/it]


In [21]:
import json

def save_completions_to_jsonl(completions, output_file):
    with open(output_file, 'w') as f:
        for completion in completions:
            json.dump({"completion": completion}, f)
            f.write('\n')

save_completions_to_jsonl(completions, "completions.jsonl")

In [19]:
n_exact_matches = 0
for completion, example in zip(completions, dataset):
  if completion == example['bash'] or completion == example['bash2']:
    n_exact_matches += 1
  else:
    print("LLM: ", completion)
    print("Human ans: ", example['bash'])
    print("------------------")

LLM:  lsof -i :80
Human ans:  lsof
------------------
LLM:  mkdir -p /testbed/test_dir
Human ans:  mkdir /testbed/test_dir
------------------
LLM:  date "+%Y-%m-%d %H:%M:%S"
Human ans:  date
------------------
LLM:  rm -rf fake_dir
Human ans:  rmdir fake_dir
------------------
LLM:  printenv | sort
Human ans:  env
------------------
LLM:  echo ~$USER
Human ans:  echo $HOME
------------------
LLM:  print $HOME
Human ans:  echo $PATH
------------------
LLM:  cat /home/user/Downloads/setup_nl2b_fs_1.sh
Human ans:  cat setup_nl2b_fs_1.sh
------------------
LLM:  nl2b_fs_1.sh|head -5
Human ans:  head -n 5 setup_nl2b_fs_1.sh
------------------
LLM:  tail -5 setup_nl2b_fs_1.sh
Human ans:  tail -n 5 setup_nl2b_fs_1.sh
------------------
LLM:  setup_nl2b_fs_1.sh: line 10: print
Human ans:  sed -n 10p setup_nl2b_fs_1.sh
------------------
LLM:  BASH=/usr/bin/bash
Human ans:  which bash
------------------
LLM:  vmstat 1
Human ans:  vmstat
------------------
LLM:  uptime
Human ans:  w
------------

In [20]:
n_exact_matches

44