In [2]:
import unsloth
from unsloth import FastLanguageModel
import torch
import os

# Shared settings: these three names are passed to the model loader, and
# max_seq_length is additionally passed to the trainer.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [3]:
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: none of the visible cells reads `fourbit_models`; the
# model that is actually loaded is named directly in the loader call.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

In [4]:
# Load the base model and its tokenizer using the shared settings
# (max_seq_length, dtype, load_in_4bit) defined in the configuration cell.
# NOTE(review): the model name carries no "-bnb-4bit" suffix although
# load_in_4bit is True — presumably the loader resolves or quantizes it; confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3", # "unsloth/mistral-7b" for 16bit loading
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.3.15: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    NVIDIA GeForce RTX 4070. Num GPUs = 1. Max memory: 11.719 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
# Wrap the loaded model with LoRA adapters. Besides the attention and MLP
# projections, embed_tokens and lm_head are targeted as well (needed for
# continual pretraining, per the inline comment below).
# NOTE(review): use_rslora=True is combined with r=128 and lora_alpha=32 —
# presumably the effective scale is lora_alpha / sqrt(r); confirm this pairing
# is intended.
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.3.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [6]:
def read_markdown_files(directory):
    """Recursively collect the contents of ``.md`` and ``.txt`` files.

    Directories and files are visited in sorted order, so the position of each
    document in the returned lists is identical on every run and on every
    filesystem (``os.walk`` by itself yields entries in arbitrary order).

    Args:
        directory: Root directory to search.

    Returns:
        A ``(markdown_contents, text_contents)`` tuple of lists of strings
        holding the contents of every readable ``.md`` file and every readable
        ``.txt`` file, respectively. Files that cannot be read are reported on
        stdout and skipped. Undecodable bytes are replaced rather than raising.
    """
    markdown_contents = []
    text_contents = []
    # Both file kinds are read the same way; only the destination list differs.
    destinations = ((".md", markdown_contents), (".txt", text_contents))

    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            bucket = None
            for suffix, candidate in destinations:
                if file.endswith(suffix):
                    bucket = candidate
                    break
            if bucket is None:
                continue

            file_path = os.path.join(root, file)
            try:
                with open(file_path, "r", encoding="utf-8", errors='replace') as f:
                    bucket.append(f.read())
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading file {file_path}: {e}")

    return markdown_contents, text_contents

In [7]:
# with open("../documentation/basics/GeneralTaskKnowledge/Cutting/Cutting_Locations.md") as f:
#     data1 = f.read()

# Collect every Markdown and plain-text document below ../documentation,
# resolved relative to the current working directory.
documentation_root = os.path.join(os.curdir, "../documentation")
docs_content, text_content = read_markdown_files(documentation_root)

In [8]:
# from datasets import load_dataset
# dataset = load_dataset("roneneldan/TinyStories", split = "train[:100]")

# Sanity check: print the third Markdown document that was collected.
print(docs_content[2])

Can usability models help robots learn and improve interaction quality through task data and user feedback?Yes, usability models can indeed support learning in robots to enhance usability over time. In fact, considering usability during the design and development of robotic systems is of paramount importance.

## Use of Usability Models in Robots

Usability models can serve as a guide for how robots should interact with users, helping them to achieve their goals efficiently, effectively, and with a high degree of satisfaction. Over time, as the robot interacts more with its users, it can use these models to learn and adapt itself to better meet the user's needs.

## Task Data & User Feedback

### Task Data

Task data refers to the specific tasks that the robot is programmed to perform. This data can include information on the success or failure of tasks, the time taken to complete tasks, and other metrics that are relevant to the task's performance.

Robots can use this data to improve

In [9]:
from datasets import Dataset, load_dataset, concatenate_datasets

# Wrap the plain-text documents in a dataset with a single "text" column.
data1 = dict(text = text_content)
datasets1 = Dataset.from_dict(data1)

In [10]:
from datasets import Dataset, load_dataset

# Wrap the Markdown documents in a dataset with a single "text" column.
data2 = dict(text = docs_content)
datasets2 = Dataset.from_dict(data2)

In [11]:
# Combine the plain-text dataset and the Markdown dataset into one training set.
datasets = concatenate_datasets([datasets1, datasets2])

In [12]:
# End-of-sequence token of the loaded tokenizer; it is appended to every
# training example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

In [13]:
def formatting_prompts_func(examples):
    """Terminate every text of a batch with the tokenizer's EOS token.

    Args:
        examples: Batch mapping whose "text" entry is a list of strings
            (the layout a batched ``Dataset.map`` passes to its function).

    Returns:
        A dict with a "text" list in which every string ends with EOS_TOKEN.
        Strings that already end with it are returned unchanged, so running
        this cell again does not stack additional EOS tokens.
    """
    return {
        "text" : [
            example if example.endswith(EOS_TOKEN) else example + EOS_TOKEN
            for example in examples["text"]
        ]
    }

# `datasets` is rebound to the mapped result; the idempotent function above
# keeps a re-run of this cell from changing the data a second time.
datasets = datasets.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

In [14]:
# Show the formatted text of the fourth example (slice 3:4) as a spot check.
preview_texts = datasets[3:4]["text"]
for sample_text in preview_texts:
    print("=========================")
    print(sample_text)

query: cut the apple into 4 equal pieces on the table using the sharp black knife


response: comprehensive cram action designator in lisp format is as below
(an action
    (type cutting)
    (object (an object
              (type apple)
              (name "standard-apple")
              (properties (size "medium")
                          (texture "smooth")
                          (color "red")))))
    (tool (a tool
            (type knife)
            (name "sharp-black-knife")
            (properties (sharpness "very-high")
                        (size "medium")
                        (material "steel")
                        (weight "light")
                        (color "black")
                        (edge "smooth")))))
    (location (a location
                (type table)
                (name "kitchen-table")
                (properties (material "wood")
                            (height 0.9)
                            (accessibility "high")
                       

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Build the trainer for the run over the "text" column of `datasets`.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = datasets,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8, # 2 x 8 = 16 sequences per optimizer step

        warmup_ratio = 0.1,
        # Run length is controlled by max_steps alone. A positive max_steps
        # overrides num_train_epochs, so the former `num_train_epochs = 5`
        # never took effect and has been removed to avoid suggesting otherwise.
        max_steps = 100,

        learning_rate = 5e-5,
        embedding_learning_rate = 5e-6,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Tokenizing to ["text"] (num_proc=8):   0%|          | 0/574 [00:00<?, ? examples/s]

In [15]:
# Run training; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 574 | Num Epochs = 3 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 603,979,776/4,362,342,400 (13.85% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.2524
2,1.388
3,1.2866
4,1.2932
5,1.1933
6,1.1301
7,1.2623
8,1.2154
9,1.1563
10,1.2352


## Inference

In [26]:
from transformers import TextIteratorStreamer
from threading import Thread
import textwrap

# Maximum number of characters printed per output line.
max_print_width = 100

prompts = [
    "what is the comprehensive action designator for the task- cut the apple. Give it in lisp format, no further explanation is needed",
    "what are the flanagan motion phases involved in the task- cut the apple. Give it in json format, no further explanation is needed",
    "what are the motion constraints involved in the task- cut the apple. Give it in json format, no further explanation is needed",
    "what are the framenet elements involved in the task- cut the apple. Give it in json format, no further explanation is needed",
    "what are the object and tools involved in the task- cut the apple. Give it in json format, no further explanation is needed"
]

# The streamer is iterated below while generation runs in a worker thread.
text_streamer = TextIteratorStreamer(tokenizer)

inputs = tokenizer([prompts[0]], return_tensors = "pt").to("cuda")

generation_kwargs = dict(
    inputs,
    streamer = text_streamer,
    max_new_tokens = 2048,
    use_cache = True,
)
thread = Thread(target = model.generate, kwargs = generation_kwargs)
thread.start()

# `length` is the number of characters already printed on the current line.
length = 0
for j, new_text in enumerate(text_streamer):
    if j == 0:
        # The first chunk can be longer than one line, so hard-wrap it.
        # textwrap.wrap drops trailing whitespace; re-attach it so the next
        # chunk is not glued onto the last word. An empty or whitespace-only
        # chunk wraps to [], in which case only the whitespace is printed.
        wrapped_lines = textwrap.wrap(new_text, width = max_print_width)
        trailing = new_text[len(new_text.rstrip()):]
        printed = "\n".join(wrapped_lines) + trailing
    else:
        # Break the line before a chunk that would overflow it. Chunks that
        # carry their own newline are printed as-is.
        if "\n" not in new_text and length + len(new_text) > max_print_width:
            print()
            length = 0
        printed = new_text

    print(printed, end = "")

    # Recompute the column: restart after the last newline that was printed,
    # otherwise advance by the number of characters just written.
    last_break = printed.rfind("\n")
    if last_break >= 0:
        length = len(printed) - last_break - 1
    else:
        length += len(printed)

# The streamer is exhausted once generation ends; wait for the worker thread
# so no generation thread is left running behind this cell.
thread.join()

<s> what is the comprehensive action designator for the task- cut the apple. Give it in lisp format,
no further explanation isneeded.

The Comprehensive Action Designator (CAD) for the task "cut the 
apple" in Lisp format would be structured as follows:

(an action
   (type cutting)
   (object (an object

             (type apple)
             (name "standard-red-apple")
             (properties (size "medium")

                         (texture "smooth")
                         (color "red")))))
   (tool (a tool
          
 (type knife)
           (name "chef-knife")
           (properties (sharpness "very-high")

                       (size "large")
                       (material "stainless-steel")
                       
(weight "heavy")
                       (color "silver")))))
   (location (a location
               (type 
cutting-board)
               (name "kitchen-cutting-board")
               (properties (material "bamboo")

                           (height 0.05)
    

### Saving the model

In [32]:
# Write the model and the tokenizer into the same output directory.
lora_output_dir = "lora_model_trained_2"
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

('lora_model_trained_2/tokenizer_config.json',
 'lora_model_trained_2/special_tokens_map.json',
 'lora_model_trained_2/tokenizer.model',
 'lora_model_trained_2/added_tokens.json',
 'lora_model_trained_2/tokenizer.json')

In [25]:
import gc

if True:
    # The recorded run of this cell failed with "Some modules are dispatched on
    # the CPU or the disk", most likely because the training-time model was
    # still resident on the GPU. Drop the old references and release cached
    # CUDA memory before loading a second copy.
    trainer = None
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    model, tokenizer = FastLanguageModel.from_pretrained(
        # Directory written by the save cell of this notebook (previously
        # "lora_model_trained", which this notebook never creates).
        model_name = "lora_model_trained_2",
        max_seq_length = max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit
    )

    FastLanguageModel.for_inference(model)

messages = [
    {
        "role" : "user",
        "content" : "what are the flanagan motion phases involved in the task- cut the apple, give it in json format, no further explanation is needed"
     }
]

# Render the conversation with the tokenizer's chat template and move the
# resulting token ids to the GPU for generation.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

==((====))==  Unsloth 2025.3.6: Fast Mistral patching. Transformers: 4.48.3.
   \\   /|    NVIDIA GeForce RTX 4070. Num GPUs = 1. Max memory: 11.719 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
from transformers import TextStreamer

# Stream the reply for `input_ids` to stdout; skip_prompt=True is set on the
# streamer. Note that this rebinds `text_streamer`.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    max_new_tokens = 128,
    pad_token_id = tokenizer.eos_token_id,
)

In [None]:
# GGUF export. Only the first call below is active; every other variant is
# commented out and kept as a template.

# Save to 8bit Q8_0
if True: model.save_pretrained_gguf("model", tokenizer)

# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!

# if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
# if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")

# if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
# if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

# if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options much faster if you want multiple!
# if False:
#     model.push_to_hub_gguf(
#         "hf/model", # Change hf to your username!
#         tokenizer,
#         quantization_method = ["q4_k_m", "q8_0", "q5_k_m"],
#         token = "" # Get a token at https://huggingface.co/settings/tokens
#     )

## Widgets Options

In [31]:
# import ipywidgets as widgets
# from IPython.display import display
#
# # Create a dropdown widget for prompt selection
# prompt_dropdown = widgets.Dropdown(
#     options=prompts,
#     description="Select Prompt:",
#     disabled=False,
# )
#
# # Create a button to trigger generation
# generate_button = widgets.Button(description="Generate Response")
#
# # Create an output widget to display the generated text
# output_widget = widgets.Output()
#
# def generate_response(b):
#     selected_prompt = prompt_dropdown.value
#     with output_widget:
#         output_widget.clear_output()  # Clear previous output
#         print(f"Generating response for: {selected_prompt}")
#         text_streamer = TextIteratorStreamer(tokenizer)
#         max_print_width = 100
#
#         inputs = tokenizer([selected_prompt], return_tensors="pt").to("cuda")
#
#         generation_kwargs = dict(
#             inputs,
#             streamer=text_streamer,
#             max_new_tokens=2048,
#             use_cache=True,
#         )
#         thread = Thread(target=model.generate, kwargs=generation_kwargs)
#         thread.start()
#
#         length = 0
#         for j, new_text in enumerate(text_streamer):
#             if j == 0:
#                 wrapped_text = textwrap.wrap(new_text, width=max_print_width)
#                 length = len(wrapped_text[-1])
#                 wrapped_text = "\n".join(wrapped_text)
#                 print(wrapped_text, end="")
#             else:
#                 length += len(new_text)
#                 if length >= max_print_width:
#                     length = 0
#                     print()
#                 print(new_text, end="")
#             pass
#         print("\n")
#
# # Link the button click to the generation function
# generate_button.on_click(generate_response)
#
# # Display the widgets
# display(prompt_dropdown, generate_button, output_widget)

## GEMMA3

In [19]:
from unsloth import FastModel
import torch

# Reference list only: the loader call below names its model directly and
# does not read `fourbit_models`. This assignment rebinds the name.
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",


    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

# Load Gemma 3 (4B, instruction-tuned) with 4-bit quantization. This rebinds
# `model` and `tokenizer`.
# NOTE(review): the recorded run of this cell raised a RuntimeError asking
# for transformers >= 4.50.0 — upgrade the package before running it.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4B-it",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

RuntimeError: Unsloth: Gemma 3 only works on transformers >= 4.50.0.
Please use nightly transformers via pip install --upgrade "transformers>=4.49.0"`

In [16]:
# !pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
# !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
# !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
# !pip install --no-deps unsloth
# !pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
!pip install --upgrade "transformers>=4.49.0"

