### Installation

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm==0.8.5
    !pip install git+https://github.com/meta-llama/synthetic-data-kit.git
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm==0.8.5
    !pip install git+https://github.com/meta-llama/synthetic-data-kit.git


In [2]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

### Unsloth



We'll use https://github.com/meta-llama/synthetic-data-kit to generate question and answer pairs **fully locally** which will be used for finetuning Llama 3.2 3B!

In [None]:
from unsloth.dataprep import SyntheticDataKit

generator = SyntheticDataKit.from_pretrained(
    # Choose any model from https://huggingface.co/unsloth
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = 2048, # Longer sequence lengths will be slower!
)

## Generate QA Pairs + Auto clean data
We now use synthetic data kit for question answer pair generation:

In [4]:
generator.prepare_qa_generation(
    output_folder = "data", # Output location of synthetic data
    temperature = 0.7, # Higher temp makes more diverse datases
    top_p = 0.95,
    overlap = 64, # Overlap portion during chunking
    max_generation_tokens = 512, # Can increase for longer QA pairs
)

Check if it succeeded:

In [None]:
!synthetic-data-kit system-check

## Document Parsing (PDF, CSV, HTML etc.)
Now, let's take the Byte Latent Transformer: Patches Scale Better Than Tokens research paper found at https://arxiv.org/abs/2412.09871 and covert it to Q&A pairs in order to finetune Llama 3.2!

In [6]:
# Step 1: Ingest PDF
!synthetic-data-kit \
    -c synthetic_data_kit_config.yaml \
    ingest "research_paper.pdf"

# Step 2: Chunk the text output
filenames = generator.chunk_data("data/output/research_paper.txt")
print(len(filenames), filenames[:3])

import time

# Step 3: Generate 25 QA pairs from first 3 chunks
for filename in filenames:
    !synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create {filename} \
        --num-pairs 25 \
        --type "qa"
    time.sleep(2)

# Step 4: Save QA pairs in OpenAI fine-tuning format
qa_pairs_filenames = [
    f"data/generated/research_paper_{i}_qa_pairs.json"
    for i in range(len(filenames))
]

for filename in qa_pairs_filenames:
    !synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        save-as {filename} -f ft

# Step 5: Load into HuggingFace Dataset
from datasets import Dataset
import pandas as pd

final_filenames = [
    f"data/final/research_paper_{i}_qa_pairs_ft.json"
    for i in range(len(filenames))
]

conversations = pd.concat([
    pd.read_json(name) for name in final_filenames
]).reset_index(drop=True)

dataset = Dataset.from_pandas(conversations)

[2K[32m⠙[0m Processing research_paper.pdf...
[1A[2K[32m Text successfully extracted to [0m[1;32mdata/output/research_paper.txt[0m
8 ['data/output/research_paper_0.txt', 'data/output/research_paper_1.txt', 'data/output/research_paper_2.txt']
[2KProcessing 4 chunks to generate QA pairs...
[2KBatch processing complete.
[2KGenerated 23 QA pairs total
[2KSaving result to data/generated/research_paper_0_qa_pairs.json
[2KSuccessfully wrote test file to data/generated/test_write.json
[2KSuccessfully wrote result to data/generated/research_paper_0_qa_pairs.json
[2K[32m⠹[0m Generating qa content from data/output/research_paper_0.txt...
[1A[2K[32m Content saved to [0m[1;32mdata/generated/research_paper_0_qa_pairs.json[0m
[2KProcessing 4 chunks to generate QA pairs...
[2KBatch processing complete.
[2KGenerated 20 QA pairs total
[2KSaving result to data/generated/research_paper_1_qa_pairs.json
[2KSuccessfully wrote test file to data/generated/test_write.json
[2KSuccess

In [7]:
# # Byte Latent Transformer: Patches Scale Better Than Tokens paper in HTML format
# !synthetic-data-kit \
#     -c synthetic_data_kit_config.yaml \
#     ingest "https://arxiv.org/html/2412.09871v1"

# # Truncate document
# filenames = generator.chunk_data("data/output/arxiv_org.txt")
# print(len(filenames), filenames[:3])

In [8]:
# import time
# # Process 3 chunks for now -> can increase but slower!
# for filename in filenames[:3]:
#     !synthetic-data-kit \
#         -c synthetic_data_kit_config.yaml \
#         create {filename} \
#         --num-pairs 25 \
#         --type "qa"
#     time.sleep(2) # Sleep some time to leave some room for processing

Optionally, you can clean up the data via pruning "bad" or low quality examples and convert the rest to JSON format for finetuning!

In [9]:
# !synthetic-data-kit \
#     -c synthetic_data_kit_config.yaml \
#     curate --threshold 0.0 \
#     "data/generated/arxiv_org_0_qa_pairs.json"

We now convert the generated datasets into QA formats so we can load it for finetuning:

In [10]:
# qa_pairs_filenames = [
#     f"data/generated/arxiv_org_{i}_qa_pairs.json"
#     for i in range(len(filenames[:3]))
# ]
# for filename in qa_pairs_filenames:
#     !synthetic-data-kit \
#         -c synthetic_data_kit_config.yaml \
#         save-as {filename} -f ft

Let's load up the data and see what the synthetic data looks like!

In [11]:
# from datasets import Dataset
# import pandas as pd
# final_filenames = [
#     # f"data/final/arxiv_org_{i}_qa_pairs_ft.json"
#     for i in range(len(filenames[:3]))
# ]
# conversations = pd.concat([
#     pd.read_json(name) for name in final_filenames
# ]).reset_index(drop = True)

# dataset = Dataset.from_pandas(conversations)

In [12]:
dataset[0]

{'messages': [{'content': 'You are a helpful assistant.', 'role': 'system'},
  {'content': 'Whoever, intending to facilitate or knowing it to be likely that he will thereby facilitate the concealment of design to commit offence punishable with imprisonment.',
   'role': 'user'},
  {'content': 'Section 120.', 'role': 'assistant'}]}

In [None]:
for i in range(0,34):
  print(dataset[i])


In [14]:
len(dataset)


196

Finally free vLLM process to save memory and to allow for finetuning!

In [15]:
generator.cleanup()

Attempting to terminate the VLLM server gracefully...
Server terminated gracefully.


### Fine-tuning Synthetic Dataset with Unsloth

In [16]:
from unsloth import FastLanguageModel
import torch

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
    # Qwen3 new models
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    # Other very popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [17]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.4.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the `Llama-3.2` format for conversation style finetunes. The chat template renders conversations like below: (Cutting Knowledge Date is by default there!)

```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 May 2025

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is 1+1?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

2<|eot_id|>
```

In [18]:
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

# Get our previous dataset and format it:
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

See result of the first row:

In [19]:
dataset[0]

{'messages': [{'content': 'You are a helpful assistant.', 'role': 'system'},
  {'content': 'Whoever, intending to facilitate or knowing it to be likely that he will thereby facilitate the concealment of design to commit offence punishable with imprisonment.',
   'role': 'user'},
  {'content': 'Section 120.', 'role': 'assistant'}],
 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 10 May 2025\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhoever, intending to facilitate or knowing it to be likely that he will thereby facilitate the concealment of design to commit offence punishable with imprisonment.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nSection 120.<|eot_id|>'}

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`.

In [20]:
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/196 [00:00<?, ? examples/s]

In [21]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
1.088 GB of memory reserved.


In [22]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 196 | Num Epochs = 3 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Step,Training Loss
1,4.3505
2,4.3387
3,4.2388
4,3.9467
5,3.141
6,3.2399
7,2.6949
8,2.3692
9,2.0351
10,1.8751


In [23]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

65.8848 seconds used for training.
1.1 minutes used for training.
Peak reserved memory = 1.582 GB.
Peak reserved memory for training = 0.494 GB.
Peak reserved memory % of max memory = 10.732 %.
Peak reserved memory for training % of max memory = 3.351 %.


<a name="Inference"></a>
### Inference
Let's run the model! Use `apply_chat_template` with `add_generation_prompt` set to `True` for inference.

In [24]:
messages = [
    {"role": "user", "content": "According to the text, what is the minimum age for a woman to be regarded as capable of giving consent?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer,
                   max_new_tokens = 256, temperature = 0.1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The text does not mention the minimum age for a woman to be regarded as capable of giving consent.<|eot_id|>


The model learns about the research paper!!

In [25]:
messages = [
    {"role": "user", "content": "What is the punishment for obstructing the sale of property offered by a public servant?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer,
                   max_new_tokens = 256, temperature = 0.1)

Punishment: 10 Years + Fine<|eot_id|>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [26]:
model.save_pretrained("indianlegal1")  # Local saving
tokenizer.save_pretrained("indianlegal1")

# Online saving to Hugging Face Hub
model.push_to_hub("Shritu/indianlegal1", token="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")
tokenizer.push_to_hub("Shritu/indianlegal1", token="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")


README.md:   0%|          | 0.00/611 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Shritu/indianlegal1


No files have been modified since last commit. Skipping to prevent empty commit.


Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [27]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
messages = [
    {"role": "user", "content": "What is so special about BLT's tokenization?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer,
                   max_new_tokens = 256, temperature = 0.1)

It is special because it is a tokenization of a BLT.<|eot_id|>


### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [33]:
# Merge to 16bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if True: # Change to True to upload finetune
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "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")

# Merge to 4bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: # Change to True to upload finetune
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "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")

# Just LoRA adapters
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: # Change to True to upload finetune
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

Unsloth: You are pushing to hub, but you passed your HF username = hf.
We shall truncate hf/model to model


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 1.52 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 25%|██▌       | 4/16 [00:00<00:00, 29.29it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 32.12 MiB is free. Process 19692 has 12.75 GiB memory in use. Process 140596 has 1.95 GiB memory in use. Of the allocated memory 1.73 GiB is allocated by PyTorch, and 69.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [34]:
# Save to 8bit Q8_0
if False:
    model.save_pretrained_gguf("model", tokenizer,)
if False: # Change to True to upload finetune
    model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: # Change to True to upload finetune
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if True: # Change to True to upload finetune
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "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")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 1.66 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


  0%|          | 0/16 [00:00<?, ?it/s]


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 1.62 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


  0%|          | 0/16 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 12.12 MiB is free. Process 19692 has 12.75 GiB memory in use. Process 140596 has 1.97 GiB memory in use. Of the allocated memory 1.75 GiB is allocated by PyTorch, and 65.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import os
import torch
from unsloth import load_model

# Set memory configuration to reduce fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load model on CPU to avoid GPU OOM issues
model = load_model("path/to/4bit_model", device_map="cpu")

# Merge and save to 16bit (f16) or lower quantization
model.push_to_hub_gguf(
    "hf/model",
    tokenizer="path/to/tokenizer",
    quantization_method="f16",  # Change to "q4_k_m" if 16bit is too large
    token="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"
)

print("✅ Model pushed successfully!")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Replace with your model name
model_name = "Shritu/indianlegal1"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Generate a response (example)
inputs = tokenizer("What is the legal process in India for filing a case?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)
