In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install transformers==4.51.3
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# fourbit_models = [
#     "unsloth/mistral-7b-bnb-4bit",
#     "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
#     "unsloth/llama-2-7b-bnb-4bit",
#     "unsloth/llama-2-13b-bnb-4bit",
#     "unsloth/codellama-34b-bnb-4bit",
#     "unsloth/tinyllama-bnb-4bit",
#     "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
#     "unsloth/gemma-2b-bnb-4bit",
# ] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.9: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [5]:
!git clone https://github.com/gita/Datasets

Cloning into 'Datasets'...
remote: Enumerating objects: 1986, done.[K
remote: Counting objects: 100% (247/247), done.[K
remote: Compressing objects: 100% (246/246), done.[K
remote: Total 1986 (delta 2), reused 219 (delta 0), pack-reused 1739 (from 1)[K
Receiving objects: 100% (1986/1986), 220.96 MiB | 14.75 MiB/s, done.
Resolving deltas: 100% (832/832), done.
Updating files: 100% (1324/1324), done.


In [7]:
import json
import os

# Define the path to the directory containing your JSON files
base_dir = '/content/Datasets/Ayurveda/charak-samhita'

# Initialize a list to hold all conversations
all_conversations = []

# Traverse through all subdirectories and files
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for entry in data:
                    verse_id = entry.get('verse_id', 'Unknown')
                    text = entry.get('text', '')
                    # Create a prompt and response
                    prompt = f"Explain verse {verse_id}: {text}"
                    # Placeholder response; replace with actual explanation if available
                    response = "This verse discusses..."
                    conversation = {
                        "conversations": [
                            {"from": "human", "value": prompt},
                            {"from": "gpt", "value": response}
                        ]
                    }
                    all_conversations.append(conversation)

# Write all conversations to a JSONL file
with open('charak_samhita_conversations.jsonl', 'w', encoding='utf-8') as f:
    for convo in all_conversations:
        json.dump(convo, f, ensure_ascii=False)
        f.write('\n')


In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.5.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [26]:
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Load the dataset from the JSONL file
dataset = load_dataset('json', data_files='charak_samhita_conversations.jsonl', split='train')

# Define the formatting function
def formatting_func(examples):
    texts = []
    # examples is a dictionary with keys corresponding to dataset columns
    # examples["conversations"] is a list of lists, where each inner list is a conversation for one example in the batch
    if "conversations" in examples and isinstance(examples["conversations"], list):
        for conversation_list in examples["conversations"]:
            formatted_conversation = ""
            # Ensure conversation_list is a list before iterating
            if isinstance(conversation_list, list):
                # conversation_list is a list of dictionaries, where each dictionary is a turn in the conversation
                for turn in conversation_list:
                    # Ensure turn is a dictionary before accessing keys
                    if isinstance(turn, dict):
                        if turn.get("from") == "human":
                            formatted_conversation += f"### Human:\n{turn.get('value', '')}\n"
                        elif turn.get("from") == "gpt":
                            formatted_conversation += f"### Assistant:\n{turn.get('value', '')}\n"
            texts.append(formatted_conversation + "### End") # Add a separator at the end
    # Return the list of processed strings directly
    return texts


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text", # Update this to the correct field name
    max_seq_length = max_seq_length,
    dataset_num_proc = 1, # Changed from 2 to 1
    packing = False, # Can make training 5x faster for short sequences.
    formatting_func = formatting_func, # Add the formatting function here
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/8215 [00:00<?, ? examples/s]

In [31]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,215 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 0/7,000,000,000 (0.00% trained)


AttributeError: 'MistralAttention' object has no attribute 'apply_qkv'

In [21]:
model.save_pretrained("fine_tuned_mistral")
tokenizer.save_pretrained("fine_tuned_mistral")

('fine_tuned_mistral/tokenizer_config.json',
 'fine_tuned_mistral/special_tokens_map.json',
 'fine_tuned_mistral/tokenizer.model',
 'fine_tuned_mistral/added_tokens.json',
 'fine_tuned_mistral/tokenizer.json')

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("/content/fine_tuned_mistral")
model = AutoModelForCausalLM.from_pretrained("/content/fine_tuned_mistral")

In [24]:
input_text = "Explain verse 1: We shall now expound the chapter entitled..."

In [30]:
import torch
# Import FastLanguageModel
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# 1. Select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Load your saved model & tokenizer using FastLanguageModel
model_dir = "./fine_tuned_mistral"  # or your path
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the model using FastLanguageModel.from_pretrained
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_dir, # Load from the saved directory
    max_seq_length = max_seq_length, # Use the same max_seq_length as before
    dtype = None, # Auto detect
    load_in_4bit = True, # Use 4bit quantization
)

# 3. Move model to GPU/CPU (FastLanguageModel already handles device placement, but keeping for clarity if needed)
# model.to(device) # FastLanguageModel handles this

# 4. Prepare your prompt
input_text = "Explain verse 1: We shall now expound the chapter entitled..."

# 5. Tokenize and move tensors to the same device
inputs = tokenizer(input_text, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()} # Use model.device

# 6. Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=False,      # greedy; you can also experiment with sampling
    pad_token_id=tokenizer.eos_token_id,
)

# 7. Decode
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

==((====))==  Unsloth 2025.5.9: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Explain verse 1: We shall now expound the chapter entitled...
Explain verse 2:  The chapter on the...
Explain verse 3:  The chapter on the...
Explain verse 4:  The chapter on the...
Explain verse 5:  The chapter on the...
Explain verse 6:  The chapter on the...
Explain verse 7:  The chapter on the...
Explain verse 8:  The chapter on the...
Explain verse 9:  The
