In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers bitsandbytes accelerate datasets peft unsloth

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting unsloth
  Downloading unsloth-2025.3.9-py3-none-any.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting unsloth_zoo>=2025.3.8 (from unsloth)
  Downloading unsloth_zoo-2025.3.8-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# ✅ Step 1: Load Dataset
data_path = "2501.12948v1_qa_pairs.json"  # Update with your dataset path
dataset = load_dataset("json", data_files=data_path)

# ✅ Step 2: Split Dataset into Train & Test
dataset = dataset["train"].train_test_split(test_size=0.1)

# ✅ Step 3: Load Tokenizer
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Step 4: Define Preprocessing Function
def preprocess_data(examples):
    """Formats instruction-based dataset into a dictionary for training."""
    prompts = []
    outputs = []

    for instruction, input_text, output in zip(examples["instruction"], examples["input"], examples["output"]):
        prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput: "

        # Tokenize input (Prompt)
        tokenized_prompt = tokenizer(prompt, padding="max_length", truncation=True, max_length=512)

        # Tokenize output (Expected response)
        tokenized_output = tokenizer(output, padding="max_length", truncation=True, max_length=512)

        # Append to lists
        prompts.append(tokenized_prompt["input_ids"])
        outputs.append(tokenized_output["input_ids"])

    # ✅ Return a dictionary instead of a list (Fixes TypeError)
    return {
        "input_ids": prompts,
        "labels": outputs
    }

# ✅ Step 5: Apply Tokenization to the Dataset
train_dataset = dataset["train"].map(
    preprocess_data,
    batched=True,
    remove_columns=["instruction", "input", "output"]
)

test_dataset = dataset["test"].map(
    preprocess_data,
    batched=True,
    remove_columns=["instruction", "input", "output"]
)

# ✅ Step 6: Print Example to Verify
print(train_dataset[0])  # Check structure



Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

{'input_ids': [16664, 25, 3555, 374, 18090, 2668, 52, 5267, 2505, 25, 2157, 68674, 10951, 84260, 1073, 4128, 8660, 13, 18090, 2668, 52, 25, 2157, 300, 324, 12, 6749, 10951, 84260, 1073, 4128, 8660, 304, 8453, 624, 5097, 25, 220, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151

In [None]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

import os
os.environ["WANDB_DISABLED"] = "true"


# ✅ Load Qwen Model with 4-bit Quantization (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# ✅ Prepare Model for Training
model = prepare_model_for_kbit_training(model)

# ✅ Attach LoRA Adapters
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

# ✅ Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=1,  # Small batch size due to limited VRAM
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Simulate larger batch size
    logging_dir="./logs",
    fp16=True,
    num_train_epochs=3,
    save_total_limit=1,
    optim="adamw_bnb_8bit",
    remove_unused_columns=False,  # Prevent Trainer from removing dataset columns
)

# ✅ Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# ✅ Free GPU Memory Before Training
torch.cuda.empty_cache()

# ✅ Start Training
trainer.train()

# ✅ Save LoRA Adapters
model.save_pretrained("/content/drive/MyDrive/ML_Stuff/Qwen/fine_tuned_qwen_lora")
tokenizer.save_pretrained("/content/drive/MyDrive/ML_Stuff/Qwen/fine_tuned_qwen_lora")

print("🎉 Fine-tuning completed with QLoRA! LoRA adapters saved.")


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.497548
2,No log,1.540003


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.497548
2,No log,1.267539


🎉 Fine-tuning completed with QLoRA! LoRA adapters saved.


In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Load base model
model_name = "Qwen/Qwen2.5-3B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0")

# Load fine-tuned LoRA adapters
fine_tuned_lora_path = "/content/drive/MyDrive/ML_Stuff/Qwen/fine_tuned_qwen_lora"
model = PeftModel.from_pretrained(base_model, fine_tuned_lora_path)

# Merge LoRA weights into the base model
model = model.merge_and_unload()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torch
import os
from transformers import AutoModelForCausalLM
from peft import PeftModel

# ✅ Check RAM & Disk Space Before Merging
os.system("free -h")  # Show available RAM
os.system("df -h")    # Show disk space usage

# ✅ Load Base Model with Memory Optimization
model_name = "Qwen/Qwen2.5-3B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Reduce memory usage
    device_map="cpu"  # Load on CPU to prevent GPU crashes
)

# ✅ Free GPU Memory Before Merging (if using Colab/Kaggle)
torch.cuda.empty_cache()

# ✅ Load Fine-Tuned LoRA Adapters
fine_tuned_lora_path = "/content/drive/MyDrive/ML_Stuff/Qwen/fine_tuned_qwen_lora"
model = PeftModel.from_pretrained(base_model, fine_tuned_lora_path).to("cpu")

# ✅ Merge LoRA Weights into Base Model
print("Merging LoRA weights into base model... This may take some time.")
model = model.merge_and_unload()

# ✅ Save the Fully Merged Model
merged_model_path = "/content/drive/MyDrive/ML_Stuff/Qwen/qwen2.5_3b_merged"
model.save_pretrained(merged_model_path)

print(f"🎉 LoRA weights merged successfully! Model saved at {merged_model_path}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Merging LoRA weights into base model... This may take some time.
🎉 LoRA weights merged successfully! Model saved at /content/drive/MyDrive/ML_Stuff/Qwen/qwen2.5_3b_merged


In [None]:
!pip install llama-cpp-python transformers


Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.7.tar.gz (66.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.7-cp311-cp311-linux_x86_64.whl size=4552804 sha256=3d64a10324b6a302d320

In [None]:
# Install required packages
!apt-get install -y cmake build-essential
!pip install llama-cpp-python
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.7.tar.gz (66.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:

In [None]:
!mkdir build
%cd build
!cmake ..
!cmake --build . --config Release
%cd /content/

/content/llama.cpp/build
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.34.1")
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found version "4.5")
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- Configuring done (2.

In [None]:
!pip install ctranslate2

Collecting ctranslate2
  Downloading ctranslate2-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading ctranslate2-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ctranslate2
Successfully installed ctranslate2-4.5.0


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
import subprocess

model_path = "/content/drive/MyDrive/ML_Stuff/Qwen/qwen2.5_3b_merged"  # Replace with your model path
converted_model_path = model_path + "_converted"
gguf_model_path = model_path + "_gguf.gguf"

# Load model in a memory-efficient way (handles multiple safetensors automatically)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype="auto",  # Use automatic precision to reduce memory usage
    low_cpu_mem_usage=True,  # Optimize memory during loading
    device_map="auto",  # Load model on available GPU/CPU
    use_safetensors=True  # Ensure safetensors are correctly loaded
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Save the model in ctranslate2 format
model.save_pretrained(
    converted_model_path + "_ctranslate2",
    from_pt=True,
    max_shard_size="500MB"  # Reduce shard size to prevent memory overflow
)
tokenizer.save_pretrained(converted_model_path + "_ctranslate2")

# Convert to GGUF using llama.cpp
subprocess.run([
    "/content/llama.cpp/convert_hf_to_gguf.py", model_path, "--outfile", gguf_model_path
])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CompletedProcess(args=['/content/llama.cpp/convert_hf_to_gguf.py', '/content/drive/MyDrive/ML_Stuff/Qwen/qwen2.5_3b_merged', '--outfile', '/content/drive/MyDrive/ML_Stuff/Qwen/qwen2.5_3b_merged_gguf'], returncode=0)