In [None]:


%%capture
!pip install pymupdf pandas python-docx openpyxl

import fitz
import pandas as pd
from docx import Document
import os
import glob


def read_pdf(file_path):
    """Extract the plain text of every page in a PDF.

    Args:
        file_path: Path to the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The ``page.get_text()`` output of each page concatenated in page
        order. Whitespace is returned exactly as extracted; no normalisation
        is applied.
    """
    # The context manager closes the document (and its file handle) even when
    # a page fails to extract; previously the document was never closed.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Return the non-empty paragraphs of a Word document, one per line."""
    paragraphs = Document(file_path).paragraphs
    non_empty_texts = (p.text for p in paragraphs if p.text)
    return "\n".join(non_empty_texts)

def read_excel(file_path):
    """Render each spreadsheet row as one line of ``column: value`` pairs.

    Cells for which ``pd.notna`` is false are left out of their row's line.
    """
    frame = pd.read_excel(file_path)

    def render(record):
        # One "col: val" fragment per populated cell, comma-separated.
        pairs = []
        for column, value in record.items():
            if pd.notna(value):
                pairs.append(f"{column}: {value}")
        return ", ".join(pairs)

    return "\n".join(render(record) for _, record in frame.iterrows())

def process_directory(directory_path):
    """Extract text from every .pdf, .docx and .xlsx file directly inside a folder.

    Returns a list of ``{"filename": ..., "content": ...}`` dicts, one per file
    that was read without raising. Failures are printed and skipped.
    """
    patterns = ('*.pdf', '*.docx', '*.xlsx')
    files = [
        match
        for pattern in patterns
        for match in glob.glob(os.path.join(directory_path, pattern))
    ]

    print(f"Found {len(files)} files to process.")

    raw_data = []
    for file in files:
        print(f"Processing: {file}")
        try:
            # The suffixes are mutually exclusive, so branch order is irrelevant.
            if file.endswith('.xlsx'):
                content = read_excel(file)
            elif file.endswith('.docx'):
                content = read_docx(file)
            elif file.endswith('.pdf'):
                content = read_pdf(file)
            raw_data.append({"filename": file, "content": content})
        except Exception as e:
            print(f"Error processing {file}: {e}")

    return raw_data

print("Ingestion pipeline ready. Upload your files to the Colab 'Files' tab.")

In [None]:

import torch
try:
    major_version, minor_version = torch.cuda.get_device_capability()
    print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
except:
    print("No GPU detected. Ensure Runtime > Change Runtime Type is set to GPU.")

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

!pip install --no-deps xformers trl peft accelerate bitsandbytes

print("Installation Complete.")

GPU Detected: NVIDIA A100-SXM4-80GB
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-wtjniv_9/unsloth_3b0db11fc73e4ccba97275962a097557
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-wtjniv_9/unsloth_3b0db11fc73e4ccba97275962a097557
  Resolved https://github.com/unslothai/unsloth.git to commit f08b337ee05d52b73a5071c68ccd4b3fa8cf0645
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.11.5 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.11.5-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+htt

Collecting xformers
  Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl (122.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers
Successfully installed xformers-0.0.33.post1
Installation Complete.


In [None]:

%%capture
!pip install llama-parse llama-index-core nest_asyncio pandas python-docx openpyxl

import nest_asyncio
nest_asyncio.apply()

import os
import glob
import pandas as pd
from docx import Document
from llama_parse import LlamaParse

# SECURITY: never hard-code credentials in a notebook. A key that has been
# saved or shared in a notebook must be treated as leaked and revoked.
# The key is taken from the environment when present, otherwise prompted for
# without echoing it to the cell output.
import getpass

api_key = (
    os.environ.get("LLAMA_CLOUD_API_KEY")
    or getpass.getpass("LlamaCloud API key (llx-...): ").strip()
    # Fall back to the placeholder so the guard before ingestion still stops the run.
    or "PASTE_YOUR_LLAMA_INDEX_API_KEY_HERE"
)

if api_key != "PASTE_YOUR_LLAMA_INDEX_API_KEY_HERE":
    os.environ["LLAMA_CLOUD_API_KEY"] = api_key


def read_pdf_llama(file_path):
    """
    Parse a PDF with LlamaParse (``result_type="markdown"``) and return the
    parsed documents' text joined by blank lines.

    Any exception raised while parsing is printed and an empty string is
    returned instead, so callers can treat "" as "nothing extracted".
    """
    print(f"   ...Sending {os.path.basename(file_path)} to LlamaCloud...")

    parser = LlamaParse(result_type="markdown", verbose=False, language="en")

    try:
        parsed = parser.load_data(file_path)
        return "\n\n".join(part.text for part in parsed)
    except Exception as e:
        print(f"   ❌ LlamaParse Error: {e}")
        return ""

def read_docx(file_path):
    """Collect the text of every non-empty paragraph in a .docx file, newline-separated."""
    lines = []
    for paragraph in Document(file_path).paragraphs:
        if paragraph.text:
            lines.append(paragraph.text)
    return "\n".join(lines)

def read_excel(file_path):
    """Turn a spreadsheet into text: one line per row, ``column: value`` pairs
    separated by commas, skipping cells where ``pd.notna`` is false."""
    sheet = pd.read_excel(file_path)
    return "\n".join(
        ", ".join(
            f"{header}: {cell}"
            for header, cell in record.items()
            if pd.notna(cell)
        )
        for _, record in sheet.iterrows()
    )

def process_directory(directory_path="."):
    """Extract text from the .pdf, .docx and .xlsx files directly inside a folder.

    PDFs go through ``read_pdf_llama``, Word files through ``read_docx`` and
    spreadsheets through ``read_excel``. Only files that yield non-empty
    content are returned, as ``{"filename": ..., "content": ...}`` dicts.
    """
    files = [
        path
        for pattern in ('*.pdf', '*.docx', '*.xlsx')
        for path in glob.glob(os.path.join(directory_path, pattern))
    ]

    print(f"Found {len(files)} files to process.")

    raw_data = []
    for file in files:
        print(f"Processing: {file}")
        content = ""
        try:
            # The suffixes are mutually exclusive, so branch order is irrelevant.
            if file.endswith('.xlsx'):
                content = read_excel(file)
            elif file.endswith('.docx'):
                content = read_docx(file)
            elif file.endswith('.pdf'):
                content = read_pdf_llama(file)

            if not content:
                print("   ⚠️ Warning: No content extracted.")
            else:
                raw_data.append({"filename": file, "content": content})
                print(f"   ✅ Success: {len(content)} chars extracted.")

        except Exception as e:
            print(f"Error processing {file}: {e}")

    return raw_data

# Run the ingestion only once a real key has replaced the placeholder.
if api_key != "PASTE_YOUR_LLAMA_INDEX_API_KEY_HERE":
    print("Starting LlamaParse Ingestion Engine...")
    ingested_data = process_directory("./data")

    if ingested_data:
        first_content = ingested_data[0]['content']
        print("\n--- SAMPLE OUTPUT (First 500 chars) ---")
        print(first_content[:500])
        # A pipe character is used as the signal that Markdown tables came through.
        if "|" in first_content:
            print("\n✅ Tables detected in Markdown format.")
else:
    print("❌ STOP: You must paste your API Key in the 'api_key' variable at the top of the script!")

In [None]:
import json

CHUNK_SIZE = 1024
OVERLAP = 100

def create_chunks(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """
    Splits text into overlapping chunks, preferring to cut at a space.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters each chunk shares with the end of the
            previous one. Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks in document order; ``[]`` for empty text. The final
        chunk ends exactly at the end of ``text``.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``chunk_size`` (the window could never advance).
    """
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + chunk_size, text_len)

        if end < text_len:
            last_space = text.rfind(' ', start, end)
            # Only cut at the space when the chunk stays longer than the
            # overlap; otherwise the next start would not move forward
            # (or would go negative) and the loop would never finish.
            if last_space - start > overlap:
                end = last_space

        chunks.append(text[start:end])

        # Stop once the end of the text has been emitted; stepping back by
        # `overlap` here would only repeat the tail of this chunk.
        if end >= text_len:
            break
        start = end - overlap

    return chunks

# Turn every ingested document into instruction/output training examples
# (one per chunk) and save them to dataset.json.
print("⚙️ Processing Data into Training Examples...")
training_data = []

# A stray backslash before this `if` made the whole cell a SyntaxError.
if 'ingested_data' not in globals():
    print("❌ Error: 'ingested_data' not found. Please run the Ingestion script first.")
else:
    for doc in ingested_data:
        filename = doc['filename']
        content = doc['content']

        doc_chunks = create_chunks(content)
        print(f"   📄 {filename}: Created {len(doc_chunks)} chunks.")

        # Every chunk of a document shares the same instruction; only the
        # output (the chunk text) differs.
        for chunk in doc_chunks:
            entry = {
                "instruction": f"You are an expert technical assistant. Provide details from the document '{filename}'.",
                "input": "",
                "output": chunk
            }
            training_data.append(entry)

    output_file = "dataset.json"
    with open(output_file, "w") as f:
        json.dump(training_data, f, indent=2)

    print(f"\n✅ SUCCESS: Dataset saved as '{output_file}' with {len(training_data)} training examples.")
    print("   Ready for Model Selection.")

⚙️ Processing Data into Training Examples...
   📄 ./data/TDS_ControlSpace_EX-1280C_LTR_enUS.pdf: Created 14 chunks.
   📄 ./data/tds_DesignMax_DM8SE_a4_EN.pdf: Created 11 chunks.

✅ SUCCESS: Dataset saved as 'dataset.json' with 25 training examples.
   Ready for Model Selection.


In [None]:
from unsloth import FastLanguageModel
import torch
from tqdm import tqdm


import json
import re

max_seq_length = 2048
dtype = None
load_in_4bit = True

print("🚀 Loading Base Model for Data Generation...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)

# The prompt ends with "1." so the model continues straight into the first
# numbered question.
generation_prompt = """You are an expert data analyst.
Below is a technical excerpt from a product manual.
Generate 5 distinct, specific questions that a customer might ask, which can be answered ONLY using this excerpt.
Do not provide the answers, only the questions.

### Excerpt:
{}

### Questions:
1."""

# One list item: optional indentation, a "<number>." / "<number>)" / "-"
# marker, then the question text (captured without surrounding whitespace).
QUESTION_LINE = re.compile(r"^\s*(?:\d+\s*[.)]|-)\s*(\S.*?)\s*$")


def extract_questions(generated_text):
    """Return the list-item questions found after the last "### Questions:" marker.

    Only the list marker itself is removed. The previous
    ``lstrip("1234567890.- ")`` stripped a character set and therefore also ate
    digits, dots and dashes that belong to the question (e.g. "1. 70V taps?"
    became "V taps?"). Indented items are now recognised as well, and items
    with no text after the marker are dropped.
    """
    raw_questions = generated_text.split("### Questions:")[-1]
    questions = []
    for line in raw_questions.splitlines():
        match = QUESTION_LINE.match(line)
        if match:
            questions.append(match.group(1))
    return questions


enhanced_data = []
print(f"\n⚙️ Generating Synthetic Data from {len(training_data)} chunks...")

for entry in tqdm(training_data):
    context = entry['output']
    inputs = tokenizer(
        [generation_prompt.format(context)],
        return_tensors = "pt"
    ).to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens = 256,
        use_cache = True
    )
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Each generated question becomes a training example whose answer is the
    # chunk it was generated from.
    for question in extract_questions(generated_text):
        enhanced_data.append({
            "instruction": question,
            "input": "",
            "output": context
        })

combined_data = training_data + enhanced_data

with open("enhanced_dataset.json", "w") as f:
    json.dump(combined_data, f, indent=2)

print(f"\n✅ GENERATION COMPLETE!")
print(f"Original Chunks: {len(training_data)}")
print(f"Synthetic Questions Created: {len(enhanced_data)}")
print(f"Total Training Data: {len(combined_data)}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
🚀 Loading Base Model for Data Generation...
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]


⚙️ Generating Synthetic Data from 25 chunks...


100%|██████████| 25/25 [01:31<00:00,  3.67s/it]


✅ GENERATION COMPLETE!
Original Chunks: 25
Synthetic Questions Created: 125
Total Training Data: 150





In [None]:
from unsloth import FastLanguageModel
import torch

# Settings reused by later cells (max_seq_length is read again by the trainer).
max_seq_length, dtype, load_in_4bit = 2048, None, True

print("🧠 Loading Llama-3-8B-Instruct Architecture...")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attention projections followed by the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("✅ Model Architecture Ready. Adapters attached.")

🧠 Loading Llama-3-8B-Instruct Architecture...
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.11.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


✅ Model Architecture Ready. Adapters attached.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datasets import load_dataset

# Alpaca-style prompt template with three positional slots, filled in order:
# instruction, input, response. Training fills all three; the inference cell
# leaves the response slot empty.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of instruction/input/output columns into training text.

    Each row is formatted with ``alpaca_prompt`` and suffixed with
    ``EOS_TOKEN``; the result is returned as ``{"text": [...]}``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ],
    }

# Load the generated examples and add the formatted "text" column.
dataset = load_dataset(
    "json", data_files="enhanced_dataset.json", split="train"
).map(formatting_prompts_func, batched=True)

print(f"✅ Dataset Loaded: {len(dataset)} examples ready for training.")

# Use bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

print("🚀 Training Started... (This takes 2-5 mins)")
trainer_stats = trainer.train()
print("🏆 Training Complete!")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

✅ Dataset Loaded: 150 examples ready for training.


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/150 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🚀 Training Started... (This takes 2-5 mins)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 150 | Num Epochs = 4 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.7212
2,3.1294
3,3.0638
4,3.0174
5,2.915
6,2.5578
7,2.3911
8,2.3383
9,2.4809
10,2.1679


🏆 Training Complete!


In [None]:
import re

FastLanguageModel.for_inference(model)

topic = "frequency response"
product = "DesignMax DM8SE"

question = f"What is the {topic} of the {product}?"
style_instruction = "Extract the specific value and answer in a single, plain English sentence. Do NOT use markdown tables, pipes (|), or lists."

# The instruction slot carries the question plus the style guidance; the input
# and response slots stay empty so the model writes the response.
instruction_text = f"{question}\n\n{style_instruction}"
prompt = alpaca_prompt.format(instruction_text, "", "")

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

generation_options = dict(max_new_tokens=128, use_cache=True, temperature=0.1)
outputs = model.generate(**inputs, **generation_options)

def clean_output(text):
    """Return the model's answer with table markup and apostrophes removed.

    Keeps only what follows the last "### Response:" marker, deletes pipes,
    "---" runs and apostrophes, and collapses repeated spaces.
    """
    answer = text.split("### Response:")[-1].strip()

    # Strip Markdown table residue.
    answer = answer.replace("|", "").replace("---", "")

    # Collapse runs of spaces, then remove apostrophes.
    answer = re.sub(r" +", " ", answer)
    answer = answer.replace("'", "")

    return answer.strip()

decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
final_answer = clean_output(decoded_output)

# Framed report: header, the query, then the cleaned answer.
heavy_rule = "=" * 50
light_rule = "-" * 50

print("\n" + heavy_rule)
print("🤖 AI ASSISTANT REPORT")
print(heavy_rule)
print(f"📝 Query: {question}")
print(light_rule)
print("💬 Answer:\n")
print(final_answer)
print("\n" + heavy_rule)


🤖 AI ASSISTANT REPORT
📝 Query: What is the frequency response of the DesignMax DM8SE?
--------------------------------------------------
💬 Answer:

- Frequency Response: 52 Hz – 20 kHz
- Sensitivity (SPL / 1 W @ 1 m): 93 dB
- Calculated Maximum SPL @ 1 m (continuous): 115 dB
- Calculated Maximum SPL @ 1 m (peak): 121 dB
- Crossover: 1.5 kHz (passive 2-way crossover)
- Loudspeaker EQ: Not required; Bose Professional voicing and SmartBass processing available to enhance performance and response

For additional specifications and application information, visit BoseProfessional.com. Specifications are subject to change. 11/2024



In [None]:
import json
import os
import glob
from llama_parse import LlamaParse
import nest_asyncio
nest_asyncio.apply()

# Existing dataset to extend, folder holding the newly added PDFs, and the
# file the merged dataset is written to.
current_dataset_file = "enhanced_dataset.json"
new_files_directory = "new_docs"
output_file = "combined_dataset_v2.json"

# Create the upload folder up front so it is visible before the prompt below.
os.makedirs(new_files_directory, exist_ok=True)

print(f"Instructions: Upload ONLY your NEW files into the '{new_files_directory}' folder on the left.")
# Blocks the cell until the user confirms; the returned text is not used.
input("👉 Press ENTER once you have uploaded the NEW files...")

def parse_new_files():
    """Parse every PDF in ``new_files_directory`` with LlamaParse.

    Returns a list of ``{"filename": ..., "content": ...}`` dicts, one per PDF
    that parsed without raising. Failures are printed and skipped.
    """
    print(f"🚀 Parsing new files from {new_files_directory}...")
    parser = LlamaParse(result_type="markdown", verbose=True, language="en")

    parsed_docs = []
    for pdf_path in glob.glob(os.path.join(new_files_directory, "*.pdf")):
        try:
            parts = parser.load_data(pdf_path)
            markdown = "\n\n".join(part.text for part in parts)
            parsed_docs.append({"filename": pdf_path, "content": markdown})
            print(f"   ✅ Parsed: {pdf_path}")
        except Exception as e:
            print(f"   ❌ Failed: {pdf_path} - {e}")

    return parsed_docs

def process_to_json(raw_docs):
    """Split parsed documents into overlapping chunks and wrap each as a training entry.

    Args:
        raw_docs: Iterable of dicts with ``'filename'`` and ``'content'`` keys.

    Returns:
        list[dict]: One ``{"instruction", "input", "output"}`` entry per chunk,
        in document order. Chunks are at most 1024 characters and consecutive
        chunks share 100 characters. Documents with empty content yield no
        entries.
    """
    formatted_entries = []
    CHUNK_SIZE = 1024
    OVERLAP = 100

    for doc in raw_docs:
        text = doc['content']
        filename = doc['filename']
        start = 0
        while start < len(text):
            end = start + CHUNK_SIZE
            chunk = text[start:end]
            formatted_entries.append({
                "instruction": f"You are an expert technical assistant. Provide details from '{filename}'.",
                "input": "",
                "output": chunk
            })
            # Stop once the end of the text has been emitted. Stepping back by
            # OVERLAP here used to produce an extra entry whose text was
            # already fully contained in this chunk.
            if end >= len(text):
                break
            start = end - OVERLAP
    return formatted_entries

# Start from the existing dataset when present, otherwise from an empty list.
if os.path.exists(current_dataset_file):
    with open(current_dataset_file, "r") as f:
        old_data = json.load(f)
    print(f"📚 Loaded {len(old_data)} existing examples.")
else:
    print(f"⚠️ Warning: Old dataset '{current_dataset_file}' not found. Starting fresh.")
    old_data = []

raw_new_docs = parse_new_files()
if not raw_new_docs:
    print("No new files processed.")
else:
    new_json_entries = process_to_json(raw_new_docs)

    # New entries are appended after the old ones; nothing is de-duplicated.
    combined_data = old_data + new_json_entries

    with open(output_file, "w") as f:
        json.dump(combined_data, f, indent=2)

    print(f"\n✅ SUCCESS!")
    print(f"   Old Data: {len(old_data)}")
    print(f"   New Data: {len(new_json_entries)}")
    print(f"   Total Data: {len(combined_data)}")
    print(f"   Saved as: '{output_file}' (Use this for training now)")

Instructions: Upload ONLY your NEW files into the 'new_docs' folder on the left.
👉 Press ENTER once you have uploaded the NEW files...
📚 Loaded 150 existing examples.
🚀 Parsing new files from new_docs...
Started parsing the file under job_id 00ce590b-3cc6-4dbb-8e68-587354b33488
   ✅ Parsed: new_docs/Bose Professional Product Research Document.pdf

✅ SUCCESS!
   Old Data: 150
   New Data: 60
   Total Data: 210
   Saved as: 'combined_dataset_v2.json' (Use this for training now)


In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

# Settings for a fresh base model; max_seq_length is read again by the trainer.
max_seq_length, dtype, load_in_4bit = 2048, None, True

print("🧠 Loading Base Model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print("🛠️ Attaching LoRA Adapters...")
# Attention projections followed by the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Alpaca-style prompt template with three positional slots, filled in order:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Build the "text" column for a batch: each instruction/input/output row
    rendered through ``alpaca_prompt`` with ``EOS_TOKEN`` appended."""
    instructions = examples["instruction"]
    contexts = examples["input"]
    responses = examples["output"]

    def render(row):
        instruction, context, response = row
        return alpaca_prompt.format(instruction, context, response) + EOS_TOKEN

    return {"text": list(map(render, zip(instructions, contexts, responses)))}

import os

# Prefer the combined dataset and fall back to the enhanced one only when the
# combined file is really missing. The previous bare `except:` also swallowed
# malformed-JSON and schema errors, silently training on the older dataset
# while reporting "not found".
if os.path.exists("combined_dataset_v2.json"):
    dataset = load_dataset("json", data_files="combined_dataset_v2.json", split="train")
    print(f"📚 Loaded Combined Dataset with {len(dataset)} examples.")
else:
    print("⚠️ 'combined_dataset_v2.json' not found. Loading 'enhanced_dataset.json' instead.")
    dataset = load_dataset("json", data_files="enhanced_dataset.json", split="train")

# Add the formatted "text" column that the trainer reads.
dataset = dataset.map(formatting_prompts_func, batched = True,)

print("🚀 Initializing Trainer...")
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

trainer_stats = trainer.train()
print("🏆 Training Complete on Combined Data!")

In [None]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl size=4500215 sha256=a30c45743814a1525b

In [None]:
import os
import torch
from unsloth import FastLanguageModel
import subprocess
import shutil

import gc
import sys

print("🧹 Clearing GPU/CPU Memory...")
# Drop any model/tokenizer left over from training so their memory can be
# reclaimed. Removing the names via globals() tolerates them being undefined
# without a bare `except: pass` that would hide every other error.
for _leftover in ("model", "tokenizer"):
    globals().pop(_leftover, None)
gc.collect()
torch.cuda.empty_cache()

checkpoint_path = "outputs/checkpoint-60"
merged_model_dir = "final_merged_model"
gguf_filename = "Product_SLM_Llama3_v2.gguf"
quantized_filename = "Product_SLM_Llama3_v2.Q4_K_M.gguf"

llama_cpp_dir = "llama.cpp"
llama_cpp_build_dir = os.path.join(llama_cpp_dir, "build")
quantize_binary = os.path.join(llama_cpp_build_dir, "bin", "llama-quantize")


def run_quiet(cmd):
    """Run a setup command silently, printing the tail of its output if it fails."""
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        print((err.stdout or "")[-2000:])
        print((err.stderr or "")[-2000:])
        raise


print(f"🚀 Starting High-RAM Export using checkpoint: {checkpoint_path}")

try:
    # Fail early with a clear message instead of letting the loader treat a
    # missing local folder as a model id.
    if not os.path.isdir(checkpoint_path):
        raise FileNotFoundError(
            f"Checkpoint '{checkpoint_path}' not found. Run the training cell first."
        )

    print("\n🔄 Loading & Merging Model to Disk...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = checkpoint_path,
        max_seq_length = 2048,
        dtype = None,
        load_in_4bit = True,
    )

    model.save_pretrained_merged(
        merged_model_dir,
        tokenizer,
        save_method = "merged_16bit",
    )
    print(f"   ✅ Merged model saved to '{merged_model_dir}'")

    # Release the merged model before the conversion tools run.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    print("\n🛠️ Setting up official llama.cpp tools...")
    if not os.path.isdir(llama_cpp_dir):
        subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp"], check=True)

    # Key the build on the quantize binary rather than on the checkout, so a
    # previously failed build is retried. The build uses CMake: the `make`
    # invocation used before fails (exit status 2) on the current checkout.
    if not os.path.exists(quantize_binary):
        run_quiet([
            "cmake", "-S", llama_cpp_dir, "-B", llama_cpp_build_dir,
            "-DCMAKE_BUILD_TYPE=Release", "-DLLAMA_CURL=OFF",
        ])
        run_quiet([
            "cmake", "--build", llama_cpp_build_dir,
            "--config", "Release", "--target", "llama-quantize", "--parallel",
        ])
        # sys.executable guarantees the packages land in this kernel's environment.
        run_quiet([
            sys.executable, "-m", "pip", "install",
            "-r", os.path.join(llama_cpp_dir, "requirements.txt"),
        ])

    print(f"\n📦 Converting {merged_model_dir} to GGUF (F16)...")
    convert_cmd = [
        sys.executable, os.path.join(llama_cpp_dir, "convert_hf_to_gguf.py"),
        merged_model_dir,
        "--outfile", gguf_filename,
        "--outtype", "f16"
    ]
    subprocess.run(convert_cmd, check=True)
    print(f"   ✅ Intermediate GGUF created: {gguf_filename}")

    print(f"\n📉 Quantizing to Q4_K_M...")
    quantize_cmd = [
        quantize_binary,
        gguf_filename,
        quantized_filename,
        "Q4_K_M"
    ]
    subprocess.run(quantize_cmd, check=True)

    # The F16 GGUF and the merged folder are only intermediates.
    if os.path.exists(gguf_filename): os.remove(gguf_filename)
    shutil.rmtree(merged_model_dir, ignore_errors=True)

    print("\n" + "="*50)
    print(f"🏆 SUCCESS! High-RAM Export Complete.")
    print("="*50)
    print(f"File: {quantized_filename}")
    print(f"Size: ~4.9 GB")
    print("\n⬇️  Starting Download (Check your browser)...")

    from google.colab import files
    files.download(quantized_filename)

except subprocess.CalledProcessError as e:
    # Tool failures (git/cmake/pip/convert/quantize) are not memory problems,
    # so the High-RAM tip below would be misleading here.
    print(f"\n❌ Error: {e}")
    print("Tip: A build or conversion command failed; see its output above.")
except Exception as e:
    print(f"\n❌ Error: {e}")
    print("Tip: If this fails, ensure you selected 'High RAM' in Runtime > Change Runtime Type")

🧹 Clearing GPU/CPU Memory...
🚀 Starting High-RAM Export using checkpoint: outputs/checkpoint-60

🔄 Loading & Merging Model to Disk...
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  25%|██▌       | 1/4 [00:07<00:23,  7.94s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 2/4 [00:21<00:22, 11.46s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  75%|███████▌  | 3/4 [00:43<00:16, 16.22s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [00:58<00:00, 14.58s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [01:04<00:00, 16.23s/it]


Unsloth: Merge process complete. Saved to `/content/final_merged_model`
   ✅ Merged model saved to 'final_merged_model'

🛠️ Setting up official llama.cpp tools...

❌ Error: Command '['make', '-C', 'llama.cpp', 'clean']' returned non-zero exit status 2.
Tip: If this fails, ensure you selected 'High RAM' in Runtime > Change Runtime Type
