In [3]:
import sys
import torch
print(f"Python Executable: {sys.executable}")
print(f"Torch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

Python Executable: c:\Users\ruben\Documents\TrainingAI\venv\Scripts\python.exe
Torch Version: 2.5.1+cu121
CUDA Available: True


In [None]:
import torch
import json
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import os

# Configuration
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
NEW_MODEL_NAME = "Llama-3-8B-Gloom-Lore"
HF_TOKEN = "..." # <--- PASTE YOUR NEW TOKEN HERE
DATA_FILE = "lore_training_data.json"  # <--- This was missing

# Set up torch to use GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Login to Hugging Face programmatically
from huggingface_hub import login
login(token=HF_TOKEN)

Using device: cuda


In [6]:
# 1. Load Data from JSON file
try:
    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Successfully loaded {len(data)} examples from {DATA_FILE}")
except FileNotFoundError:
    raise FileNotFoundError(f"Could not find {DATA_FILE}. Make sure it is in the same folder as this notebook.")

# 2. Convert to Hugging Face Dataset object
dataset = Dataset.from_list(data)

# 3. Define Llama 3 Prompt Format
def format_llama3_prompts(examples):
    texts = []
    for instruction, response in zip(examples['instruction'], examples['response']):
        # Exact Llama 3 Instruct format
        prompt = (
            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            f"You are a gritty shopkeeper on a submarine in a sci-fi horror setting.<|eot_id|>"
            f"<|start_header_id|>user<|end_header_id|>\n\n"
            f"{instruction}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{response}<|eot_id|>"
        )
        texts.append(prompt)
    return {'text': texts}

# 4. Map the formatting to the dataset
dataset = dataset.map(format_llama3_prompts, batched=True)

# Show an example to verify
print("\nSample Prompt:\n")
print(dataset[0]['text'])

Successfully loaded 7 examples from lore_training_data.json


Map:   0%|          | 0/7 [00:00<?, ? examples/s]


Sample Prompt:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a gritty shopkeeper on a submarine in a sci-fi horror setting.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the Gloom?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The Gloom is the lifeblood and the curse of these depths. It's an energy, a presence... something ancient. It gives us the crystals we seek, but it also births the horrors that hunt us. Don't stare into it too long; it stares back.<|eot_id|>


In [7]:
# 4-bit Quantization Config (Crucial for 12GB VRAM)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load Base Model
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    token=HF_TOKEN
)

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Initialize Trainer
# Since we downgraded to trl==0.8.6, this original syntax works perfectly again.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text", # explicit column name
    max_seq_length=512,        # explicit sequence length
    tokenizer=tokenizer,       # we can use 'tokenizer' again!
    args=training_arguments,
    packing=False,
)

NameError: name 'peft_config' is not defined

In [None]:
print("Starting training...")
trainer.train()
print("Training finished.")

# Save the adapter locally
trainer.model.save_pretrained(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)
print(f"LoRA adapters saved to folder: {NEW_MODEL_NAME}")

In [None]:
import gc
del model
del trainer
gc.collect()
torch.cuda.empty_cache()
print("VRAM cleared.")

In [None]:
from peft import PeftModel

print("Loading model on CPU for merging (this prevents VRAM errors)...")
# Load base model in FP16 on CPU
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cpu",
    token=HF_TOKEN
)

print("Applying LoRA adapter...")
model_to_merge = PeftModel.from_pretrained(base_model, NEW_MODEL_NAME)

print("Merging weights...")
merged_model = model_to_merge.merge_and_unload()

# Save the merged model
output_merged_dir = "merged_model"
merged_model.save_pretrained(output_merged_dir, safe_serialization=True)
tokenizer.save_pretrained(output_merged_dir)
print(f"Full merged model saved to: {output_merged_dir}")

In [None]:
# Define paths
llama_cpp_path = os.path.abspath("./llama.cpp") 
convert_script = os.path.join(llama_cpp_path, "convert_hf_to_gguf.py")
model_path = os.path.abspath("./merged_model")
outfile_path = f"{NEW_MODEL_NAME}.fp16.gguf"

# Check if script exists
if not os.path.exists(convert_script):
    print("Error: convert_hf_to_gguf.py not found.")
    print("Please ensure you have the llama.cpp SOURCE code folder named 'llama.cpp' in this directory.")
else:
    # Run conversion
    command = f'python "{convert_script}" "{model_path}" --outtype f16 --outfile "{outfile_path}"'
    print(f"Running conversion command...")
    exit_code = os.system(command)
    
    if exit_code == 0:
        print(f"Success! Created {outfile_path}")
    else:
        print("Conversion failed.")

In [None]:
# Define paths
input_gguf = f"{NEW_MODEL_NAME}.fp16.gguf"
output_gguf = f"{NEW_MODEL_NAME}.Q4_K_M.gguf"
quantize_exe = "llama-quantize.exe" 

if os.path.exists(quantize_exe):
    print(f"Quantizing {input_gguf} to {output_gguf}...")
    command = f'{quantize_exe} "{input_gguf}" "{output_gguf}" Q4_K_M'
    os.system(command)
    print("Done! You can now use the Q4_K_M.gguf file in LM Studio or Ollama.")
else:
    print("Error: llama-quantize.exe not found.")
    print("Please download the 'bin-win-cuda-x64' zip from llama.cpp releases and extract the exe here.")