In [None]:
# Directory the notebook joins dataset file names onto when loading data.
BASE_DIR = "/content/"

In [None]:
# # Install Unsloth and its dependencies with xformers for memory efficiency
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
!pip install unsloth trl peft accelerate bitsandbytes

Collecting unsloth
  Downloading unsloth-2025.9.5-py3-none-any.whl.metadata (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting unsloth_zoo>=2025.9.6 (from unsloth)
  Downloading unsloth_zoo-2025.9.6-py3-none-any.whl.metadata (9.5 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.31-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting cut_cross_entropy (fr

In [None]:
# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
import torch

print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

CUDA available: True
GPU: Tesla T4


In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
import json

# Loader settings. max_seq_length is reused later when the trainer is built.
max_seq_length = 2048  # Sequence length handed to the loader (original note: any value works, RoPE scaling is handled internally)
dtype = None           # None = auto detection (original note: Float16 for Tesla T4/V100, Bfloat16 for Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False

# Fetch the 4-bit Llama-3 8B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.5: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
import os

In [None]:
# Path of the well-being dataset inside BASE_DIR.
filename = os.path.join(BASE_DIR, 'full_dataset.json')

# Check for the file first: previously the success message was printed before
# the file was ever touched, so a missing upload still reported success.
if not os.path.isfile(filename):
    raise FileNotFoundError(
        f"Dataset file not found: '{filename}'. Upload it to '{BASE_DIR}' and re-run this cell."
    )

print(f"\nSuccessfully uploaded '{filename}'")

# Load the JSON data; the encoding is explicit so the result does not depend on
# the platform's default text encoding.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Quick check to see if data is loaded correctly
print("Number of dialogues in the dataset:", len(data.get('dialogues', [])))



Successfully uploaded '/content/full_dataset.json'
Number of dialogues in the dataset: 100


In [None]:
# Formatting function: one training example per user -> assistant exchange
def format_chat_data(data):
    """Turn dialogues into single-exchange, chat-templated training examples.

    Turns are read two at a time (indices 0-1, 2-3, ...). A pair is kept only
    when its first turn has role 'user' and its second has role 'assistant';
    each kept pair becomes its own example. Rendering uses the module-level
    ``tokenizer`` (it is not a parameter of this function).

    Args:
        data: List of dialogue dicts. Each has a 'turns' list whose items carry
            'role' and 'text'; assistant turns may also carry
            'intervention_steps' (a list of strings).

    Returns:
        List of ``{"text": rendered_conversation}`` dicts.
    """
    render = tokenizer.apply_chat_template
    examples = []

    for dialogue in data:
        turns = dialogue['turns']
        # Pair up even-indexed turns with the odd-indexed turn that follows.
        for asker, responder in zip(turns[0::2], turns[1::2]):
            if asker['role'] != 'user' or responder['role'] != 'assistant':
                continue

            # Intervention steps, when present, are appended line by line.
            reply = responder['text']
            if 'intervention_steps' in responder:
                reply += "\n" + "\n".join(responder['intervention_steps'])

            # The conversation is complete, so no generation prompt is added.
            rendered = render(
                [
                    {"role": "user", "content": asker['text']},
                    {"role": "assistant", "content": reply},
                ],
                tokenize=False,
                add_generation_prompt=False,
            )
            examples.append({"text": rendered})

    return examples

from datasets import Dataset

# Format the list stored under 'dialogues' and wrap the result in a Dataset.
formatted_data = format_chat_data(data['dialogues'])
dataset = Dataset.from_list(formatted_data)

# Show the first formatted example as a sanity check.
print("\nFormatted example:\n", dataset[0]['text'])


Formatted example:
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>

I'm so overwhelmed with my project. I feel like I'm drowning and can't focus on anything.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

That sounds incredibly stressful. It's completely understandable to feel like that when you're under a lot of pressure. I'm here to listen if you want to talk more about it.<|eot_id|>


In [None]:
# Create the formatting function
def format_chat_data(data, tokenizer):
    """Render each dialogue as one multi-turn, chat-templated training example.

    User and assistant turns are prefixed with ``(emotion: ...)`` and
    ``(intent: ...)`` labels. A label is emitted only when the turn actually
    carries that key, so turns without annotations no longer raise
    ``KeyError``; fully labelled turns produce the same text as before.
    Turns whose role is neither 'user' nor 'assistant' are skipped.

    Args:
        data: List of dialogue dicts, each with a 'turns' list. Every user or
            assistant turn needs 'role' and 'text'; 'emotion', 'intent' and
            (assistant only) 'intervention_steps' are optional.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.

    Returns:
        List of ``{"text": rendered_conversation}`` dicts, one per dialogue.
    """
    formatted_dialogues = []

    for dialogue in data:
        messages = []

        for turn in dialogue['turns']:
            role = turn['role']
            if role not in ('user', 'assistant'):
                continue

            text = turn['text']

            # Combine intervention steps into the assistant's text if they exist
            if role == 'assistant' and 'intervention_steps' in turn:
                text += "\n" + "\n".join(turn['intervention_steps'])

            # Only the labels present on this turn are rendered, in a fixed order.
            labels = [
                f"({key}: {turn[key]})"
                for key in ('emotion', 'intent')
                if key in turn
            ]
            messages.append({"role": role, "content": " ".join(labels + [text])})

        # Apply the template to the full conversation; no generation prompt is
        # added because the conversation already contains the replies.
        formatted_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        formatted_dialogues.append({"text": formatted_text})

    return formatted_dialogues

from datasets import Dataset

# Format the loaded dialogues with the loaded tokenizer, then wrap them in a Dataset.
formatted_data = format_chat_data(data['dialogues'], tokenizer)
dataset = Dataset.from_list(formatted_data)

# Show the first formatted example as a sanity check.
print("\nFormatted example:\n", dataset[0]['text'])


Formatted example:
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>

(emotion: stress) (intent: venting_intent) I'm so overwhelmed with my project. I feel like I'm drowning and can't focus on anything.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

(emotion: calmness) (intent: general_chat_intent) That sounds incredibly stressful. It's completely understandable to feel like that when you're under a lot of pressure. I'm here to listen if you want to talk more about it.<|eot_id|><|start_header_id|>user<|end_header_id|>

(emotion: frustration) (intent: continue_venting_intent) Yeah, and my boss just keeps adding more tasks. I just don't know where to start or how I'm going to get it all done.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

(emotion: calmness) (intent: general_chat_intent) It sounds really frustrating when you have so much on your plate. Can you tell me more about what's making it feel so difficult right now?<|eot_id|><|start_header_id|>user<|en

In [None]:
# Display how many formatted examples `dataset` holds.
len(dataset)

100

In [None]:
# Flexible formatter: emotion/intent tags are optional on every turn
def format_chat_data(data, tokenizer):
    """Render each dialogue as one chat-templated example with optional tags.

    Every turn is kept under its own role. Its content is the turn text,
    preceded by ``(emotion: ...)`` and/or ``(intent: ...)`` when those keys are
    present, followed (assistant turns only) by any 'intervention_steps' on
    separate lines; the combined string is stripped of surrounding whitespace.

    Args:
        data: List of dialogue dicts, each with a 'turns' list whose items
            carry 'role' and 'text' plus the optional keys described above.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.

    Returns:
        List of ``{"text": rendered_conversation}`` dicts, one per dialogue.
    """
    def as_message(turn):
        # Tags come first, each followed by a single space.
        tags = "".join(
            f"({label}: {turn[label]}) "
            for label in ('emotion', 'intent')
            if label in turn
        )

        body = turn['text']
        if turn['role'] == 'assistant' and 'intervention_steps' in turn:
            body += "\n" + "\n".join(turn['intervention_steps'])

        return {"role": turn['role'], "content": f"{tags}{body}".strip()}

    formatted_dialogues = []
    for dialogue in data:
        messages = [as_message(turn) for turn in dialogue['turns']]
        # Whole conversation is supplied, so no generation prompt is appended.
        rendered = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        formatted_dialogues.append({"text": rendered})

    return formatted_dialogues

In [None]:
import json
import os

from datasets import Dataset

# Step 1: Load both datasets. Paths are built from BASE_DIR, like the earlier
# load cell, so this cell does not depend on the current working directory.
well_being_path = os.path.join(BASE_DIR, 'full_dataset.json')
persona_path = os.path.join(BASE_DIR, 'personal_info_dataset.json')

with open(well_being_path, 'r', encoding='utf-8') as f:
    well_being_data = json.load(f)
with open(persona_path, 'r', encoding='utf-8') as f:
    persona_data = json.load(f)


# Step 2: Combine the dialogues from both files into one list
# (well-being dialogues first, persona dialogues after them).
all_dialogues = well_being_data['dialogues'] + persona_data['dialogues']
print(f"Total dialogues to be trained on: {len(all_dialogues)}")


# Step 3: Format the combined list and wrap it in a Dataset.
formatted_data = format_chat_data(all_dialogues, tokenizer)
final_dataset = Dataset.from_list(formatted_data)


# --- Verification ---
# Print one example of each kind to check the formatting.

print("\n--- Formatted Well-being Example ---")
# The well-being example should have both emotion and intent tags
print(final_dataset[0]['text'])

print("\n--- Formatted Persona Example ---")
# The persona example should only have the intent tag for the user
# and no tags for the assistant.
# Persona dialogues start right after the well-being ones, so the index is
# derived from the well-being count instead of being hard-coded.
first_persona_index = len(well_being_data['dialogues'])
print(final_dataset[first_persona_index]['text'])

Total dialogues to be trained on: 115

--- Formatted Well-being Example ---
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

(emotion: stress) (intent: venting_intent) I'm so overwhelmed with my project. I feel like I'm drowning and can't focus on anything.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

(emotion: calmness) (intent: general_chat_intent) That sounds incredibly stressful. It's completely understandable to feel like that when you're under a lot of pressure. I'm here to listen if you want to talk more about it.<|eot_id|><|start_header_id|>user<|end_header_id|>

(emotion: frustration) (intent: continue_venting_intent) Yeah, and my boss just keeps adding more tasks. I just don't know where to start or how I'm going to get it all done.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

(emotion: calmness) (intent: general_chat_intent) It sounds really frustrating when you have so much on your plate. Can you tell me more about what's making it feel so di

In [None]:
# Attach LoRA adapters to the loaded model (the name `model` is rebound to the result).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Any number > 0; suggested values are 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Any value is supported, but 0 is optimized
    bias="none",     # Any value is supported, but "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,   # Rank stabilized LoRA is supported
    loftq_config=None,  # So is LoftQ
)


Unsloth 2025.9.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Configure the trainer.
# Train on `final_dataset`, the combined well-being + persona set built above.
# The cell previously passed `dataset`, which holds only the well-being
# dialogues, so the persona dialogues were never trained on.
# NOTE(review): tokenizer / dataset_text_field / max_seq_length are still passed
# directly to SFTTrainer exactly as before; confirm this matches the installed
# TRL version if the cell is ever run without Unsloth's patches.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = final_dataset,
    dataset_text_field = "text",  # column holding the chat-templated text
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",  # do not report to any logging integration
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Start training! The value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 5 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.0619
2,2.1523
3,2.0448
4,1.9544
5,1.6784
6,1.4132
7,1.2612
8,1.1381
9,1.0433
10,0.921


In [None]:
# Put the Unsloth model into inference mode before generating.
FastLanguageModel.for_inference(model)

# Create an inference prompt
messages = [
    {"role": "user", "content": "I feel really overwhelmed and alone. It feels like nobody understands."}
]

# We must use the chat template! return_dict=True returns the attention mask
# together with the input ids; generate() previously had to guess the mask and
# warned about it because the pad token equals the eos token.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to("cuda")
input_ids = inputs["input_ids"]

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)

# Decode only the tokens produced after the prompt and drop special tokens,
# instead of splitting the full decoded text on template markers by position.
prompt_length = input_ids.shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

print("Fine-tuned model's response:\n", response[0].strip())

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Fine-tuned model's response:
 It's completely valid to feel this way. It sounds like you're carrying a heavy weight and it's okay to feel that way. I'm here to listen if you want to talk more about it.


In [None]:
# Make sure llama-cpp-python is installed for GGUF conversion
# !pip install llama-cpp-python

In [None]:
# # Manually compile llama.cpp
# # This might take a few minutes
# !git clone --recursive https://github.com/ggerganov/llama.cpp
# %cd llama.cpp
# !make clean && make all -j
# %cd /content/

In [None]:
# # Manually compile llama.cpp using CMake
# # This might take a few minutes
# !git clone --recursive https://github.com/ggerganov/llama.cpp
# %cd llama.cpp
# !mkdir build
# %cd build
# !cmake ..
# !cmake --build . --config Release -j
# %cd /content/

In [None]:
# Merge the LoRA adapters and save to GGUF
# You can choose different quantization methods. q4_k_m is a great balance of size and performance.
# model.save_pretrained_gguf("llama3-8b-sakinah2-gguf", tokenizer, quantization_method = "q4_k_m")

# This will create a file like:
# llama3-8b-therapist-gguf/ggml-model-q4_k_m.gguf

In [None]:
# Used in place of save_pretrained_gguf: merge the adapters into the model and
# save the full model in 16-bit precision under merged_model_path.
merged_model_path = "llama3_therapist_merged"
model.save_pretrained_merged(
    merged_model_path,
    tokenizer,
    save_method="merged_16bit",
)

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

DEBUG: Removed incorrectly saved weight file: model-00002-of-00002.safetensors
DEBUG: Removed incorrectly saved weight file: model-00001-of-00002.safetensors
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [32:57<1:38:53, 1977.87s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [46:56<43:35, 1307.99s/it]  

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [1:01:31<18:30, 1110.16s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [1:04:34<00:00, 968.64s/it]


Converting the merged model to GGUF

In [None]:
# Clean up any old attempts
# !rm -rf llama.cpp

# Clone the repository (using --depth 1 for a clean, lightweight clone)
print("⬇️ Cloning llama.cpp repository...")
!git clone --depth 1 https://github.com/ggerganov/llama.cpp

# List ALL files recursively to find the converter
print("\n✅ Clone complete. Now searching for the conversion script...")
print("----------------------------------------------------------------")
print("Look through the list below for a file named 'convert-hf-to-gguf.py' or 'convert.py'.")
print("Copy its FULL PATH (e.g., /content/llama.cpp/convert-hf-to-gguf.py)")
print("----------------------------------------------------------------\n")

!ls -R /content/llama.cpp

⬇️ Cloning llama.cpp repository...
Cloning into 'llama.cpp'...
remote: Enumerating objects: 1652, done.[K
remote: Counting objects: 100% (1652/1652), done.[K
remote: Compressing objects: 100% (1281/1281), done.[K
remote: Total 1652 (delta 346), reused 1067 (delta 315), pack-reused 0 (from 0)[K
Receiving objects: 100% (1652/1652), 24.18 MiB | 20.18 MiB/s, done.
Resolving deltas: 100% (346/346), done.

✅ Clone complete. Now searching for the conversion script...
----------------------------------------------------------------
Look through the list below for a file named 'convert-hf-to-gguf.py' or 'convert.py'.
Copy its FULL PATH (e.g., /content/llama.cpp/convert-hf-to-gguf.py)
----------------------------------------------------------------

/content/llama.cpp:
AUTHORS			       examples    poetry.lock
build-xcframework.sh	       flake.lock  prompts
ci			       flake.nix   pyproject.toml
cmake			       ggml	   pyrightconfig.json
CMakeLists.txt		       gguf-py	   README.md
CMakePresets

In [None]:
!pip install mistral_common

Collecting mistral_common
  Downloading mistral_common-1.8.5-py3-none-any.whl.metadata (5.1 kB)
Collecting pydantic-extra-types>=2.10.5 (from pydantic-extra-types[pycountry]>=2.10.5->mistral_common)
  Downloading pydantic_extra_types-2.10.5-py3-none-any.whl.metadata (3.9 kB)
Collecting pycountry>=23 (from pydantic-extra-types[pycountry]>=2.10.5->mistral_common)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading mistral_common-1.8.5-py3-none-any.whl (6.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_extra_types-2.10.5-py3-none-any.whl (38 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m127.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry, pydantic-extra-types, mistral_common
Successfully installed mistral_common-1.8.5 pycountry-24.6.1 pydan

In [None]:
converter_script_path = "/content/llama.cpp/convert_hf_to_gguf.py"

import os
# This check will stop the script if the path is wrong.
if not os.path.exists(converter_script_path):
    raise FileNotFoundError(
        f"CRITICAL ERROR: The script was not found at the path you provided: '{converter_script_path}'. "
        "Please re-run Step 1, find the correct path, and paste it above."
    )

print(f"✅ Using converter script found at: {converter_script_path}")
print("\n🔄 Converting model to GGUF F16 format...")

# We use the variable to run the script from its confirmed location
!python {converter_script_path} llama3_therapist_merged --outfile llama3-therapist.F16.gguf --outtype f16

✅ Using converter script found at: /content/llama.cpp/convert_hf_to_gguf.py

🔄 Converting model to GGUF F16 format...
INFO:hf-to-gguf:Loading model: llama3_therapist_merged
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32,

Quantizing the model

In [None]:
# # Clone the repository
# !git clone https://github.com/ggerganov/llama.cpp

# Build the tools using CMake
!cd llama.cpp && mkdir -p build && cd build && cmake .. && cmake --build . --config Release

-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found ve

In [None]:
# Run llama-quantize on the F16 GGUF file and write a q4_k_m-quantized copy.
# Per the original note, the input is about 16 GB and the output about 4 GB.
!./llama.cpp/build/bin/llama-quantize ./llama3-therapist.F16.gguf ./llama3-therapist.Q4_K_M.gguf q4_k_m

main: build = 1 (8ff2060)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04.2) 11.4.0 for x86_64-linux-gnu
main: quantizing './llama3-therapist.F16.gguf' to './llama3-therapist.Q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 291 tensors from ./llama3-therapist.F16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama3_Therapist_Merged
llama_model_loader: - kv   3:                         general.size_label str              = 8.0B
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                       llama.con