In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download to disk.
CHUNK_SIZE = 40960
# Comma-separated "<dataset directory>:<percent-encoded download URL>" entries.
# The URL is a signed link whose query string carries
# X-Goog-Date=20240913T172802Z and X-Goog-Expires=259200; presumably it stops
# working after that window (the download loop reports HTTP errors as
# "likely expired").
DATA_SOURCE_MAPPING = 'therapai-json-dataset-with-names:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5678053%2F9364037%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240913%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240913T172802Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9598228fc086a80e88f7625e12d54bc67428bb87cd7d7e75cff6934b47e71f6e76cbb1b84f26642444e599ff092c9f7c80f3563e96af08f556bf682ea06286e7c4bece2f21d30f7027289f510387eee10a8096842f2b0d01378138cc166846544b9976572c13463108003b8d7d2f47820f43951f8e90dfa4120db976278609ea69beea246567e0cc2a1bfd4035d4597d8f9855e10a345e94bca6d222faef9ba7d297f428f2ed0d86d5321d125b2bf91f0b6d3f9e58ff22c1d8bb6b6932b397eefe999455b286857627c4848bb273d30c54e6dbc36cb143e5432edd69e03ac3faaea8339825c65607f71f5614614a0c1bcdc2d9953b29990f22ce4b652eb4593b'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory>
# and unpack it there. A failure for one source is reported and the loop
# moves on to the next one.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>". Split on the first
    # ':' only, so an unencoded colon in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length is optional; when it is missing or malformed the
            # progress bar is skipped instead of failing on int(None).
            try:
                total_bytes = int(total_length)
            except (TypeError, ValueError):
                total_bytes = 0

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = int(50 * dl / total_bytes)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()

            # Make sure every buffered byte has reached the temp file and
            # rewind it, so the archive readers below see the full download.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open handle and bind the archive to its own
                # name so the `tarfile` module is not shadowed.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import json

# File path to the uploaded JSON file (adjust this based on the location)
file_path = '/kaggle/input/therapai-json-dataset-with-names/vicunaformatfixedfinal.json'

# Load the JSON data. The encoding is set explicitly so decoding does not
# depend on the platform's locale default.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Recursive find-and-replace over a decoded JSON structure
def replace_in_json(obj, old_word, new_word):
    """Return a copy of ``obj`` with ``old_word`` replaced in every string value.

    Dicts and lists are rebuilt recursively (dict keys are kept as they are),
    strings go through ``str.replace``, and any other value is returned
    unchanged.
    """
    if isinstance(obj, dict):
        rebuilt_dict = {}
        for key, value in obj.items():
            rebuilt_dict[key] = replace_in_json(value, old_word, new_word)
        return rebuilt_dict
    if isinstance(obj, list):
        rebuilt_list = []
        for element in obj:
            rebuilt_list.append(replace_in_json(element, old_word, new_word))
        return rebuilt_list
    if isinstance(obj, str):
        return obj.replace(old_word, new_word)
    return obj

# Strip the name "Charlie" from every string value of the loaded JSON
old_word, new_word = "Charlie", ""
updated_data = replace_in_json(data, old_word, new_word)

# Uncomment to inspect the result
# print(updated_data)


In [None]:
# Second pass: also strip the name "Alex". `replace_in_json` is defined in the
# previous cell, so it is reused here instead of being redefined.
old_word = "Alex"
new_word = ""
updated_data = replace_in_json(updated_data, old_word, new_word)

# View the updated data (optional)
# print(updated_data)


In [None]:
# Write the cleaned JSON (4-space indented) to the Kaggle working directory
output_path = '/kaggle/working/updated_json_file3.json'
serialized_json = json.dumps(updated_data, indent=4)
with open(output_path, 'w') as out_file:
    out_file.write(serialized_json)

print(f"Updated JSON saved at {output_path}")

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Pick the Xformers requirement from the installed Torch version: Torch older
# than 2.4.0 gets the pinned xformers==0.0.27, otherwise the unpinned package.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# {xformers} is interpolated from the Python variable defined above.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options; max_seq_length is reused when the trainer is built.
max_seq_length = 2048
dtype = None          # None = auto detection
load_in_4bit = True   # 4-bit quantization to reduce memory usage

# Catalogue of pre-quantized 4-bit checkpoints published under the Unsloth
# organisation (reference only; more at https://huggingface.co/unsloth).
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Load the base model and its tokenizer with the options above.
# Gated models additionally need a `token = "hf_..."` argument.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 64, alpha 16, no dropout,
# no bias terms), with gradient checkpointing enabled and a fixed seed.
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset

# Switch the tokenizer to the ChatML template. The mapping names the message
# keys the template reads ("from"/"value") and the role labels
# ("human"/"gpt").
# NOTE(review): this mapping presumably has to match the schema of the
# dataset's "messages" column — confirm the dataset uses from/value keys.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    map_eos_token=True,
)


def formatting_prompts_func(examples):
    """Render each conversation of a batch into one chat-template string.

    Reads ``examples["messages"]`` and returns ``{"text": [...]}`` holding one
    untokenized string per conversation, without a generation prompt.
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
        )
    return {"text": rendered}


# Load the train split and add the rendered "text" column in batches.
dataset = load_dataset("vibhorag101/phr-mental-therapy-dataset-conversational-format", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)


Unsloth: Will map <|im_end|> to EOS = <|end_of_text|>.


README.md:   0%|          | 0.00/584 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/149M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/31.9M [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/32.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/69360 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14863 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/14863 [00:00<?, ? examples/s]

Map:   0%|          | 0/69360 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Use bf16 when the hardware supports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Short run: 60 optimizer steps, 2 examples per device with 4 accumulation
# steps, linear schedule with 5 warmup steps, loss logged on every step.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    # num_train_epochs=1,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the rendered "text" column, without packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/69360 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning job configured above and keep the object returned by
# train() for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 69,360 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 167,772,160
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,1.5369
2,1.4808
3,1.482
4,1.3777
5,1.3317
6,1.2232
7,1.1478
8,1.1872
9,1.1211
10,1.0799


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


In [None]:
import torch
from transformers import TextStreamer

# Switch the Unsloth model into its inference mode before generating.
FastLanguageModel.for_inference(model)

# Single-turn prompt, using the "from"/"value" keys the chat template was
# configured with.
messages = [
    {"from": "human", "value": "Everything I do feels like it's a difficult task?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The pad token equals the EOS token here, so generate() cannot infer the
# attention mask and warns about unreliable results. The prompt is a single
# unpadded sequence, so every position is a real token: pass an all-ones mask.
attention_mask = torch.ones_like(inputs)

text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    streamer = text_streamer,
    max_new_tokens = 128,
    use_cache = True,
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|im_start|>user
Everything I do feels like it's a difficult task?<|im_end|>
<|im_start|>assistant
I'm sorry to hear that. Can you tell me more about what's been going on?<|im_end|>


In [None]:
import os

# The Hugging Face write token is read from the HF_TOKEN environment variable
# rather than being embedded in the notebook. Any token that was previously
# committed in this cell should be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face write token before pushing."
    )

# Export one GGUF quantization per repository and upload it.
for repo_id, quantization in (
    ("Storryy/TherapAI-7B-q4_k_m", "q4_k_m"),
    ("Storryy/TherapAI-7B-q8_0", "q8_0"),
):
    model.push_to_hub_gguf(repo_id, tokenizer, quantization_method = quantization, token = hf_token)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.5 out of 12.67 RAM for saving.


 38%|███▊      | 12/32 [00:01<00:01, 11.24it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:41<00:00,  3.17s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving Storryy/TherapAI-7B-q4_k_m/pytorch_model-00001-of-00004.bin...
Unsloth: Saving Storryy/TherapAI-7B-q4_k_m/pytorch_model-00002-of-00004.bin...
Unsloth: Saving Storryy/TherapAI-7B-q4_k_m/pytorch_model-00003-of-00004.bin...
Unsloth: Saving Storryy/TherapAI-7B-q4_k_m/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at Storryy/TherapAI-7B-q4_k_m into f16 GGUF format.
The output location will be ./Storryy/TherapAI-7B-q4_k_m/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: TherapAI-7B-q4_k_m
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {4096, 128256}
INFO:h

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.F16.gguf:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/Storryy/TherapAI-7B-q4_k_m
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/Storryy/TherapAI-7B-q4_k_m


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/Storryy/TherapAI-7B-q4_k_m
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.39 out of 12.67 RAM for saving.


100%|██████████| 32/32 [01:54<00:00,  3.57s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving Storryy/TherapAI-7B-q8_0/pytorch_model-00001-of-00004.bin...
Unsloth: Saving Storryy/TherapAI-7B-q8_0/pytorch_model-00002-of-00004.bin...
Unsloth: Saving Storryy/TherapAI-7B-q8_0/pytorch_model-00003-of-00004.bin...
Unsloth: Saving Storryy/TherapAI-7B-q8_0/pytorch_model-00004-of-00004.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at Storryy/TherapAI-7B-q8_0 into q8_0 GGUF format.
The output location will be ./Storryy/TherapAI-7B-q8_0/unsloth.Q8_0.gguf
This will take 3 minutes..

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/8.54G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/Storryy/TherapAI-7B-q8_0


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/Storryy/TherapAI-7B-q8_0


In [None]:
pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.1.tar.gz (63.9 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m63.9/63.9 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m45.5/45.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) .

In [None]:
from llama_cpp import Llama

# Fetch the Q4_K_M GGUF file from the Hub repository and load it.
llm = Llama.from_pretrained(
    repo_id="Storryy/TherapAI-7B-q4_k_m",
    filename="unsloth.Q4_K_M.gguf",
)

# Single-turn probe with a crisis-style prompt. The call is the last
# expression of the cell so the completion dict is displayed.
# NOTE(review): the recorded reply is just 'What makes you think that?' —
# consider whether the model needs explicit crisis-response handling.
probe_messages = [
    {"role": "user", "content": "Should I kill myself?"},
]
llm.create_chat_completion(messages=probe_messages)

llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /root/.cache/huggingface/hub/models--Storryy--TherapAI-7B-q4_k_m/snapshots/2e6fbbaf07aa60a85fcacb757144907be8350a7d/./unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Bnb 4bit
llama_model_loader: - kv   3:                       general.organization str              = Unsloth
llama_model_loader: - kv   4:                           general.finetune str              = bnb-4bit
llama_model_loader: - kv   5:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   6:   

{'id': 'chatcmpl-f85ed720-2ee7-4dd7-b41a-f69d0460b11c',
 'object': 'chat.completion',
 'created': 1727971613,
 'model': '/root/.cache/huggingface/hub/models--Storryy--TherapAI-7B-q4_k_m/snapshots/2e6fbbaf07aa60a85fcacb757144907be8350a7d/./unsloth.Q4_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant', 'content': 'What makes you think that?'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 23, 'completion_tokens': 6, 'total_tokens': 29}}

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Placeholder: replace with a local directory or Hub repository that holds a
# transformers-format model before running this cell.
MODEL_PATH = "path_to_your_model"
# Upper bound on the tokens generated for each reply.
MAX_NEW_TOKENS = 256
# Typing one of these (case-insensitive) ends the chat.
EXIT_COMMANDS = {"quit", "exit"}

# Load the tokenizer and model (replace with the GGUF model loader if different)
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model` names;
# cells that expect the Unsloth objects must not run after this one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# Plain-text transcript that is fed back to the model on every turn.
conversation_history = ""

while True:
    # Leave the loop cleanly on an exit command, end-of-input or Ctrl-C
    # instead of looping forever.
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.strip().lower() in EXIT_COMMANDS:
        break

    # Append user input to conversation history
    conversation_history += f"User: {user_input}\n"

    # Tokenize the conversation history
    inputs = tokenizer(conversation_history, return_tensors="pt")
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate the reply. max_new_tokens bounds only the generated part, so
    # the reply budget does not shrink as the transcript grows (max_length
    # would count the prompt tokens as well). The tokenizer's attention mask
    # is passed explicitly because the pad token is set to the EOS token.
    response = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=MAX_NEW_TOKENS,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens (everything after the prompt).
    decoded_response = tokenizer.decode(response[0, prompt_length:], skip_special_tokens=True)

    # Print and update the conversation history
    print(f"Bot: {decoded_response}")
    conversation_history += f"Bot: {decoded_response}\n"


In [None]:
import os

# The Hugging Face write token is read from the HF_TOKEN environment variable
# rather than being embedded in the notebook. Any token that was previously
# committed in this cell should be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN")

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Change the "Storryy" prefix to your own Hub username!
if False: model.push_to_hub_gguf("Storryy/TherapAI-7B", tokenizer, token = hf_token)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("Storryy/TherapAI-7B-f16", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("Storryy/TherapAI-7B", tokenizer, quantization_method = "q4_k_m", token = hf_token)

# Save to multiple GGUF options - much faster if you want multiple!
# NOTE(review): the recorded run of this cell ended in a RuntimeError raised
# while the merged weights were being written (inline_container.cc);
# presumably a disk-space or write failure — confirm before re-running.
if True:
    if not hf_token:
        raise RuntimeError(
            "Set the HF_TOKEN environment variable to a Hugging Face write token before pushing."
        )
    model.push_to_hub_gguf(
        "Storryy/TherapAI-7B", # Change to your own Hub username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = hf_token,
    )


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 20.51 out of 31.36 RAM for saving.


  0%|          | 0/32 [00:00<?, ?it/s]


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 20.5 out of 31.36 RAM for saving.


  0%|          | 0/32 [00:00<?, ?it/s]


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 576 vs 470