In [2]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-bq5t_0lt/unsloth_f42365ebf4194148853abe7311eeefa2
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-bq5t_0lt/unsloth_f42365ebf4194148853abe7311eeefa2
  Resolved https://github.com/unslothai/unsloth.git to commit 8d9bd0ea8bf662618ba96fe7fe3478c5b81d0dff
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Using cached tyro-0.8.4-py3-none-any.whl.metadata (7.9 kB)
Collecting sentencepiece (from unsloth@ git+ht

In [3]:
from unsloth import FastLanguageModel
import torch
import os
import json
from datasets import load_dataset, Dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer
import requests
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [4]:
# Settings
max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [6]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory: 3.811 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [17]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template="phi-3",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
)

def formatting_prompts_func(batch):
    def format_conversation(example):
        return [
            {"from": "human", "value": example['user_input']},
            {"from": "gpt", "value": example['assistant_response']}
        ]

    texts = [tokenizer.apply_chat_template(format_conversation(example), tokenize=False, add_generation_prompt=False) for example in batch["conversations"]]
    return {"text": texts}

# Load your dataset from JSON file
def load_dataset_from_json(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    return Dataset.from_dict({"conversations": data})

# Load dataset
dataset_path = 'weather_chatbot_dataset.json'
dataset = load_dataset_from_json(dataset_path)

# Format the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)


Map: 100%|██████████████████████████| 376/376 [00:00<00:00, 13739.48 examples/s]


In [18]:
dataset[0]

{'conversations': {'api_response': {'date': None,
   'description': 'clear sky',
   'humidity': 65,
   'location': 'Warsaw, PL',
   'temperature': 18.25,
   'wind_speed': 3.6},
  'assistant_response': 'The weather in Warsaw is currently clear sky with a temperature of 18.25°C, wind speed of 3.6 meters per second, and humidity of 65%.',
  'intent_extraction': {'entities': {'city': 'Warsaw', 'date': 'today'},
   'intent': 'current_weather'},
  'user_input': 'What is the current weather in Warsaw today?'},
 'text': '<s><|user|>\nWhat is the current weather in Warsaw today?<|end|>\n<|assistant|>\nThe weather in Warsaw is currently clear sky with a temperature of 18.25°C, wind speed of 3.6 meters per second, and humidity of 65%.<|end|>\n'}

In [19]:
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

In [20]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

Map (num_proc=2): 100%|██████████████| 376/376 [00:00<00:00, 1676.71 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [21]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory = 3.811 GB.
2.283 GB of memory reserved.


In [22]:
# Train the model
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 376 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss
1,3.1449
2,3.0386
3,3.1049
4,2.9992
5,2.5628
6,2.1958
7,1.8376
8,1.432
9,1.2966
10,1.1729


In [23]:
# Final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


157.4404 seconds used for training.
2.62 minutes used for training.
Peak reserved memory = 2.598 GB.
Peak reserved memory for training = 0.315 GB.
Peak reserved memory % of max memory = 68.171 %.
Peak reserved memory for training % of max memory = 8.266 %.


In [33]:
# # Inference
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# messages = [
#     {"from": "human", "value": "What is the weather in New York tomorrow?"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# ).to("cuda")

# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
# print(tokenizer.batch_decode(outputs))


In [34]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Define your weather API key
api_key = '337586e7326dcb828d7a386379093040'

# Function to get current weather
def get_weather(city):
    url = f"http://api.openweathermap.org/data/2.5/weather?q={city}&appid={api_key}&units=metric"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

# Function to get weather forecast
def get_forecast(city):
    url = f"http://api.openweathermap.org/data/2.5/forecast?q={city}&appid={api_key}&units=metric"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

# User input
user_input = "What is the weather in New York tomorrow?"

# Step 1: Intent Identification
messages = [{"from": "human", "value": user_input}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

In [40]:
# Generate intent and entity extraction
intent_output = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True)
intent_response = tokenizer.batch_decode(intent_output, skip_special_tokens=True)[0]

print("Identified Intent and Entities:", intent_response)

Identified Intent and Entities: Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8, The next number in the Fibonacci sequence is 13.


In [41]:
import json
intent_data = json.loads(intent_response)

intent = intent_data.get("intent")
city = intent_data.get("entities", {}).get("city")
date = intent_data.get("entities", {}).get("date")

# Step 2: Fetch weather data from API
if intent == "current_weather":
    weather_data = get_weather(city)
elif intent == "forecast_weather":
    weather_data = get_forecast(city)

# Generate the response using the weather data
if weather_data:
    weather_desc = weather_data['weather'][0]['description']
    temp = weather_data['main']['temp']
    wind_speed = weather_data['wind']['speed']
    humidity = weather_data['main']['humidity']

    final_response = (
        f"The weather in {city} is currently {weather_desc} with a temperature of {temp}°C, "
        f"wind speed of {wind_speed} meters per second, and humidity of {humidity}%."
    )
else:
    final_response = "Sorry, I couldn't fetch the weather information."

print("Final Response:", final_response)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [42]:
# Step 3: Generate the final assistant response
messages = [
    {"from": "human", "value": user_input},
    {"from": "gpt", "value": final_response}
]

NameError: name 'final_response' is not defined

In [43]:
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")


In [44]:
# Generate the assistant's response
outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True)
response_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("Assistant Response:", response_text)

# Continuous inference example
messages = [
    {"from": "human", "value": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids=inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)


Assistant Response: Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8, The next number in the Fibonacci sequence is 13.
<s><|user|> Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,<|end|><|assistant|> The next number in the Fibonacci sequence is 13.<|end|>


In [45]:
# model.push_to_hub("your_name/lora_model", token = "...")

In [46]:
# # Continuous inference
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# messages = [
#     {"from": "human", "value": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# ).to("cuda")

# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

# # Save the model

In [28]:
from huggingface_hub import login

In [30]:
# login(token='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')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/vatsal-patel/.cache/huggingface/token
Login successful


In [31]:
from huggingface_hub import create_repo, upload_file

In [32]:
# Set your repository details
repo_name = "phi3-mini-WeatherBot"
repo_id = f"VatsalPatel18/{repo_name}"
local_dir = "./unsloth/Phi-3-mini-4k-instruct"

# Authenticate using the token from the environment variable
token = os.getenv("HUGGINGFACE_HUB_TOKEN")

if not token:
    raise ValueError("The Hugging Face token is not set in the environment variable.")

# # Initialize the HfApi object
# api = HfApi()

# Create the repository if it doesn't exist
create_repo(repo_id, token=token, exist_ok=True)

# Upload each file in the local directory
for root, dirs, files in os.walk(local_dir):
    for file in files:
        if file.startswith("."):
            # Skip hidden files and folders
            continue
        local_file_path = os.path.join(root, file)
        repo_file_path = os.path.relpath(local_file_path, local_dir)
        upload_file(
            path_or_fileobj=local_file_path,
            path_in_repo=repo_file_path,
            repo_id=repo_id,
            token=token
        )

print("Files uploaded successfully.")

model-00002-of-00002.safetensors:   1%|    | 36.2M/3.64G [00:08<09:02, 6.65MB/s]Exception ignored in: <function Dataset.__del__ at 0x70e655b3a320>
Traceback (most recent call last):
  File "/home/vatsal-patel/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 1421, in __del__
KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

# Saving to GGUF format
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")