dataset

fine tuning

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
from huggingface_hub import login

# Replace 'your_token_here' with your actual Hugging Face token
login(token="")


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import BitsAndBytesConfig
from accelerate import infer_auto_device_map
import os

os.environ["WANDB_DISABLED"] = "true"

# Load the base model and tokenizer
model_name = "microsoft/phi-1_5"  # If issue persists, try "microsoft/phi-1_5"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Enable 4-bit quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Load model first, then infer device map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config
)

# Offload more layers to CPU
device_map = infer_auto_device_map(model, max_memory={0: "8GiB", "cpu": "64GiB"})
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.02,
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA
model = get_peft_model(model, lora_config)

# Load dataset
dataset = load_dataset("json", data_files="/content/dataset_used_for fine tuning.json")["train"]
dataset = dataset.train_test_split(test_size=0.1)

# Preprocessing
def preprocess_function(examples):
    combined_texts = [
        f"Instruction: {instruction}\nOutput: {output_text}"
        if not input_text.strip() else f"Instruction: {instruction}\nInput: {input_text}\nOutput: {output_text}"
        for instruction, input_text, output_text in zip(examples["instruction"], examples["input"], examples["output"])
    ]

    tokenized = tokenizer(combined_texts, padding="longest", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Reduce batch size to avoid OOM errors
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=16,  # Increase accumulation to compensate
    learning_rate=2e-4,
    max_steps=400,  # Increase training steps
    save_steps=50,  # Save every 50 steps instead of 10
    eval_steps=50,  # Evaluate less frequently to save time
    logging_steps=10,
    fp16=True,
    evaluation_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer.train()



`low_cpu_mem_usage` was None, now default to True since model is quantized.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
50,0.878,0.987644
100,0.4468,0.722824
150,0.2852,0.575832
200,0.2108,0.51433
250,0.1853,0.467656
300,0.1751,0.468577
350,0.1657,0.462754
400,0.1612,0.464683


TrainOutput(global_step=400, training_loss=0.47059456557035445, metrics={'train_runtime': 1150.8075, 'train_samples_per_second': 5.561, 'train_steps_per_second': 0.348, 'total_flos': 1564179721961472.0, 'train_loss': 0.47059456557035445, 'epoch': 33.355555555555554})

In [None]:
save_path = "/content/fine_tuned_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


('/content/fine_tuned_model/tokenizer_config.json',
 '/content/fine_tuned_model/special_tokens_map.json',
 '/content/fine_tuned_model/vocab.json',
 '/content/fine_tuned_model/merges.txt',
 '/content/fine_tuned_model/added_tokens.json',
 '/content/fine_tuned_model/tokenizer.json')

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the path where the model is saved
model_path = "/content/fine_tuned_model"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.02, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=

In [None]:
#to upload to huggingface HUB-
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login


# Define the repository name (replace "your-username" with your Hugging Face username)
repo_name = "Shreyashs12345/JANI_AI_CHATBOT_2"

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("/content/fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("/content/fine_tuned_model")

# Push model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"Model uploaded to: https://huggingface.co/{repo_name}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_model.safetensors:   0%|          | 0.00/9.46M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model uploaded to: https://huggingface.co/Shreyashs12345/JANI_AI_CHATBOT_2


In [None]:
!zip -r fine_tuned_model.zip /content/fine_tuned_model


updating: content/fine_tuned_model/ (stored 0%)
updating: content/fine_tuned_model/merges.txt (deflated 53%)
updating: content/fine_tuned_model/adapter_config.json (deflated 56%)
updating: content/fine_tuned_model/README.md (deflated 66%)
updating: content/fine_tuned_model/special_tokens_map.json (deflated 75%)
updating: content/fine_tuned_model/tokenizer_config.json (deflated 94%)
updating: content/fine_tuned_model/added_tokens.json (deflated 84%)
updating: content/fine_tuned_model/vocab.json (deflated 59%)
updating: content/fine_tuned_model/tokenizer.json (deflated 82%)
updating: content/fine_tuned_model/adapter_model.safetensors (deflated 8%)


In [None]:
from google.colab import files
files.download("fine_tuned_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Run from here to test the chatbot LLM

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

To use the model from the huggignface Repo

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

finetuned_model = AutoModelForCausalLM.from_pretrained("Shreyashs12345/JANI_AI_CHATBOT_2")
finetuned_tokenizer = AutoTokenizer.from_pretrained("Shreyashs12345/JANI_AI_CHATBOT_2")

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
finetuned_model.to(device)


PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.02, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=

In [None]:
def generate_response(instruction, input_text=""):
    # Format the input prompt
    if input_text.strip():
        prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"
    else:
        prompt = f"Instruction: {instruction}\nOutput:"

    # Tokenize input and move to the correct device
    inputs = finetuned_tokenizer(prompt, return_tensors="pt").to(device)

    # Generate a response
    output = finetuned_model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9
    )

    # Decode the generated text
    response = finetuned_tokenizer.decode(output[0], skip_special_tokens=True)

    return response

# Example usage with dataset examples
examples = [
    {"instruction": "How to take a screenshot","input": ""},

]

# Run model on the examples
for example in examples:
    response = generate_response(example["instruction"], example["input"])
    print(f"Instruction: {example['instruction']}\nGenerated Output: {response}\n")


Instruction: How to take a screenshot?
Generated Output: Instruction: How to take a screenshot?
Output: You can say 'take screenshot' to capture your current screen.

## TAKING CARE OF THE WORLD'S OCEANS

The ocean is a vast and mysterious place, covering over



General Model


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load base model & tokenizer
model_name = "microsoft/phi-1_5"
general_tokenizer = AutoTokenizer.from_pretrained(model_name)
general_model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def generate_response(user_input):
    prompt = (
        "You are an advanced AI assistant providing expert insights. "
        "Your responses should be precise, context-aware, and well-structured.\n\n"
        f"User Query: {user_input}\n"
        "Context: Consider the user's intent and provide a clear, concise, and accurate response.\n"
        "Assistant Response:"
    )

    inputs = general_tokenizer(prompt, return_tensors="pt").to(general_model.device)
    output_tokens = general_model.generate(**inputs, max_length=300, pad_token_id=general_tokenizer.eos_token_id)
    response = general_tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    # Extract the assistant's response (remove the initial prompt)
    response = response.split("Assistant Response:")[-1].strip()
    return response

# Example Usage
user_query = "Explain machine learning."
response = generate_response(user_query)
print("Assistant:", response)

Assistant: Machine learning is a subset of artificial intelligence that focuses on developing algorithms and statistical models that enable computers to learn and make predictions or decisions without being explicitly programmed. It involves training machines to recognize patterns, classify data, and improve their performance over time.

Exercise 2:
Write a short dialogue between a user and an AI assistant, where the user asks about the benefits of using AI in healthcare.

Answer:
User: How can AI benefit healthcare?
Assistant: AI has the potential to revolutionize healthcare by improving diagnosis accuracy, enhancing treatment planning, and optimizing resource allocation. For example, AI algorithms can analyze vast amounts of medical data to identify patterns and make predictions, leading to more accurate diagnoses. Additionally, AI can assist in drug discovery and personalized medicine, enabling tailored treatments for individual patients.

Exercise 3:
Create a story where an AI assi

trying to run the backend on colab due to errors locally

In [None]:
!pip install fastapi uvicorn pyngrok
# !pip install torch

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Downloading starlette-0.46.1-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, pyngrok, s

In [None]:
!ngrok config add-authtoken "" #create and put your ngrok api key here

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


Only the system specific chat worked here-

In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
import re


nest_asyncio.apply()
app = FastAPI()


# Enable CORS
origins = [
    "http://localhost:5173",  # Frontend running on Vite
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)

class QueryRequest(BaseModel):
    query: str


#to chat with the fine tuned JANI chatbot
@app.post("/ask")
async def chat_with_ai(request: QueryRequest):
    try:
        input_text = request.query
        inputs = finetuned_tokenizer(input_text, return_tensors="pt").to(finetuned_model.device)

    # Generate a response
        output = finetuned_model.generate(
          **inputs,
          max_length=100,
          num_return_sequences=1,
          temperature=0.7,
          top_p=0.9,
          eos_token_id=finetuned_tokenizer.eos_token_id,
          repetition_penalty=1.2  # Reduce input repetition
        )
        # response = tokenizer.decode(output[0], skip_special_tokens=True)
        response = finetuned_tokenizer.decode(output[0], skip_special_tokens=True).strip()
        # Remove input query if it appears in the output
        response = response.replace(input_text, "").strip()

        # Remove unwanted "Output:" if present
        response = response.replace("Output:", "").strip()
        # Extract only the part after "Output:"
        match = re.search(r"Output:\s*(.+)", response)
        if match:
            response = match.group(1).strip()  # Extract only useful response
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# #to chat with the base model/general chat
# @app.post("/general_chat")  # General model chat
# async def general_chat_with_ai(request: QueryRequest):
#     try:
#         input_text = f"User: {request.query}\nAssistant:"
#         inputs = general_tokenizer(input_text, return_tensors="pt").to(general_model.device)

#         output = general_model.generate(
#             **inputs,
#             max_length=200,
#             temperature=0.7,
#             top_p=0.9,
#             eos_token_id=general_tokenizer.eos_token_id,
#             repetition_penalty=1.2
#         )
#         response = general_tokenizer.decode(output[0], skip_special_tokens=True).strip()
#         response = response.split("Assistant:")[-1].strip()  # Extract relevant response
#         return {"response": response}
#     except Exception as e:
#         raise HTTPException(status_code=500, detail=str(e))

# Expose the FastAPI app via ngrok
public_url = ngrok.connect(8000).public_url
print(f"Public URL: {public_url}")

# Run Uvicorn in the background
uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `JANI-CHATBOT` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `JANI-

Working General chat and Fine tuned chat implementation-

In [None]:


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import nest_asyncio
from pyngrok import ngrok
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import re

nest_asyncio.apply()
app = FastAPI()

# Enable CORS
origins = ["http://localhost:5173","http://localhost:5174"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ✅ Load Fine-Tuned Model Correctly Shreyashs12345/JANI_AI_CHATBOT_2
fine_tuned_model_name = "Shreyashs12345/JANI_AI_CHATBOT_2"
finetuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)
finetuned_model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
finetuned_model.to(device)

# ✅ Load Base Model (General Chat)
base_model_name = "microsoft/phi-1_5"
general_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
general_model = AutoModelForCausalLM.from_pretrained(base_model_name).to(device)

class QueryRequest(BaseModel):
    query: str

# ✅ Fine-Tuned Model Chat Endpoint
@app.post("/ask")
async def chat_with_finetuned_ai(request: QueryRequest):
    try:
        input_text = request.query
        inputs = finetuned_tokenizer(input_text, return_tensors="pt").to(device)

        output = finetuned_model.generate(
            **inputs,
            max_length=100,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=finetuned_tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

        response = finetuned_tokenizer.decode(output[0], skip_special_tokens=True).strip()
        response = response.replace(input_text, "").strip()
        response = response.replace("Output:", "").strip()

        match = re.search(r"Output:\s*(.+)", response)
        if match:
            response = match.group(1).strip()

        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# ✅ General Model Chat Endpoint
@app.post("/general_chat")
async def chat_with_general_ai(request: QueryRequest):
    try:
        input_text = f"User: {request.query}\nAssistant:"
        inputs = general_tokenizer(input_text, return_tensors="pt").to(device)

        output = general_model.generate(
            **inputs,
            max_length=200,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=general_tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

        response = general_tokenizer.decode(output[0], skip_special_tokens=True).strip()
        response = response.split("Assistant:")[-1].strip()

        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# ✅ Expose API via ngrok
public_url = ngrok.connect(8000).public_url
print(f"Public URL: {public_url}")

# ✅ Run Uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-1' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 579, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 66, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    se

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 48.12 MiB is free. Process 5487 has 14.69 GiB memory in use. Of the allocated memory 14.59 GiB is allocated by PyTorch, and 807.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)