In [None]:
# Mount Google Drive at /content/drive; the fine-tuned weights path used later
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install flask transformers peft pyngrok

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)
from peft import PeftModel
from flask import Flask, request, jsonify
import threading

In [None]:
# Base checkpoint on the Hugging Face hub; it is loaded below and the
# fine-tuned weights are applied on top of it (nothing is trained here).
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Path on the mounted Drive to the fine-tuned PEFT weights that are loaded
# onto the base model.
new_model = "/content/drive/MyDrive/eye_chatbot/llama_2_finetuned"

In [None]:
# Load the base checkpoint in float16. device_map={"": 0} maps the whole model
# onto device 0, so a GPU runtime is required.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

# Apply the fine-tuned PEFT weights to the base model and merge them in, so
# `model` is a plain transformers model with the adapter folded into its weights.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# trust_remote_code is deliberately not passed: it allows Python code shipped
# in the hub repository to run locally, and a Llama tokenizer is loaded by
# transformers' own classes without it.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenizer is given a pad token by reusing EOS.
tokenizer.pad_token = tokenizer.eos_token

# Pad on the left for generation: a decoder-only model continues from the last
# position, so right padding (a training-time setting) would put pad tokens
# between the prompt and the generated text if inputs are ever batched.
tokenizer.padding_side = "left"

In [None]:
# Text-generation pipeline wrapping the merged model and its tokenizer.
# The keyword arguments after `tokenizer` are the generation settings applied
# to every call of `pipe`.
# NOTE(review): top_p / top_k / temperature are sampling settings but no
# do_sample is passed here — confirm whether they take effect alongside
# num_beams=5.
# NOTE(review): max_length=150 presumably counts the prompt tokens as well as
# the generated ones — confirm against the generation docs.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=0.4,
    top_k=50,
    top_p=0.9,
    num_beams=5,
    max_length=150,
)

In [None]:
# Flask application object; the /generate route below is registered on it.
app = Flask(__name__)

@app.route("/generate", methods=["POST"])
def generate():
    """Handle POST /generate: run the prompt through `pipe` and return the reply.

    Expects a JSON body of the form {"prompt": "<text>"}.

    Returns:
        A JSON response {"response": "<reply>"} on success, or
        ({"error": "<reason>"}, 400) when the body is not a JSON object
        holding a non-empty string under "prompt".
    """
    # silent=True returns None instead of raising when the body is missing or
    # is not valid JSON, so every malformed request gets the same 400 below.
    data = request.get_json(silent=True)
    prompt = data.get("prompt") if isinstance(data, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        return jsonify(
            {"error": 'Request body must be JSON with a non-empty string "prompt".'}
        ), 400

    model_input = f"<s>[INST] {prompt} [/INST]"
    result = pipe(model_input)
    generated_text = result[0]['generated_text']

    # Take the text that follows the prompt. Stripping the exact prompt prefix
    # (instead of splitting on the first "[/INST]") stays correct when the
    # user's own text contains "[/INST]".
    if generated_text.startswith(model_input):
        completion = generated_text[len(model_input):]
    else:
        # Prompt was not echoed verbatim: fall back to the text after the
        # first marker (empty if the marker is absent).
        completion = generated_text.partition('[/INST]')[2]

    # Stop at any further "[/INST]" the output contains, as the original
    # split-based extraction did.
    completion = completion.split('[/INST]')[0]

    # NOTE(review): this cuts the reply at the first character '1' anywhere in
    # the text, including inside numbers such as "10". Kept as-is because the
    # intent is not visible here — confirm what it is meant to trim.
    response = completion.split('1')[0].strip()
    return jsonify({"response": response})

def run_flask():
    """Run the Flask app, bound to host 0.0.0.0 on port 5000."""
    bind_host, bind_port = '0.0.0.0', 5000
    app.run(host=bind_host, port=bind_port)

# Start the server on its own thread: start() returns right away, so this cell
# finishes while run_flask keeps executing in the background.
threading.Thread(target=run_flask).start()


In [None]:
# Register the ngrok auth token for this runtime. The token is read
# interactively so it never ends up in the notebook source or its outputs.
import subprocess
from getpass import getpass

ngrok_token = getpass("ngrok authtoken: ").strip()
if not ngrok_token:
    raise ValueError("An ngrok authtoken is required.")

# Passed as an argument list (no shell), and checked by hand rather than with
# check=True: CalledProcessError would print the full command, token included.
ngrok_result = subprocess.run(
    ["ngrok", "authtoken", ngrok_token],
    capture_output=True,
    text=True,
)
if ngrok_result.returncode != 0:
    raise RuntimeError(f"ngrok authtoken failed: {ngrok_result.stderr.strip()}")
print(ngrok_result.stdout.strip())


In [None]:
# Run `ngrok http` against port 5000, the port the Flask app is started on
# above. Replace 'your-domain-name' with your own ngrok domain before running.
!ngrok http --domain='your-domain-name' 5000