## Predict API using Flask and a fine-tuned Unsloth Llama 3.2 model 🖥️

#### Dependencies 

In [None]:
# install dependencies for `LLM model`
!pip install -U bitsandbytes
!pip install transformers
!pip install torch

# install dependencies for `Flask` 
!pip install flask_cors
!pip install pyngrok
!pip install flask
!pip install request
!pip install jsonify
!pip install threading

#### `Flask`

In [None]:
#imports
from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from pyngrok import ngrok
import torch
from datetime import datetime

In [None]:
import os

# Read the ngrok auth token from the NGROK_AUTH_KEY environment variable so the
# secret does not have to be saved inside the notebook. When the variable is
# not set the value falls back to "" -- replace that fallback with a valid
# ngrok auth key if you prefer to set it inline.
NGROK_AUTH_KEY = os.environ.get("NGROK_AUTH_KEY", "")

In [None]:
# Create the Flask application object for this notebook
app = Flask(import_name=__name__)

# Allow cross-origin requests on every route of the app
CORS(app=app)

# Register the ngrok auth token configured in NGROK_AUTH_KEY
ngrok.set_auth_token(NGROK_AUTH_KEY)

In [None]:
# Identifier of the checkpoint shared by the tokenizer and the model
model_path = "G-210/code_optimization"

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the weights in half precision, then move the model onto the GPU
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
)
model = model.to("cuda")

In [None]:
# Accumulated, date-stamped user queries. This list is module-level, so it is
# shared by every caller of process_query; only queries are stored (model
# replies are not appended).
user_context = []


def process_query(query, *, max_new_tokens=128000):
    """
    Append a query to the shared context and generate the model's reply.

    The query is stamped with today's date, added to the module-level
    ``user_context`` list, and the whole accumulated context is sent to the
    model as a single user message.

    Args:
        query: Text of the new user query.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not echoed back), with surrounding whitespace stripped.
    """
    # Stamp the query with the current date and add it to the running context
    current_date = datetime.now().strftime("%Y-%m-%d")
    user_context.append(f"{current_date}\nUser: {query}")
    full_context = "\n".join(user_context)

    # The whole accumulated context is sent as one user turn
    messages = [
        {"role": "user", "content": full_context},
    ]

    # Tokenize on the same device the model lives on
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # NOTE(review): temperature / min_p only take effect when sampling is
    # enabled (do_sample=True here or in the model's generation config) --
    # confirm the checkpoint's generation config.
    output_tokens = model.generate(
        input_ids=inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        temperature=0.8,
        min_p=0.1,
    )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # completion. Decoding only the completion keeps the chat-template header
    # and the echoed user context out of the result, so no substring filtering
    # is needed (blanket removal of words such as "user" or "system" would
    # also corrupt legitimate text in the reply).
    prompt_length = inputs.shape[-1]
    generated_tokens = output_tokens[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [None]:
@app.route('/predict', methods=['POST'])
def predict():
    """
    Endpoint to handle predictions based on user input.

    Expects a JSON object body carrying the query under the ``"input"`` key
    (the legacy ``"i+nput"`` key is still accepted as a fallback).

    Returns:
        ``{"prediction": <text>}`` on success;
        ``{"error": "No input provided"}`` with status 400 when the body is
        not a JSON object or carries no input;
        ``{"error": <exception message>}`` with status 500 when generation
        raises.
    """
    # silent=True returns None instead of raising on a missing or malformed
    # JSON body, so the caller gets the JSON 400 below rather than a framework
    # error page.
    data = request.get_json(silent=True)
    payload = data if isinstance(data, dict) else {}

    # "input" is the intended key; "i+nput" is kept for existing clients.
    user_input = payload.get("input") or payload.get("i+nput", "")

    if not user_input:
        return jsonify({"error": "No input provided"}), 400

    try:
        # Generate a response using the model
        response = process_query(user_input)
        return jsonify({"prediction": response})
    except Exception as e:
        # Top-level request boundary: report the failure to the client as JSON
        return jsonify({"error": str(e)}), 500

In [None]:
# Start the Ngrok tunnel and Flask app
public_url = ngrok.connect(5000)
print(f"Ngrok tunnel is running at {public_url}")
app.run(host="0.0.0.0", port=5000, threaded=True)