# Load Model and Tokenizer

In [1]:
import os
import sys
import torch
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel # Crucial for loading LoRA adapters

# --- Global Model Loading (occurs once when the app starts) ---
tokenizer = None
model = None
# This path must match the 'output_dir' from your training script
MODEL_SAVE_PATH = "./flan_t5_fine_tuned_model"
# This is the name of the original base model you used for fine-tuning
BASE_MODEL_NAME = "google/flan-t5-base"

try:
    print("Loading tokenizer and model...")
    # 1. Load the tokenizer that was saved during training (it's the same as the base model's tokenizer)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH) # Load saved tokenizer for consistency

    # 2. Load the base FLAN-T5 model
    base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)

    # 3. Load the LoRA adapters and apply them to the base model
    # The 'MODEL_SAVE_PATH' directory contains the LoRA adapter weights (e.g., 'adapter_model.bin' or 'adapter_model.safetensors')
    if os.path.exists(MODEL_SAVE_PATH) and os.path.isdir(MODEL_SAVE_PATH):
        model = PeftModel.from_pretrained(base_model, MODEL_SAVE_PATH)
        print(f"LoRA adapters loaded from {MODEL_SAVE_PATH}.")
        # Optional: If you want to merge the LoRA adapters into the base model's weights
        # for potentially slightly faster inference and a self-contained model,
        # you can uncomment the line below. This makes the model larger in memory.
        # model = model.merge_and_unload()
        # print("LoRA adapters merged into the base model.")
    else:
        print(f"Warning: LoRA adapters not found at '{MODEL_SAVE_PATH}'. "
              "Ensure training script was run and directory exists. "
              "Falling back to original base model (will not have fine-tuned behavior).", file=sys.stderr)
        model = base_model # Fallback if adapters aren't found. In production, you might exit.

    model.eval() # Set the model to evaluation mode (crucial for inference)

    # Move the model to GPU if available for faster predictions
    if torch.cuda.is_available():
        model.to("cuda")
        print("Model moved to GPU.")
    else:
        print("Running model on CPU.")

    print("Model and tokenizer loaded successfully!")

except Exception as e:
    print(f"Error loading model or tokenizer: {e}", file=sys.stderr)
    sys.exit(1) # Exit if model loading fails, as the app cannot function without it


  from .autonotebook import tqdm as notebook_tqdm


Loading tokenizer and model...
LoRA adapters loaded from ./flan_t5_fine_tuned_model.
Running model on CPU.
Model and tokenizer loaded successfully!


# Run Flask Server

In [3]:
# --- Flask Routes ---
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get data from the POST request
        data = request.get_json()

        # Input validation: Ensure 'text' key exists and is a string
        if not data or 'text' not in data or not isinstance(data['text'], str):
            return jsonify({'error': "Invalid request. Please provide a JSON body with a 'text' field (string)."}), 400

        input_text = data['text']

        # --- Preprocess input text for your model ---
        # IMPORTANT: Use the exact same instruction prefix as during training!
        input_for_model = f"Identify intent and filters: {input_text}"
        inputs = tokenizer(
            input_for_model,
            return_tensors="pt",
            truncation=True,
            padding="max_length", # Ensure consistent input length
            max_length=128        # Match max_length used during training preprocessing
        )

        # Move input tensors to the same device as the model (GPU/CPU)
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # --- Make prediction ---
        with torch.no_grad(): # Disable gradient calculation for inference (saves memory and speeds up)
            # Use the .generate() method for Seq2Seq models like T5
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=128, # Max length of the generated output (adjust as needed)
                num_beams=5,        # Use beam search for potentially better quality (can be adjusted)
                early_stopping=True # Stop generation if all beam hypotheses have finished
            )

        # Decode the generated output back to a human-readable string
        prediction_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # --- Return the prediction as a JSON response ---
        print(prediction_output)
        return jsonify({'prediction': prediction_output})

    except KeyError as e:
        # Specific error for missing 'text' key in the JSON
        return jsonify({'error': f"Missing expected data field: {e}. Please provide a JSON body with a 'text' key."}), 400
    except Exception as e:
        # Catch any other unexpected errors during processing or prediction
        print(f"An error occurred during prediction: {e}", file=sys.stderr) # Log the full error on the server side
        return jsonify({'error': "An internal server error occurred during prediction. Please try again later."}), 500

if __name__ == '__main__':
    # Run the Flask app on a specified host and port.
    # '0.0.0.0' makes it accessible from other devices on your local network.
    # Set debug=False for production environments.
    app.run(host='192.168.1.11', port=5000, debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://192.168.1.11:5000
Press CTRL+C to quit
