# In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

class DeviceAdvisorLLM:
    """Chat-LLM wrapper that turns device inspection findings into advice.

    The tokenizer and the 4-bit quantized model are loaded once in
    ``__init__``; ``generate_recommendation`` can then be called repeatedly.
    """

    def __init__(self, model_id: str = "google/gemma-3-2b-it", hf_token: str = ""):
        """
        Initialize Gemma with 4-bit quantization for efficiency.
        Replace 'google/gemma-3-2b-it' with 'google/gemma-3-9b-it' if you have >12GB VRAM.

        NOTE(review): confirm that these model ids actually exist on the
        Hugging Face Hub for the Gemma generation you intend to use.

        Args:
            model_id: Hub id (or local path) of an instruction-tuned causal LM.
            hf_token: Hugging Face access token. An empty value is passed on
                as ``None`` so the library can fall back to a locally stored
                credential instead of sending an empty token.
        """
        print(f"Loading LLM: {model_id}...")

        # An empty string is not a usable token; normalise it to None.
        token = hf_token or None

        # 1. Quantization config: NF4 4-bit weights with float16 compute.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # 2. Load tokenizer & model (device placement is left to "auto").
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            token=token,
        )
        print("LLM Loaded successfully.")

    def generate_recommendation(
        self,
        device_type,
        visual_condition,
        nlp_issues,
        *,
        max_new_tokens=250,
        temperature=0.7,
    ) -> str:
        """
        Synthesizes visual and text data to give a final verdict.

        Args:
            device_type: Device label inserted verbatim into the prompt.
            visual_condition: Visual-inspection label; underscores become
                spaces and the result is title-cased (``cracked_screen`` ->
                ``Cracked Screen``).
            nlp_issues: Iterable of issue labels (underscores become spaces).
                A single string is treated as one issue; an empty or ``None``
                value yields a "no issues reported" placeholder.
            max_new_tokens: Upper bound on generated tokens (keeps the
                answer concise).
            temperature: Sampling temperature. A value of 0 (or below)
                switches to greedy decoding instead of sampling.

        Returns:
            Only the newly generated answer text, stripped of surrounding
            whitespace (the prompt is not included).
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(nlp_issues, str):
            nlp_issues = [nlp_issues]

        # Clean up inputs for better reading
        visual_clean = visual_condition.replace("_", " ").title()
        issues_clean = (
            ", ".join(x.replace("_", " ") for x in nlp_issues)
            if nlp_issues
            else "No specific internal issues reported."
        )

        # 3. Construct the expert prompt as a single user turn.
        chat = [
            {
                "role": "user",
                "content": f"""
                You are an Expert Electronics Technician. Analyze the following device data and provide a recommendation.

                --- DEVICE STATUS REPORT ---
                1. Device Type: {device_type}
                2. Visual Inspection Results: {visual_clean}
                3. User Reported Symptoms: {issues_clean}

                --- TASK ---
                Based on these combined facts, provide a structured response:
                1. **Summary**: A 1-sentence diagnosis of the problem.
                2. **Severity**: Low, Medium, or Critical.
                3. **Action**: Choose one (Repair, Sell/Recycle, Keep/Calibrate).
                4. **Reasoning**: Briefly explain why.
                """
            }
        ]

        # 4. Render the chat with the tokenizer's own chat template.
        prompt = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )

        # 5. Tokenize. The rendered template already carries the special
        # tokens, so they must not be added a second time here. Calling the
        # tokenizer (rather than ``encode``) also yields the attention mask.
        inputs = self.tokenizer(
            prompt, return_tensors="pt", add_special_tokens=False
        ).to(self.model.device)

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": temperature > 0,
        }
        if generation_kwargs["do_sample"]:
            generation_kwargs["temperature"] = temperature

        outputs = self.model.generate(**inputs, **generation_kwargs)

        # 6. Drop the prompt tokens by position instead of splitting the
        # decoded text on "model\n", which would cut off any answer that
        # happens to contain that substring.
        prompt_length = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_length:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()