In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 🔹 Hugging Face access token.
# Leave this empty to rely on the library's default credential lookup
# (e.g. a token saved by `huggingface-cli login`); avoid committing a real
# token into the notebook.
HUGGINGFACE_TOKEN = ""  # Replace with your actual token

# 🔹 Load Mistral model (Make sure you have access!)
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# `token=` is the current keyword; `use_auth_token=` is deprecated.
# An empty placeholder is mapped to None so that an empty string is never
# sent as a credential.
_hf_token = HUGGINGFACE_TOKEN or None

sentiment_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=_hf_token,
).to("cuda" if torch.cuda.is_available() else "cpu")

sentiment_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=_hf_token,
)

In [None]:
import torch
from PIL import Image
import gradio as gr
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM,
    CLIPProcessor, CLIPModel
)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Device setup: run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Emotion labels: the candidate classes scored by detect_emotion and offered
# in the UI dropdown.
emotion_labels = [
    "happy", "sad", "angry", "surprised", "neutral",
    "fearful", "disgusted", "excited", "peaceful"
]

# Checkpoint identifiers, named once so that every processor/tokenizer is
# loaded from exactly the same checkpoint as its model.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
MISTRAL_CHECKPOINT = "mistralai/Mistral-7B-Instruct-v0.2"
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Load models
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(device)

# trust_remote_code is deliberately NOT enabled: the tokenizer class ships with
# transformers, so there is no reason to allow code from the Hub repository to
# execute locally.
mistral_tokenizer = AutoTokenizer.from_pretrained(MISTRAL_CHECKPOINT)
mistral_model = AutoModelForCausalLM.from_pretrained(
    MISTRAL_CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.float16,  # half precision to reduce memory use
)

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(device)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# VADER analyzer used to score the generated comment.
analyzer = SentimentIntensityAnalyzer()

# Emotion Detection
def detect_emotion(image: Image.Image):
    """Pick the emotion label whose CLIP prompt best matches ``image``.

    A prompt of the form "This image makes me feel <emotion>." is built for
    every entry of ``emotion_labels``; CLIP scores the image against all of
    them and the scores are turned into probabilities with a softmax.

    Args:
        image: The picture to classify.

    Returns:
        A ``(label, confidence)`` tuple: the best-scoring emotion label and
        its softmax probability as a Python float.
    """
    prompts = []
    for emotion in emotion_labels:
        prompts.append(f"This image makes me feel {emotion}.")

    clip_inputs = clip_processor(
        text=prompts,
        images=image,
        return_tensors="pt",
        padding=True,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        image_logits = clip_model(**clip_inputs).logits_per_image
        distribution = image_logits.softmax(dim=1)

    # Index of the highest-probability prompt (single image, so row 0).
    winner = distribution.argmax()
    return emotion_labels[winner], distribution[0][winner].item()

# Generate Comment
def _build_comment_prompt(caption: str, emotion_label: str, mode: str):
    """Return ``(prompt, max_new_tokens)`` for the requested comment mode."""
    # The prompt intentionally has no literal "<s>": the tokenizer prepends
    # the BOS token itself, so spelling it out would duplicate it.
    if mode == "short":
        prompt = f"""[INST] Here's a description of an image: "{caption}". The person in the image is feeling "{emotion_label}". Write a *one-line*, expressive, emotional comment that reflects this vibe. Be human-like, creative, and fun. [/INST]"""
        return prompt, 40
    prompt = f"""[INST] Here's a description of an image: "{caption}". The person in the image is feeling "{emotion_label}". Write a short *paragraph* that's expressive, vivid, and emotional. Capture the atmosphere and the feeling. Be poetic and engaging. [/INST]"""
    return prompt, 100


def _sampled_perplexity(step_scores, token_ids) -> float:
    """Return the perplexity of ``token_ids`` under the per-step scores.

    ``step_scores`` is the ``scores`` tuple returned by ``generate`` (one
    tensor per generated step) and ``token_ids`` the generated token ids of
    the first sequence. Returns NaN when nothing was generated.
    """
    if len(step_scores) == 0 or token_ids.numel() == 0:
        return float("nan")
    # Row 0 of every step -> (steps, vocab). Work in float32 log-space so
    # half-precision probabilities cannot underflow to zero before the log.
    step_logits = torch.stack(step_scores, dim=0)[:, 0, :].float()
    log_probs = step_logits.log_softmax(dim=-1)
    ids = token_ids[: log_probs.shape[0]].to(log_probs.device)
    chosen_log_probs = log_probs.gather(1, ids.unsqueeze(1)).squeeze(1)
    return torch.exp(-chosen_log_probs.mean()).item()


def generate_comment(image: Image.Image, emotion_label: str, mode: str = "short"):
    """Caption ``image`` with BLIP and write an emotional comment with Mistral.

    Args:
        image: The picture to describe.
        emotion_label: Emotion to inject into the prompt.
        mode: ``"short"`` for a one-line comment (up to 40 new tokens); any
            other value produces a short paragraph (up to 100 new tokens).

    Returns:
        A ``(caption, comment, perplexity)`` tuple. ``perplexity`` is computed
        from the scores returned by ``generate`` while sampling with
        temperature/top-p, so it is an approximation rather than the model's
        raw perplexity.
    """
    blip_inputs = blip_processor(image, return_tensors="pt").to(device)
    caption_ids = blip_model.generate(**blip_inputs)
    caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

    prompt, max_tokens = _build_comment_prompt(caption, emotion_label, mode)

    inputs = mistral_tokenizer(prompt, return_tensors="pt").to(mistral_model.device)
    outputs = mistral_model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.9,
        top_p=0.9,
        # Explicit pad id: avoids the per-call "Setting pad_token_id" warning.
        pad_token_id=mistral_tokenizer.eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Decode only the newly generated tokens instead of splitting the full
    # decoded text on "[/INST]", which breaks if the marker is altered by
    # decoding or appears in the generated text.
    prompt_length = inputs.input_ids.shape[1]
    gen_ids = outputs.sequences[0][prompt_length:]
    comment = mistral_tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # Estimate Perplexity
    perplexity = _sampled_perplexity(outputs.scores, gen_ids)

    return caption, comment, perplexity

# Sentiment
def evaluate_comment(comment):
    """Classify the sentiment of ``comment`` using the VADER analyzer.

    Args:
        comment: Text to score.

    Returns:
        A ``(sentiment_type, valence)`` tuple where ``valence`` is VADER's
        compound score and ``sentiment_type`` is ``"positive"``,
        ``"negative"`` or ``"neutral"``.
    """
    valence = analyzer.polarity_scores(comment)["compound"]

    # VADER's documented cut-offs are inclusive: >= 0.05 is positive and
    # <= -0.05 is negative; everything in between is neutral.
    if valence >= 0.05:
        sentiment_type = "positive"
    elif valence <= -0.05:
        sentiment_type = "negative"
    else:
        sentiment_type = "neutral"
    return sentiment_type, valence

# Main Gradio pipeline
def process(image, mode, emotion_choice):
    """Run the full pipeline for one Gradio request.

    Detects the image's emotion, generates a caption and an emotional
    comment, then scores the comment's sentiment.

    Args:
        image: The uploaded image, or ``None`` if nothing was uploaded.
        mode: Comment mode forwarded to ``generate_comment``.
        emotion_choice: ``"auto"`` to use the detected emotion, otherwise the
            emotion label to use in the prompt.

    Returns:
        Six display strings: caption, detected emotion, emotion used,
        generated comment, sentiment and approximate perplexity.

    Raises:
        gr.Error: If no image was provided.
    """
    # Submitting without an upload yields None; fail with a readable message
    # instead of an opaque traceback from the image processors.
    if image is None:
        raise gr.Error("Please upload an image first.")

    # Detection always runs because its result is shown even when the user
    # overrides the emotion.
    detected_emotion, confidence = detect_emotion(image)
    emotion_used = emotion_choice if emotion_choice != "auto" else detected_emotion
    caption, comment, perplexity = generate_comment(image, emotion_used, mode)
    sentiment_type, valence = evaluate_comment(comment)

    return (
        f"🖼 Caption: {caption}",
        f"🎭 Detected Emotion: {detected_emotion} (confidence: {confidence:.2f})",
        f"🧠 Emotion Used: {emotion_used}",
        f"💬 Emotional Comment:\n> {comment}",
        f"📊 Sentiment: {sentiment_type} (valence: {valence:.2f})",
        f"📉 Approx. Perplexity: {perplexity:.2f}"
    )

# Gradio Interface
# Input widgets, in the positional order of process(image, mode, emotion_choice).
input_components = [
    gr.Image(type="pil", label="Upload an Image"),
    gr.Radio(choices=["short", "paragraph"], value="short", label="Comment Mode"),
    gr.Dropdown(["auto"] + emotion_labels, value="auto", label="Emotion (or auto-detect)"),
]

# One textbox per string returned by process, in the same order.
output_labels = (
    "Image Caption",
    "Detected Emotion",
    "Emotion Used",
    "Generated Comment",
    "Sentiment",
    "Perplexity",
)
output_components = [gr.Textbox(label=text) for text in output_labels]

demo = gr.Interface(
    fn=process,
    inputs=input_components,
    outputs=output_components,
    title="🧠 Emotional Comment Generator",
    description="Upload an image, and this app generates an emotional comment using Mistral + BLIP + CLIP. Detects emotion, sentiment, and perplexity!",
)

# Start the web UI.
demo.launch()