In [2]:
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
import numpy as np

class ImageAnalyzer:
    """Caption an image and score it against predefined style/emotion tags.

    Three models are loaded at construction time:

    * a BLIP captioning model (processor + generator) for the description,
    * a KeyBERT model that extracts weighted keywords from that caption,
    * a CLIP SentenceTransformer that embeds both the image and the tag texts
      so they can be compared by cosine similarity.
    """

    # Predefined multi-dimensional tag library.
    STYLE_TAGS = (
        # Art styles
        "anime style", "oil painting", "watercolor", "sketch", "cyberpunk",
        "realistic photo", "impressionism", "3D rendering", "pixel art",
        # Photography parameters
        "DSLR", "wide angle lens", "shallow depth of field", "bokeh effect",
        "film grain", "overexposed", "motion blur", "CCD camera", "Nikon D850",
        # Image quality
        "high detail", "sharp focus", "blurry background", "noisy image",
        "high contrast", "vibrant colors", "pastel colors",
    )

    EMOTION_TAGS = (
        # Emotional tone
        "happy", "melancholic", "serene", "chaotic", "mysterious",
        "romantic", "lonely", "energetic", "peaceful", "dramatic",
        # Sensory impression
        "warm tone", "cold palette", "soft lighting", "harsh shadows",
        "dreamy atmosphere", "nostalgic vibe", "futuristic feel",
    )

    AI_PROMPT_TAGS = (
        # Elements common in AI-generation prompts
        "trending on ArtStation", "Unreal Engine", "Octane render",
        "Studio lighting", "intricate details", "symmetrical composition",
        "centered composition", "dramatic angle", "cinematic lighting",
        "photorealistic", "concept art", "digital painting",
    )

    def __init__(
        self,
        caption_model_path="./models/blip-image-captioning-base",
        keyword_model_name="paraphrase-MiniLM-L6-v2",
        clip_model_name="clip-ViT-B-32",
    ):
        """Load all models and pre-compute the tag embeddings.

        Args:
            caption_model_path: Path or identifier passed to
                ``from_pretrained`` for both the BLIP processor and model.
            keyword_model_name: Model identifier handed to ``KeyBERT``.
            clip_model_name: Model identifier handed to
                ``SentenceTransformer`` for image/text embeddings.

        The defaults reproduce the original hard-coded configuration
        (described by the original author as lightweight, < 1.5GB in total).
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Image captioning model (BLIP base).
        self.caption_processor = BlipProcessor.from_pretrained(caption_model_path)
        self.caption_model = BlipForConditionalGeneration.from_pretrained(
            caption_model_path
        ).to(self.device)

        # Semantic keyword extraction model.
        self.kw_model = KeyBERT(keyword_model_name)

        # CLIP model used for image-to-tag similarity.
        self.clip_model = SentenceTransformer(clip_model_name, device=self.device)

        # Per-instance copies so callers may still mutate the lists freely.
        self.style_tags = list(self.STYLE_TAGS)
        self.emotion_tags = list(self.EMOTION_TAGS)
        self.ai_prompt_tags = list(self.AI_PROMPT_TAGS)
        self.all_tags = self.style_tags + self.emotion_tags + self.ai_prompt_tags

        # Tag texts never change per image, so embed them once up front.
        self.tag_embeddings = self.clip_model.encode(
            self.all_tags, convert_to_tensor=True
        )

    @staticmethod
    def _load_image(image_path):
        """Return an RGB image for a file path or an already-open PIL image."""
        if isinstance(image_path, Image.Image):
            return image_path.convert("RGB")
        # ``convert`` reads the pixel data and returns a new image, so the
        # underlying file can be closed as soon as the ``with`` block exits.
        with Image.open(image_path) as handle:
            return handle.convert("RGB")

    @staticmethod
    def _normalize_weights(weights):
        """Scale *weights* so the values sum to 1.

        When the total is not positive (empty input, all-zero scores, or
        scores that cancel out) the values are returned unscaled rather than
        dividing by zero or flipping signs.
        """
        total = sum(weights.values())
        if total <= 0:
            return dict(weights)
        return {key: value / total for key, value in weights.items()}

    @staticmethod
    def _rank_tags(tags, scores, threshold, top_k):
        """Pair tags with scores, keep those above *threshold*, best first."""
        scored = {tag: float(score) for tag, score in zip(tags, scores)}
        kept = [(tag, score) for tag, score in scored.items() if score > threshold]
        kept.sort(key=lambda item: item[1], reverse=True)
        return dict(kept[:top_k])

    def analyze_content(self, image_path):
        """Caption the image and extract normalised keyword weights.

        Args:
            image_path: Path to an image file, or an already-open PIL image.

        Returns:
            A ``(caption, weights)`` tuple where ``weights`` maps each
            extracted key phrase to its share of the total keyword score.
        """
        image = self._load_image(image_path)

        # Generate the caption.
        inputs = self.caption_processor(image, return_tensors="pt").to(self.device)
        caption_ids = self.caption_model.generate(**inputs, max_length=50)
        caption = self.caption_processor.decode(
            caption_ids[0], skip_special_tokens=True
        )

        # Extract weighted keywords from the caption.
        # NOTE(review): KeyBERT appears to apply `diversity` only when
        # `use_mmr=True`; as written it likely has no effect. Left unchanged
        # to keep results stable - confirm the intended behaviour.
        keywords = self.kw_model.extract_keywords(
            caption,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
            top_n=5,
            diversity=0.5,
        )
        keyword_dict = {phrase: float(score) for phrase, score in keywords}

        return caption, self._normalize_weights(keyword_dict)

    def analyze_style(self, image_path, threshold=0.2, top_k=10):
        """Score the image against every predefined tag with CLIP.

        Args:
            image_path: Path to an image file, or an already-open PIL image.
            threshold: Tags whose cosine similarity is not strictly greater
                than this value are dropped.
            top_k: Maximum number of tags to return.

        Returns:
            A dict of tag -> cosine similarity, ordered from highest to
            lowest score.
        """
        image = self._load_image(image_path)
        image_embedding = self.clip_model.encode(image, convert_to_tensor=True)

        # Single image, so take the first row of the similarity matrix.
        cos_scores = util.cos_sim(image_embedding, self.tag_embeddings)[0]
        return self._rank_tags(self.all_tags, cos_scores, threshold, top_k)

    def analyze_image(self, image_path):
        """Run content and style analysis and combine the results.

        Args:
            image_path: Path to an image file, or an already-open PIL image.

        Returns:
            A dict with keys ``"description"``, ``"content_analysis"`` and
            ``"style_analysis"``.
        """
        # Decode the file once and share the result between both analyses.
        image = self._load_image(image_path)
        caption, content_dict = self.analyze_content(image)
        style_dict = self.analyze_style(image)

        return {
            "description": caption,
            "content_analysis": content_dict,
            "style_analysis": style_dict
        }

if __name__ == "__main__":
    analyzer = ImageAnalyzer()

    # Example analysis (replace with the path to your own image).
    result = analyzer.analyze_image("astronaut_rides_horse.png")
    print("图像描述:", result["description"])

    # Both score tables share the same "name: score" layout, so print them
    # from a single table of (heading, result key) pairs.
    sections = (
        ("\n内容分析:", "content_analysis"),
        ("\n风格与情感分析:", "style_analysis"),
    )
    for heading, section_key in sections:
        print(heading)
        for label, score in result[section_key].items():
            print(f"{label}: {score:.2f}")

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc136468d709f1802a8/57fe1319d988e37d3da2bddac58a31b3412c657ed0379a60260053a2f0095539?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250603%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250603T085345Z&X-Amz-Expires=3600&X-Amz-Signature=dcd4493bd5ded8021c80afc6b14aab6b1682a53e0c40c8cecec2646d94c700de&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&x-id=GetObject&Expires=1748944425&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0ODk0NDQyNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMTM2NDY4ZDcwOWYxODAyYTgvNTdmZTEzMTlkOTg4ZTM3ZDNkYTJiZGRhYzU4YTMxYjM0MTJjNjU3ZWQwMzc5YTYwMjYwMDUzYTJmMDA5NTUzOSoifV19&Signature=Cn9s

图像描述: a man in a space suit riding a horse

内容分析:
riding horse: 0.23
suit riding: 0.22
space suit: 0.21
man space: 0.17
horse: 0.16

风格与情感分析:
trending on ArtStation: 0.24
concept art: 0.23
Unreal Engine: 0.23
futuristic feel: 0.23
centered composition: 0.22
3D rendering: 0.21
realistic photo: 0.21
dreamy atmosphere: 0.21
Octane render: 0.21
digital painting: 0.21
