In [12]:
import torch
import torch.nn.functional as F
import os
import yaml
from PIL import Image
from lavis.models.multimodal_models.modelmultimodal import Multimodal
from lavis.common.config import Config
from lavis.processors.clip_processors import ClipImageEvalProcessor
import numpy as np

In [2]:
# Load the LAVIS YAML configuration for the CLIP model and echo it so the
# architecture / preprocessing settings are visible in the notebook.
# NOTE(review): within the cells shown here `config_dict` is only printed;
# the model below is built from hard-coded values rather than from it.
config_path = "lavis/configs/models/clip_vit_large14_336.yaml"
with open(config_path, mode="r") as config_file:
    # safe_load restricts parsing to plain YAML types (no arbitrary objects).
    config_dict = yaml.safe_load(config_file)

print(config_dict)

{'model': {'arch': 'clip', 'model_type': 'ViT-L-14-336', 'pretrained': 'openai'}, 'preprocess': {'vis_processor': {'eval': {'name': 'clip_image_eval', 'image_size': 336}}}}


In [3]:
# Prefer the GPU when CUDA is available; otherwise run everything on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Settings forwarded to Multimodal as `vision_cfg`.
vision_cfg = dict(
    image_size=336,
    patch_size=14,
    width=1024,
    layers=24,
)

# Settings forwarded to Multimodal as `text_cfg`.
text_cfg = dict(
    context_length=77,
    vocab_size=49408,
    width=768,
    layers=12,
    heads=12,
)

# Build the model from the two sub-configurations above.
model = Multimodal(
    embed_dim=768,
    vision_cfg=vision_cfg,
    text_cfg=text_cfg,
    quick_gelu=True,
    add_cls_token=True,
)
model.to(device)
# Printing an empty line makes this the final statement of the cell, so the
# return value of `model.to(device)` is not echoed as the cell output.
print("")




In [4]:
# Restore the trained weights from the best checkpoint of the run.
checkpoint_path = "../../ArtpediaClassif/20250410100656/checkpoint_best.pth"

# NOTE(review): torch.load relies on pickle, so only load checkpoints that
# come from a trusted source. `map_location` places the tensors on `device`.
# Despite its name, `state_dict` is the whole checkpoint dictionary; the
# model weights live under its "model" key (see the keys printed below).
state_dict = torch.load(checkpoint_path, map_location=device)

# strict=False does not raise on key mismatches; the mismatching keys are
# returned instead and printed below so they can be checked by eye.
missing, unexpected = model.load_state_dict(state_dict["model"], strict=False)

# Switch to evaluation mode before running inference.
model.eval()

print(f"state_dict {state_dict.keys()}")
print(f"missing {missing}")
print(f"unexpect {unexpected}")

state_dict dict_keys(['model', 'optimizer', 'config', 'scaler', 'epoch'])
missing []
unexpect []


In [19]:
# Assemble a single (image, caption, label) sample for the model.
image_path = "../../model/imgs/maria.jpg"

# Preprocess the picture with the CLIP evaluation transform at 336 px, then
# insert a leading dimension (presumably the batch axis) and move it to `device`.
eval_processor = ClipImageEvalProcessor(image_size=336)
raw_image = Image.open(image_path)
image = eval_processor(raw_image).unsqueeze(0).to(device)

# NOTE(review): the caption contains typos (e.g. "old" for "holds"). It is
# kept verbatim because it is the model input and the recorded predictions
# below correspond to exactly this text.
text_input = ["This a picture of a woman which old a baby in her arms. Around her there is four people"]

samples = dict(
    image=image,
    text_input=text_input,
    label=torch.tensor([1]).to(device),  # 1 = visual, 0 = contextual
)

In [20]:
# Run inference without tracking gradients.
with torch.no_grad():
    output = model.predict(samples)

# Drop singleton dimensions (a single sample is processed here).
token = output["token"].squeeze()

# Keep only positions whose token id is non-zero (presumably 0 is padding),
# then discard the first and last remaining entries (presumably the
# start/end-of-text markers) -- TODO confirm against the tokenizer.
non_zero_mask = token != 0
kept_predictions = output["predictions"].squeeze()[non_zero_mask]
prediction = kept_predictions[1:-1].tolist()

# Turn the token ids back into words for display.
words = model.detokenizer(token)

# One line per word with its predicted class. zip() stops at the shorter of
# the two sequences, so a length mismatch would go unnoticed here.
for word, pred in zip(words, prediction):
    print(f"{word} -> {pred}")

print()
# Sentence-level label: the mean of the per-word predictions, rounded.
# Note that round() resolves an exact 0.5 tie to the even value (0).
sentence_label = round(np.mean(prediction))
print(f"Sentence -> {sentence_label}")

this -> 0
a -> 0
picture -> 0
of -> 0
a -> 0
woman -> 1
which -> 1
old -> 0
a -> 0
baby -> 1
in -> 1
her -> 1
arms -> 1
. -> 1
around -> 1
her -> 1
there -> 1
is -> 1
four -> 0
people -> 1

Sentence -> 1
