In [None]:
# Offline install of salesforce-lavis: --no-index disables the package index and
# --find-links points pip at the local /kaggle/input/lavis-pip/ directory instead.
!pip install salesforce-lavis --no-index --find-links=file:///kaggle/input/lavis-pip/

In [2]:
# Remove the salesforce-lavis package installed by the previous cell (-y skips the
# confirmation prompt); the following cell installs a different build of it.
# NOTE(review): presumably the first install is kept only for the dependencies it
# pulls in — confirm, otherwise the install/uninstall pair can be dropped.
!pip uninstall -y salesforce-lavis

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Found existing installation: salesforce-lavis 1.0.0
Uninstalling salesforce-lavis-1.0.0:
  Successfully uninstalled salesforce-lavis-1.0.0
[0m

In [None]:
# Install salesforce-lavis again, still offline (--no-index), this time resolving it
# from the 1.0.0.dev1 wheel stored in the lavis-mod-wheel dataset.
# NOTE(review): the dataset name suggests a modified LAVIS build — confirm what it changes.
!pip install salesforce-lavis --no-index --find-links=file:///kaggle/input/lavis-mod-wheel/salesforce_lavis-1.0.0.dev1-py3-none-any.whl

In [4]:
import os
import gc
import cv2
import sys
import torch

import numpy as np
import torch.nn as nn
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from PIL import Image
from lavis.models import load_model, load_preprocess, load_model_and_preprocess
from lavis.processors import load_processor
from lavis.models.blip2_models.blip2_opt import Blip2OPT
from typing import Dict
from sklearn.metrics.pairwise import cosine_similarity 
from pathlib import Path
from accelerate import init_empty_weights

sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer, models

In [5]:
# these helper functions are based on the following repository. 
# https://github.com/FrancescoSaverioZuppichini/Loading-huge-PyTorch-models-with-linear-memory-consumption/blob/main/README.md
def get_keys_to_submodule(model: nn.Module) -> Dict[str, nn.Module]:
    """Map every parameter's state-dict key to the submodule that owns it.

    Args:
        model: Root module to inspect.

    Returns:
        A dict whose keys are fully qualified parameter names (for example
        ``"encoder.fc.weight"``, or just ``"weight"`` for a parameter held
        directly by ``model``) and whose values are the modules that hold
        those parameters as direct attributes.
    """
    keys_to_submodule = {}
    for submodule_name, submodule in model.named_modules():
        # recurse=False yields only the parameters registered directly on this
        # module, so every parameter is visited exactly once instead of once
        # per ancestor module.
        for param_name, _ in submodule.named_parameters(recurse=False):
            # The root module has an empty name and contributes no prefix.
            key = f"{submodule_name}.{param_name}" if submodule_name else param_name
            keys_to_submodule[key] = submodule
    return keys_to_submodule


def load_state_dict_with_low_memory(model: nn.Module, state_dict: Dict[str, torch.Tensor]):
    """Load ``state_dict`` into ``model`` by re-binding its parameters one by one.

    The model is first moved to the ``meta`` device. Each parameter that has
    an entry in ``state_dict`` is then replaced by a frozen ``nn.Parameter``
    wrapping the checkpoint tensor, cast to the dtype the model declared for
    that parameter.

    Args:
        model: Module to populate; it is modified in place.
        state_dict: Mapping from fully qualified parameter names to tensors.

    Warns:
        UserWarning: If some parameters of ``model`` have no (non-None) entry
            in ``state_dict``. Those parameters stay on the ``meta`` device.

    Note:
        Only parameters are re-bound. Buffers are moved by ``model.to`` along
        with everything else and are not restored from ``state_dict`` here.
    """
    import warnings  # local import keeps this helper self-contained

    model.to(torch.device("meta"))
    keys_to_submodule = get_keys_to_submodule(model)
    missing_keys = []
    for key, submodule in keys_to_submodule.items():
        val = state_dict.get(key)
        if val is None:
            # Remember the gap instead of skipping it silently.
            missing_keys.append(key)
            continue

        # The attribute name on the owning submodule is the last key component.
        param_name = key.split('.')[-1]
        param_dtype = getattr(submodule, param_name).dtype
        val = val.to(param_dtype)
        new_val = torch.nn.Parameter(val, requires_grad=False)
        setattr(submodule, param_name, new_val)

    if missing_keys:
        preview = ", ".join(missing_keys[:5])
        if len(missing_keys) > 5:
            preview += ", ..."
        warnings.warn(
            f"{len(missing_keys)} parameter(s) were not found in state_dict "
            f"and remain on the meta device: {preview}"
        )

In [6]:
# Root directory of the competition dataset on Kaggle.
# NOTE(review): comp_path is not referenced by any other cell visible here — confirm it is still needed.
comp_path = Path('/kaggle/input/stable-diffusion-image-to-prompts/')
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Construct the BLIP-2 OPT-2.7b architecture inside accelerate's init_empty_weights()
# context; the actual weights are bound later from a checkpoint by
# load_state_dict_with_low_memory.
with init_empty_weights():
    my_model = Blip2OPT(opt_model="facebook/opt-2.7b")

In [8]:
class DictWrapper:
    """Expose a plain dict through attribute access plus a dict-style ``get``.

    ``wrapper.key`` returns ``d["key"]`` and ``wrapper.get("key", default)``
    forwards to ``d.get``. The wrapped dict itself is available as
    ``wrapper.dict``.
    """

    def __init__(self, d):
        self.dict = d

    def __getattr__(self, name):
        """Return ``self.dict[name]``; raise ``AttributeError`` if it is absent."""
        # __getattr__ only runs after normal lookup fails. If "dict" itself is
        # missing (an instance created without __init__, e.g. by copy or
        # pickle), stop here instead of recursing forever through self.dict.
        if name == "dict":
            raise AttributeError(name)
        try:
            return self.dict[name]
        except KeyError:
            # The attribute protocol (hasattr, getattr with a default) expects
            # AttributeError, not KeyError, for a missing name.
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    def get(self, name, default_val=None):
        """Return ``self.dict[name]`` if present, otherwise ``default_val``."""
        return self.dict.get(name, default_val)

# Settings for the image processors (train / eval variants) and the caption
# text processor, each identified by its "name" entry.
dict_tr = dict(name="blip_image_train", image_size=224)
dict_ev = dict(name="blip_image_eval", image_size=224)
dict_t = dict(name="blip_caption")

# load_preprocess receives a nested mapping whose leaves are DictWrapper
# objects, giving it both attribute access and .get() on every leaf.
config = dict(
    vis_processor=dict(
        train=DictWrapper(dict_tr),
        eval=DictWrapper(dict_ev),
    ),
    text_processor=dict(
        train=DictWrapper(dict_t),
        eval=DictWrapper(dict_t),
    ),
)

# Only the first element of the result (the image processors) is kept.
vis_processors = load_preprocess(config)[0]

In [None]:
# Read the checkpoint from disk and bind its tensors onto my_model's parameters.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
# NOTE(review): no map_location is given and no cell visible here moves my_model to
# `device` — confirm the weights end up on the same device as the input images.
load_state_dict_with_low_memory(my_model, torch.load("/kaggle/input/blip2-pretrained-opt27b-sdpth/blip2_pretrained_opt2.7b_sd.pth"))
# Put the model into evaluation mode before generating captions.
my_model.eval()
# gc.collect() is the cell's last expression, so its return value is shown as the cell output.
gc.collect()

In [10]:
# NOTE(review): pandas and os are already imported in the imports cell near the top;
# these repeated imports are redundant.
import pandas as pd
import os

# Table listing the samples to caption; its 'id' column drives the inference loop.
data_path = '/kaggle/input/sample-data/sample.csv'
# NOTE(review): the path ends in .csv but is read with read_excel — confirm the file
# really is an Excel workbook; for a plain CSV file pd.read_csv is the matching reader.
data_df = pd.read_excel(data_path)

In [16]:
# Directory containing the sample images; the inference loop joins file names onto it.
folder_path = '/kaggle/input/sample-data/media'

In [None]:
# Caption every image listed in data_df['id'] and collect the generated text in
# the same order as the ids.
# NOTE(review): inputs are moved to `device`, but no cell visible here moves my_model
# there — confirm both live on the same device.
pred_prompt_list = []
for i, ids in enumerate(data_df['id']):
    # Progress message every 100 samples.
    if i % 100 == 0:
        print(f"processed {i}th example")
    # Image files are named with the sample id minus one, e.g. id 6 -> "5.jpg".
    image_path = os.path.join(folder_path, f"{ids - 1}.jpg")
    # The context manager closes the underlying file as soon as the RGB copy
    # exists, instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    # Preprocess, add a leading batch dimension of size 1, and move to `device`.
    image = vis_processors["eval"](image).unsqueeze(0).to(device)
    pred_prompt = my_model.generate({"image": image}, num_beams=3)
    # generate returns a sequence; keep the entry for the single image in the batch.
    pred_prompt_list.append(pred_prompt[0])

In [23]:
# Build the result table: one row per generated caption, with the matching sample
# id attached as a second column (aligned on the row index).
data = pd.DataFrame({'caption': pred_prompt_list}).assign(id=data_df['id'])
data.head()

Unnamed: 0,caption,id
0,a large concrete tower in the middle of a city,6
1,a woman sitting at a desk in an office,8
2,a man in a suit and tie sits in front of a tel...,13
3,two people riding bicycles on a dirt road,14
4,a firefighter is walking on the runway at an a...,17


In [25]:
# Save the caption/id table to the Kaggle working directory; index=False leaves the
# row index out of the file.
data.to_csv("/kaggle/working/data.csv",index = False)