In [None]:
!pip install transformers accelerate bitsandbytes pandas Pillow tqdm scipy safetensors

In [None]:
import gc
import warnings

import pandas as pd
import torch
from PIL import Image
from transformers import (
    BitsAndBytesConfig,
    InstructBlipForConditionalGeneration,
    InstructBlipProcessor,
)

In [None]:
# Load the InstructBLIP model and its processor once; the captioning cells reuse
# `model`, `processor` and `device`.

# Device that the processor outputs are moved to before generation.
# NOTE(review): 8-bit bitsandbytes loading generally needs a CUDA GPU, so the "cpu"
# fallback probably cannot actually load this model -- confirm before relying on it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single checkpoint id shared by the model and the processor so the two cannot drift apart.
# Original author note: "use 13b for error response" -- presumably the 13B checkpoint is
# meant as a fallback when the 7B output is unsatisfactory; TODO confirm.
MODEL_ID = "Salesforce/instructblip-vicuna-7b"

model = InstructBlipForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    # Passing `load_in_8bit=True` directly is deprecated; the quantization settings
    # belong in a BitsAndBytesConfig handed over via `quantization_config`.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
processor = InstructBlipProcessor.from_pretrained(MODEL_ID, use_fast=True)


In [None]:
# Load CSV
# NOTE: this cell is disabled. Everything below is wrapped in a triple-quoted string,
# so none of it executes; the string is only evaluated as a bare expression.
# The dormant code reads base_caption.csv, generates a caption only for the rows whose
# 'caption' column is NaN or an empty string, writes each caption back into `df`, and
# finally saves the frame to updated_caption.csv.
"""
import gc
import warnings
warnings.filterwarnings("ignore", message="The `language_model` is not in the `hf_device_map`")

df = pd.read_csv('/kaggle/input/base-caption-lack/base_caption.csv')

# Config paths
IMAGE_DIR = "/kaggle/input/track-1-private-set/Track 1 - Private Set/matched_images/"  # Adjust path
IMAGE_EXT = ".jpg"  # or ".png", ".jpeg"

empty_caption_mask = df['caption'].isna() | (df['caption'] == '')
empty_indices = df[empty_caption_mask].index

# Process images

results = []
for idx in empty_indices:
    try:
        # Load image
        image_path = f"{IMAGE_DIR}{df.loc[idx, 'matched_image_id']}{IMAGE_EXT}"
        image = Image.open(image_path)
        
        # Generate caption
        prompt = "Describe this image in detail. Focus on the visible people, objects, setting, activities, lighting, atmosphere and any notable elements that would be important for news reporting. Focus on factual details and spatial relationships to create a rich visual narrative."
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = model.generate(**inputs, 
                                     max_length = 350, 
                                     min_length = 50, 
                                     length_penalty=1.2,
                                     repetition_penalty=1.12,
                                     num_beams=5,
                                     pad_token_id=processor.tokenizer.eos_token_id,
                                     early_stopping=True)
        
        caption_full = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        if prompt in caption_full:
            caption = caption_full.split(prompt)[1].strip()
        else:
            caption = caption_full
        del inputs, outputs
        torch.cuda.empty_cache()
        gc.collect()
        df.loc[idx, 'caption'] = caption
        print(f"Processed {df.loc[idx, 'query_id']}")
        
    except Exception as e:
        print(f"Error processing {df.loc[idx, 'query_id']}: {e}")

df.to_csv("updated_caption.csv", index=False)
"""

In [None]:
# Caption the matched image of every selected query and save the result to captions.csv.
# Relies on `model`, `processor` and `device` created in the model-loading cell.
warnings.filterwarnings("ignore", message="The `language_model` is not in the `hf_device_map`")

# Load CSV
# link_images.csv must provide the `query_id` and `matched_image_id` columns read below.
df = pd.read_csv('/kaggle/input/matched-images-real/link_images.csv') #Adjust your path, link_images.csv is the file csv that contains the query_id and the matched image in database

# Config paths
# IMAGE_DIR is concatenated directly with the image id, so it has to end with "/".
IMAGE_DIR = "/kaggle/input/track-1-private-set/Track 1 - Private Set/matched_images/"  # Adjust path (image database folder)
IMAGE_EXT = ".jpg"  # or ".png", ".jpeg"

# Choose which half to process
PROCESS_HALF = "first"  # "first" or "second"

# Upper bound on the rows of the chosen half that are actually captioned.
# 2 keeps the quick smoke-test behaviour; set to None to caption the whole half.
MAX_ROWS = 2

# The instruction is identical for every image, so it is built once outside the loop.
prompt = "Describe this image in detail. Focus on the visible people, objects, setting, activities, lighting, atmosphere and any notable elements that would be important for news reporting. Focus on factual details and spatial relationships to create a rich visual narrative."

half_point = len(df) // 2
if PROCESS_HALF == "first":
    df_subset = df.iloc[:half_point]
elif PROCESS_HALF == "second":
    df_subset = df.iloc[half_point:]
else:
    # Fail here with a clear message instead of a NameError on `df_subset` further down.
    raise ValueError(f'PROCESS_HALF must be "first" or "second", got {PROCESS_HALF!r}')
print(f"Processing {PROCESS_HALF} half: {len(df_subset)} rows")

rows_to_process = df_subset if MAX_ROWS is None else df_subset.head(MAX_ROWS)
if len(rows_to_process) < len(df_subset):
    print(f"MAX_ROWS={MAX_ROWS}: captioning only the first {len(rows_to_process)} rows of this half")


def _generate_caption(image_path, prompt):
    """Return the caption the model generates for the image stored at ``image_path``.

    Args:
        image_path: Path of the image file to describe.
        prompt: Instruction text passed to the processor together with the image.

    Returns:
        The decoded text with surrounding whitespace stripped. If the decoded text
        contains the prompt, only the part after its first occurrence is returned.

    Raises:
        Whatever ``Image.open``, the processor or ``model.generate`` raise; the caller
        decides how to handle a failed image.
    """
    # The context manager closes the file handle; convert() forces the pixel data to be
    # read before that happens and yields an RGB copy that outlives the `with` block.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=350,
            min_length=50,
            length_penalty=1.0,
            repetition_penalty=1.1,
            num_beams=5,
            pad_token_id=processor.tokenizer.eos_token_id,
            early_stopping=True,
        )

    caption_full = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    if prompt in caption_full:
        # maxsplit=1 keeps everything after the first occurrence of the prompt, even if
        # the same text shows up again later in the output.
        return caption_full.split(prompt, 1)[1].strip()
    return caption_full


# Process images
results = []
for _, row in rows_to_process.iterrows():
    try:
        image_path = f"{IMAGE_DIR}{row['matched_image_id']}{IMAGE_EXT}"
        caption = _generate_caption(image_path, prompt)
        results.append({
            'query_id': row['query_id'],
            'matched_image_id': row['matched_image_id'],
            'caption': caption
        })
        print(f"Processed {row['query_id']}")
    except Exception as e:
        # Best effort: report the failing query and carry on with the next row.
        print(f"Error processing {row['query_id']}: {e}")
    finally:
        # The tensors were locals of _generate_caption and are unreachable by now;
        # collect them and release cached GPU memory after failed iterations as well.
        gc.collect()
        torch.cuda.empty_cache()

# Save results
# Fixed columns keep the CSV header present even when no caption was generated.
# NOTE: both halves write to the same 'captions.csv'; move or rename the file before
# running the other half in the same working directory.
results_df = pd.DataFrame(results, columns=['query_id', 'matched_image_id', 'caption'])
results_df.to_csv('captions.csv', index=False)
print(f"Generated {len(results)} captions")
