In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
cuda_ready = torch.cuda.is_available()
print("GPU is enabled!" if cuda_ready else "No GPU found. Please check settings.")
if cuda_ready:
    print(f"Using device: {torch.cuda.get_device_name(0)}")


In [None]:
!pip install transformers accelerate


In [None]:
!pip install -U accelerate


In [None]:
!pip install -U bitsandbytes


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch


# Restrict PyTorch to a single GPU (optional).
# NOTE: this variable is only honoured if CUDA has not been initialised yet in
# this kernel (an earlier call such as torch.cuda.get_device_name() already
# initialises it), so the model is additionally pinned to GPU 0 via device_map.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU 0 only

# Never hardcode access tokens in a notebook: read the Hugging Face token from
# the environment instead. Any token that was previously committed in this
# cell must be considered leaked and should be revoked.
# When HF_TOKEN is unset, None is passed and the library falls back to a
# locally stored login, if there is one.
hf_token = os.environ.get("HF_TOKEN")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type="nf4",  # Type of quantization
    bnb_4bit_use_double_quant=True,  # double quantization
    # Run the matmuls in fp16; the float32 default is noticeably slower.
    bnb_4bit_compute_dtype=torch.float16,
)

# Model and tokenizer setup
model_name = "meta-llama/Llama-2-7b-hf"
# `token` replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=hf_token,
    device_map={"": 0},  # place the whole model on GPU 0
)

# The quantized model is already placed by device_map. Calling model.to(...)
# on a bitsandbytes-quantized model is unnecessary and can raise an error, so
# only record the device for the cells that move inputs onto it.
device = model.device

print("Model loaded successfully with 4-bit quantization")



In [None]:
# Function to create a prompt
def create_prompt(data):
    """Build the caption-generation prompt for one image description.

    Args:
        data: Mapping that may contain the keys ``"Detected objects"`` and
            ``"Scene context"`` (a list of strings, or a single string) and
            ``"OCR text"`` (a string). A field that is missing, ``None`` or
            empty is rendered as ``None`` in the prompt rather than raising
            or leaving a blank bullet.

    Returns:
        The prompt text, which ends with a ``Caption:`` cue.
    """
    def _joined(key):
        # Render one list-valued field as a comma-separated string.
        value = data.get(key)
        if isinstance(value, str):
            # A bare string would otherwise be joined character by character.
            value = [value]
        return ', '.join(value or []) or 'None'

    detected_objects = _joined('Detected objects')
    scene_context = _joined('Scene context')
    ocr_text = data.get('OCR text') or 'None'

    # The template (including its leading indentation) is the text sent to the
    # model and is intentionally kept unchanged.
    prompt = f"""
    You are a caption generation model. Based on the details provided below, generate a concise and accurate caption for the image. Only describe what is mentioned in the details. Avoid any additional or imagined information.

    Details:
    - Detected objects: {detected_objects}
    - Scene context: {scene_context}
    - OCR text: {ocr_text}

    Caption:
    """
    return prompt

# Example inputs for testing
# Three hand-written samples: (detected objects, scene context, OCR text).
example_inputs = [
    {"Detected objects": objects, "Scene context": scene, "OCR text": ocr_text}
    for objects, scene, ocr_text in (
        (["dog", "ball"], ["outdoor", "playing"], ""),
        (["car", "person"], ["urban", "traffic"], "Speed Limit 60"),
        (["cake", "table"], ["indoor", "celebration"], "Happy Birthday"),
    )
]

# Function to process inputs and generate captions
def test_pipeline(inputs):
    results = []
    for idx, data in enumerate(inputs):
        # Create prompt
        prompt = create_prompt(data)
        print(f"Prompt for Example {idx + 1}:\n{prompt}\n{'-'*50}")  # Debug prompt

        # Tokenize prompt and move to the same device as the model
        input_ids = tokenizer(prompt, return_tensors="pt").to(device)
        
        # Generate caption
        output_ids = model.generate(
            input_ids["input_ids"],
            max_new_tokens=100,  # Ensure enough tokens are generated
            temperature=0.7
        )
        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        
        # Extract the actual caption
        caption_start = generated_text.find("Caption:") + len("Caption:")
        caption = generated_text[caption_start:].strip() if caption_start > 0 else generated_text.strip()
        
        # Save results
        results.append({"Input": data, "Caption": caption})
        
        # Print results for quick inspection
        print(f"Example {idx + 1}:\nInput: {data}\nGenerated Caption: {caption}\n{'-'*50}")
    return results

# Run the test
# Generate captions for the example inputs.
results = test_pipeline(example_inputs)

# Write one row per example to a CSV file in the working directory.
csv_path = "generated_captions.csv"
df = pd.DataFrame(data=results)
df.to_csv(csv_path, index=False)
print(f"Results saved to '{csv_path}'")