In [None]:
import kagglehub
import os
import pandas as pd
import glob
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor
import torch
from transformers import ViTForImageClassification
from transformers import ViTModel
from transformers import BlipProcessor, BlipForConditionalGeneration
import concurrent.futures
import csv
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import pipeline, MarianMTModel, MarianTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Fetch the Flickr image dataset through kagglehub; the call returns the local directory.
path = kagglehub.dataset_download("hsankesara/flickr-image-dataset")
print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/flickr-image-dataset


In [None]:
# Reuse the directory returned by kagglehub.dataset_download() instead of repeating
# it as a literal, so the notebook keeps working when the dataset lands elsewhere.
main_dir = path

In [None]:
# List the direct children of the dataset's top-level "flickr30k_images" directory.
flickr30k_images_folder = os.path.join(main_dir, 'flickr30k_images')
image_files = glob.glob(flickr30k_images_folder + '/*')
print(image_files[0:10])

['/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images', '/kaggle/input/flickr-image-dataset/flickr30k_images/results.csv']


In [None]:
# Load the pipe-delimited captions file if it is present; otherwise report the missing path.
csv_path = os.path.join(main_dir, 'flickr30k_images', 'results.csv')
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path, sep='|')
    print(df.head())
else:
    print(f"File not found: {csv_path}")

       image_name  comment_number  \
0  1000092795.jpg               0   
1  1000092795.jpg               1   
2  1000092795.jpg               2   
3  1000092795.jpg               3   
4  1000092795.jpg               4   

                                             comment  
0   Two young guys with shaggy hair look at their...  
1   Two young , White males are outside near many...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blue shirt standing in a garden .  
4            Two friends enjoy time spent together .  


In [None]:
# Unguarded variant of the captions load: read the '|'-separated file and preview five rows.
csv_path = os.path.join(main_dir, 'flickr30k_images', 'results.csv')
df = pd.read_csv(csv_path, sep='|')
print(df.head(5))

       image_name  comment_number  \
0  1000092795.jpg               0   
1  1000092795.jpg               1   
2  1000092795.jpg               2   
3  1000092795.jpg               3   
4  1000092795.jpg               4   

                                             comment  
0   Two young guys with shaggy hair look at their...  
1   Two young , White males are outside near many...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blue shirt standing in a garden .  
4            Two friends enjoy time spent together .  


In [None]:
# Directory that holds the image files themselves (nested one level below the CSV).
image_folder = "/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images"
image_files = os.listdir(image_folder)
# Peek at the first ten file names.
print(image_files[0:10])

['2715746315.jpg', '3463034205.jpg', '268704620.jpg', '2673564214.jpg', '7535037918.jpg', '4912369161.jpg', '4828071602.jpg', '6802728196.jpg', '3346289227.jpg', '3217056901.jpg']


In [None]:
image_folder = "/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images"

# Take the first five directory entries and draw them side by side, one axis per image.
image_files = os.listdir(image_folder)[:5]
fig, axes = plt.subplots(1, len(image_files), figsize=(15, 5))

for ax, img_file in zip(axes, image_files):
    img = Image.open(os.path.join(image_folder, img_file))
    ax.imshow(img)
    ax.axis("off")
    ax.set_title(img_file)  # the file name doubles as the panel title

plt.show()

In [None]:
image_size = 224

# Preprocessing pipeline: square resize -> tensor -> per-channel normalisation.
transform = transforms.Compose(
    [
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


def preprocess_image(image_path):
    """Open ``image_path``, convert it to RGB and run it through the module-level ``transform``.

    Returns whatever ``transform`` produces for the converted image.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    tensor = transform(rgb_image)
    return tensor


# Smoke test on a single known image.
example_img_path = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg'
image_tensor = preprocess_image(example_img_path)
print(image_tensor.shape)

torch.Size([3, 224, 224])


In [None]:
# AutoTokenizer is already imported in the notebook's import cell, so it is not re-imported here.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Register a padding token only when the checkpoint does not define one. The former
# unconditional `tokenizer.pad_token = '[PAD]'` that followed this guard was redundant:
# either the token already exists or add_special_tokens() has just set it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize one sample caption and show the resulting token ids.
caption = "Two young guys with shaggy hair look at their hands while hanging out in the yard."
tokenized_caption = tokenizer(caption, return_tensors="pt", padding=True, truncation=True)
print(tokenized_caption['input_ids'])

tensor([[  101,  2048,  2402,  4364,  2007, 25741,  2606,  2298,  2012,  2037,
          2398,  2096,  5689,  2041,  1999,  1996,  4220,  1012,   102]])


In [None]:
model_name = "google/vit-base-patch16-224-in21k"
model = ViTForImageClassification.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
model.eval()

# Add the batch dimension under a NEW name. Rebinding `image_tensor` in place made the
# cell non-idempotent: every re-run stacked another leading dimension onto the tensor.
image_batch = image_tensor.unsqueeze(0)

with torch.no_grad():
    outputs = model(image_batch)

# NOTE: the loading warning reports that `classifier.weight` / `classifier.bias` are newly
# initialized for this checkpoint, so the class index and probability printed below come
# from an untrained head and only demonstrate the call sequence.
# NOTE(review): `image_tensor` was normalised with the mean/std used by `transform`;
# confirm these match what this checkpoint expects (see `feature_extractor`).
logits = outputs.logits
probabilities = torch.nn.functional.softmax(logits, dim=-1)
predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
predicted_class_prob = probabilities[0, predicted_class_idx].item()

print(f"Predicted class index: {predicted_class_idx}")
print(f"Prediction probability: {predicted_class_prob}")


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted class index: 1
Prediction probability: 0.5312654376029968


In [None]:
model_name = "google/vit-base-patch16-224-in21k"
model = ViTForImageClassification.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)


def extract_features(image_path):
    """Return the classification logits the module-level ``model`` yields for one image.

    The file is opened, converted to RGB, encoded with ``feature_extractor`` and
    passed through ``model`` with gradient tracking disabled.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = feature_extractor(rgb_image, return_tensors="pt")

    with torch.no_grad():
        result = model(**encoded)

    return result.logits


# Run the extractor once on the sample image and report the output shape.
example_img_path = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg'
image_features = extract_features(example_img_path)
print(image_features.shape)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 2])


In [None]:
model_name = "google/vit-base-patch16-224-in21k"
model = ViTModel.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)


def extract_features(image_path):
    """Return ``last_hidden_state`` of the module-level ViT ``model`` for one image.

    The file is opened, converted to RGB, encoded with ``feature_extractor`` and
    passed through ``model`` with gradient tracking disabled.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = feature_extractor(rgb_image, return_tensors="pt")

    with torch.no_grad():
        result = model(**encoded)

    return result.last_hidden_state


# Run the extractor once on the sample image and report the output shape.
example_img_path = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg'
image_features = extract_features(example_img_path)
print(image_features.shape)

torch.Size([1, 197, 768])


In [None]:
# Keep only position 0 along the second axis of the hidden states (197 positions -> 1).
image_features_cls = image_features[:, 0]
print(image_features_cls.shape)

torch.Size([1, 768])


In [None]:
# NOTE: this rebinds the global `model`, which earlier cells used for the ViT model.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

def generate_caption(image_features_cls=None, image_path=None):
    """Generate a caption for an image file with the module-level BLIP model.

    Args:
        image_features_cls: Accepted for backward compatibility only and NOT used;
            the caption is produced from the image file, not from these features.
        image_path: Path of the image to caption. Defaults to the notebook's
            ``example_img_path`` so existing calls keep their behaviour.

    Returns:
        The decoded caption string with special tokens removed.
    """
    if image_path is None:
        # Previous behaviour: always caption the global example image.
        image_path = example_img_path

    raw_image = Image.open(image_path).convert("RGB")
    inputs = processor(raw_image, return_tensors="pt")

    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)

caption = generate_caption(image_features_cls)
print("Generated Caption:", caption)

Generated Caption: a man standing in the grass


In [None]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


def generate_caption(image_path):
    """Caption the image stored at ``image_path``.

    Returns:
        A ``(image_path, caption)`` tuple so the caption stays paired with its source file.
    """
    pil_image = Image.open(image_path).convert("RGB")
    encoded = processor(pil_image, return_tensors="pt")
    token_ids = model.generate(**encoded)
    text = processor.decode(token_ids[0], skip_special_tokens=True)
    return (image_path, text)

# Batch settings: where the images live, which files exist, and how many to caption.
num_images_to_process = 50
image_folder = "/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images"
image_files = os.listdir(image_folder)
image_caption_pairs = []


def generate_caption_for_image(img_file, image_folder):
    """Join ``image_folder`` and ``img_file`` and pass the resulting path to ``generate_caption``."""
    return generate_caption(os.path.join(image_folder, img_file))

def process_images_in_parallel(image_files, image_folder, num_images_to_process):
    """Caption the first ``num_images_to_process`` files of ``image_files`` concurrently.

    A thread pool is used instead of a process pool: worker processes must be able
    to import the functions they run, which is not guaranteed for functions defined
    in an interactive session, and it only worked here where workers are forked.
    Threads also share the already loaded model instead of each worker needing its
    own copy.

    Args:
        image_files: File names relative to ``image_folder``.
        image_folder: Directory that contains the files.
        num_images_to_process: How many leading entries of ``image_files`` to caption.

    Returns:
        A list with one ``generate_caption_for_image`` result per processed file,
        in the same order as the input names.
    """
    image_files_to_process = image_files[:num_images_to_process]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so results line up with the file names.
        results = list(
            executor.map(
                lambda img_file: generate_caption_for_image(img_file, image_folder),
                image_files_to_process,
            )
        )

    return results

# Caption the selected images, then echo the first five (path, caption) results.
image_caption_pairs = process_images_in_parallel(
    image_files=image_files,
    image_folder=image_folder,
    num_images_to_process=num_images_to_process,
)

for img_path, caption in image_caption_pairs[0:5]:
    print(f"Image: {img_path}\nCaption: {caption}\n")

Image: /kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/2715746315.jpg
Caption: man wearing a hat

Image: /kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/3463034205.jpg
Caption: a skate park

Image: /kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/268704620.jpg
Caption: two dogs running in the snow

Image: /kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/2673564214.jpg
Caption: a blue and green tent

Image: /kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/7535037918.jpg
Caption: a man with a beard



In [None]:
# Persist the (path, caption) pairs as a two-column CSV with a header row.
output_file = 'image_captions.csv'
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Image Path', 'Caption'])
    writer.writerows([img_path, caption] for img_path, caption in image_caption_pairs)

In [None]:
# Image pipeline for the dataset: fixed 224x224 resize, tensor conversion, normalisation.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)
# Tokenizer used to turn captions into token ids.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

class ImageCaptionDataset(Dataset):
    """Dataset yielding ``(image, caption_token_ids)`` pairs.

    Args:
        image_caption_pairs: Sequence of ``(image_path, caption)`` tuples.
        transform: Optional callable applied to the RGB image after loading.
        tokenizer: Callable used to tokenize each caption; required for item access.
        max_length: Optional length every caption is padded/truncated to. ``None``
            (the default) keeps the previous behaviour of letting the tokenizer
            decide, which produced 512-token rows in this notebook.
    """

    def __init__(self, image_caption_pairs, transform=None, tokenizer=None, max_length=None):
        self.image_caption_pairs = image_caption_pairs
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (image, caption) pairs."""
        return len(self.image_caption_pairs)

    def __getitem__(self, idx):
        """Load, transform and tokenize the pair at ``idx``.

        Returns:
            ``(img, input_ids)``: the (optionally transformed) image and the caption's
            token ids with the leading batch dimension squeezed away.

        Raises:
            TypeError: If the dataset was built without a tokenizer.
        """
        if self.tokenizer is None:
            # Same exception type as calling None would raise, but with a usable
            # message and before any file I/O happens.
            raise TypeError("ImageCaptionDataset requires a tokenizer to tokenize captions")

        img_path, caption = self.image_caption_pairs[idx]

        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)

        caption_tokenized = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return img, caption_tokenized.input_ids.squeeze(0)

# Wrap the captioned images in a shuffled DataLoader and inspect the first batch only.
dataset = ImageCaptionDataset(
    image_caption_pairs,
    transform=transform,
    tokenizer=tokenizer,
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=16)

for images, captions in dataloader:
    print(f"Images batch shape: {images.shape}")
    print(f"Captions batch shape: {captions.shape}")
    break  # one batch is enough to confirm the shapes


Images batch shape: torch.Size([16, 3, 224, 224])
Captions batch shape: torch.Size([16, 512])


In [None]:
# Read back the file written earlier via `output_file` rather than a hard-coded
# '/content/...' path, which only exists when the working directory is /content.
dfile = pd.read_csv(output_file)
dfile.head(5)

Unnamed: 0,Image Path,Caption
0,/kaggle/input/flickr-image-dataset/flickr30k_i...,man wearing a hat
1,/kaggle/input/flickr-image-dataset/flickr30k_i...,a skate park
2,/kaggle/input/flickr-image-dataset/flickr30k_i...,two dogs running in the snow
3,/kaggle/input/flickr-image-dataset/flickr30k_i...,a blue and green tent
4,/kaggle/input/flickr-image-dataset/flickr30k_i...,a man with a beard


In [None]:
!pip install gradio



In [None]:
import gradio as gr

# Captioning model shared by every feature of the demo app.
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Display name of each target language -> translation checkpoint used for it.
languages = dict(
    [
        ("French", "Helsinki-NLP/opus-mt-en-fr"),
        ("Spanish", "Helsinki-NLP/opus-mt-en-es"),
        ("Hindi", "Helsinki-NLP/opus-mt-en-hi"),
        ("Mandarin", "Helsinki-NLP/opus-mt-en-zh"),
    ]
)

# Text-generation and extractive question-answering pipelines.
story_generator = pipeline(task="text-generation", model="gpt2")
qa_pipeline = pipeline(task="question-answering", model="deepset/roberta-base-squad2")


def generate_caption(image):
    """Return the caption that ``caption_model`` generates for ``image``.

    ``image`` is passed to ``caption_processor`` as-is; the generated token ids
    are decoded with special tokens skipped.
    """
    model_inputs = caption_processor(image, return_tensors="pt")
    generated_ids = caption_model.generate(**model_inputs)
    return caption_processor.decode(generated_ids[0], skip_special_tokens=True)


# Loaded (model, tokenizer) pairs keyed by checkpoint name, so each translation
# model is loaded once instead of on every request.
_translation_models = {}


def translate_caption(caption, target_lang):
    """Translate an English ``caption`` into ``target_lang``.

    Args:
        caption: Text to translate.
        target_lang: Key of the module-level ``languages`` mapping.

    Returns:
        The translated text; the unchanged ``caption`` when ``target_lang`` has no
        entry in ``languages``; or a ``"Translation error: ..."`` message when
        loading the model or translating fails.
    """
    model_name = languages.get(target_lang)
    if model_name is None:
        return caption

    try:
        # Loading now happens inside the try block so that load failures are
        # reported the same way as failures during generation.
        if model_name not in _translation_models:
            _translation_models[model_name] = (
                MarianMTModel.from_pretrained(model_name),
                MarianTokenizer.from_pretrained(model_name),
            )
        model, tokenizer = _translation_models[model_name]

        translated = model.generate(**tokenizer(caption, return_tensors="pt"))
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        return f"Translation error: {str(e)}"


def multilingual_caption(image, language):
    """Caption ``image`` in English and, if requested, in a second language.

    Args:
        image: Image passed on to ``generate_caption``.
        language: Target language name. ``None`` or an empty value is treated as
            ``"English"``; previously ``None`` crashed on ``language.capitalize()``.

    Returns:
        A two-line string: the English caption, then the caption in ``language``.
    """
    # The language can be missing when no dropdown entry was picked
    # (the dropdown in this notebook is created without an initial value).
    language = language or "English"

    english_caption = generate_caption(image)

    if language != 'English':
        translated_caption = translate_caption(english_caption, language)
    else:
        translated_caption = english_caption

    return f"English: {english_caption}\n{language.capitalize()}: {translated_caption}"


def generate_story(image):
    """Caption ``image`` and return the text ``story_generator`` produces for a story prompt built from it."""
    caption = generate_caption(image)
    story_prompt = f"Based on the following scene: '{caption}', tell me a detailed, creative, and coherent story with a beginning, middle, and end."
    # Sampling is enabled, so the generated text can differ between calls.
    generations = story_generator(story_prompt, max_length=200, do_sample=True)
    first_generation = generations[0]
    return first_generation['generated_text']


def answer_question(image, question):
    """Answer ``question`` using the generated caption of ``image`` as the only context."""
    context = generate_caption(image)
    result = qa_pipeline({"question": question, "context": context})
    return result['answer']


def process_image(img, option, language="English", question=""):
    """Dispatch one request of the demo app to the selected feature.

    Args:
        img: Image to process; ``None`` is rejected with a message.
        option: One of ``"Storytelling"``, ``"Multilingual Caption"`` or ``"Q&A"``.
        language: Target language for ``"Multilingual Caption"``; a missing value
            falls back to ``"English"``.
        question: Question text for ``"Q&A"``.

    Returns:
        The feature's text output, or a short message describing what input is missing.
    """
    if option not in ("Storytelling", "Multilingual Caption", "Q&A"):
        return "Invalid option."

    # Every feature starts by captioning the image, so reject a missing image up
    # front instead of failing inside the captioning model.
    if img is None:
        return "Please upload an image."

    if option == "Storytelling":
        return generate_story(img)

    if option == "Multilingual Caption":
        return multilingual_caption(img, language or "English")

    # Remaining option: "Q&A". Treat None and whitespace-only input as "no question".
    if not question or not question.strip():
        return "Please enter a question."
    return answer_question(img, question)


Device set to use cpu
Device set to use cpu


In [None]:
# Gradio UI: one image input, a feature selector, two feature-specific inputs that
# start hidden, an output box, and a button that sends everything to process_image.
with gr.Blocks() as iface:
    gr.Markdown("## AI-Powered Image Processing")

    with gr.Row():
        # type="pil" is requested so the handler receives the upload as a PIL image.
        image_input = gr.Image(type="pil", label="Upload an Image")

    with gr.Row():
        # These choice strings must match the option names compared in process_image.
        option_dropdown = gr.Dropdown(
            choices=["Storytelling", "Multilingual Caption", "Q&A"],
            label="Choose an Option"
        )

    with gr.Row():
        # Hidden until "Multilingual Caption" is selected. No initial `value` is set here.
        # NOTE(review): presumably the handler receives None until a language is picked — confirm.
        language_input = gr.Dropdown(
            choices=["English", "French", "Spanish", "Hindi", "Mandarin"],
            label="Choose a Language for Translation",
            visible=False
        )

        # Hidden until "Q&A" is selected.
        question_input = gr.Textbox(label="Enter a question (For Q&A only)", visible=False)

    output_display = gr.Textbox(label="Output", interactive=False)

    def show_language_box(option):
        """Return an update that shows the language dropdown only for "Multilingual Caption"."""
        return gr.update(visible=(option == "Multilingual Caption"))

    def show_question_box(option):
        """Return an update that shows the question box only for "Q&A"."""
        return gr.update(visible=(option == "Q&A"))

    # Two change listeners on the same dropdown, one per optional input.
    option_dropdown.change(show_language_box, inputs=[option_dropdown], outputs=[language_input])
    option_dropdown.change(show_question_box, inputs=[option_dropdown], outputs=[question_input])

    process_button = gr.Button("Process")
    # The order of `inputs` matches process_image's positional parameters:
    # (img, option, language, question).
    process_button.click(
        process_image,
        inputs=[image_input, option_dropdown, language_input, question_input],
        outputs=output_display
    )

# share=True makes the app reachable through a public URL (see the cell output).
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://be4f74754d43c5df31.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
reference_captions = [
    "A man and a boy are standing before the sky",
    "A girl standing in front of a poster banner"
]

generated_captions = [
    "A young girl looking at the sky staring at a star",
    "A young girl standing before a banner."
]


def _caption_tokens(text):
    """Return the set of lower-cased words in ``text`` with surrounding punctuation removed.

    Without the stripping, a trailing period makes "banner." a different token
    from "banner" and the match is lost.
    """
    stripped = (word.strip(".,;:!?\"'()") for word in text.lower().split())
    return {word for word in stripped if word}


ref_tokens = [_caption_tokens(ref) for ref in reference_captions]
gen_tokens = [_caption_tokens(gen) for gen in generated_captions]

# Fit the vocabulary on references AND generated captions. Fitting on the references
# alone makes transform() drop every generated word that no reference contains, so
# wrong words never count as false positives and precision is overstated.
mlb = MultiLabelBinarizer()
mlb.fit(ref_tokens + gen_tokens)
y_true = mlb.transform(ref_tokens)
y_pred = mlb.transform(gen_tokens)

# average='samples' scores each caption pair separately and averages the results.
precision = precision_score(y_true, y_pred, average='samples')
recall = recall_score(y_true, y_pred, average='samples')
f1 = f1_score(y_true, y_pred, average='samples')

print(f"\nPrecision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Precision: 0.7500
Recall:    0.3542
F1 Score:  0.4808


