# In [None]:
# 📌 1. Install Dependencies
!pip install transformers datasets pillow torch torchvision tqdm -q

# 📌 2. Imports
import os
import random
from tqdm import tqdm
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments

# 📌 3. Download Flickr8k
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
!unzip -q Flickr8k_Dataset.zip
!unzip -q Flickr8k_text.zip

# 📌 4. Load Captions
# Each usable line is parsed as "<image file>#<n>\t<caption>".
# The text archive may unpack into ./Flickr8k_text/ or straight into the
# working directory depending on how it was extracted, so try both.
token_path = os.path.join('Flickr8k_text', 'Flickr8k.token.txt')
if not os.path.exists(token_path):
    token_path = 'Flickr8k.token.txt'

# NOTE(review): 'Flicker8k_Dataset' (sic) is presumably the folder name used
# inside the image archive — confirm against the extracted files.
image_dir = 'Flicker8k_Dataset'

captions = []
with open(token_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Split on the first tab only, and skip blank or malformed lines
        # instead of failing with IndexError.
        img_ref, sep, cap = line.strip().partition('\t')
        if not sep or not cap:
            continue
        img_path = os.path.join(image_dir, img_ref.split('#')[0])
        # Drop captions whose image is missing so Image.open cannot fail
        # later, in the middle of training.
        if not os.path.isfile(img_path):
            continue
        captions.append((img_path, cap))

if not captions:
    raise FileNotFoundError(
        f"No caption in {token_path!r} points at an existing image under {image_dir!r}"
    )

df = pd.DataFrame(captions, columns=['image_path', 'caption'])
# DataFrame.sample raises when asked for more rows than exist, so cap the
# sample size at the number of usable captions.
df = df.sample(min(3000, len(df)), random_state=42).reset_index(drop=True)

# 📌 5. Stylize Captions with NLP flair
EMOJIS = ["✨", "🌟", "💫", "🔥", "💖", "📸", "🌈", "🌸", "😎", "🧡"]
HASHES = ["#InstaVibes", "#Aesthetic", "#Mood", "#PhotoDump",
          "#Wanderlust", "#ChillScene", "#DailyInspo"]


def add_style(c, max_words=6, shuffle=False):
    """Turn a plain caption into a short "Instagram style" caption.

    The caption is cut down to at most ``max_words`` whitespace-separated
    words, capitalized with ``str.capitalize`` (first character upper-cased,
    the rest lower-cased) and followed by one random emoji and one random
    hashtag.

    Args:
        c: Plain caption text.
        max_words: Maximum number of caption words to keep.
        shuffle: When False (default) the leading words are kept in their
            original order, so the result is still a readable phrase the
            model can learn from. When True, words are drawn at random and
            in random order from the whole caption (the earlier behaviour).

    Returns:
        A string of the form "<snippet> <emoji> <hashtag>".
    """
    words = c.split()
    if shuffle:
        picked = random.sample(words, min(len(words), max_words))
    else:
        picked = words[:max_words]
    snippet = " ".join(picked).capitalize()
    return f"{snippet} {random.choice(EMOJIS)} {random.choice(HASHES)}"

# Replace every caption with its stylized version, persist the table for
# the training step, and print the first rows as a quick sanity check.
df['caption'] = df['caption'].map(add_style)
df.to_csv('stylized_captions.csv', index=False)
print("Stylized captions head:\n", df.head(n=5))

# 📌 6. Dataset Class
class InstaDataset(Dataset):
    """Pairs each image on disk with its caption and encodes both for BLIP.

    Args:
        df: Table with an 'image_path' and a 'caption' column.
        processor: Callable accepting ``images=`` / ``text=`` keyword
            arguments and returning a mapping of batched tensors that
            includes 'input_ids' and 'attention_mask'.
        max_length: Token length every caption is padded / truncated to.
    """

    def __init__(self, df, processor, max_length=128):
        self.df = df
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx`` as a dict of un-batched tensors."""
        row = self.df.iloc[idx]  # single row lookup for both columns
        with Image.open(row['image_path']) as img:
            image = img.convert('RGB')
        inputs = self.processor(images=image, text=row['caption'],
                                padding="max_length", truncation=True,
                                max_length=self.max_length,
                                return_tensors="pt")
        # The processor returns a batch of one; strip that leading dimension.
        item = {k: v.squeeze(0) for k, v in inputs.items()}
        # Labels are a *copy* of the token ids with padding positions set to
        # -100 (the default ignore_index of the cross-entropy loss), so the
        # model is not trained to predict padding and input_ids stay intact.
        labels = item['input_ids'].clone()
        labels[item['attention_mask'] == 0] = -100
        item['labels'] = labels
        return item

# 📌 7. Load Model & Processor
# The captioning model and its processor are loaded from the same
# pretrained checkpoint name.
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# 📌 8. Prepare DataLoader
# Read the stylized captions back from disk and wrap them in the dataset.
df2 = pd.read_csv('stylized_captions.csv')
dataset = InstaDataset(processor=processor, df=df2)

def collate_fn(batch):
    """Stack the per-sample tensors of ``batch`` into one tensor per model input.

    Only the four keys the model is fed are collated; any other key present
    in the samples is ignored.
    """
    fields = ('input_ids', 'attention_mask', 'pixel_values', 'labels')
    return {
        name: torch.stack([sample[name] for sample in batch])
        for name in fields
    }

# 📌 9. Training Setup
# Three epochs at batch size 2; output goes to ./blip_fancy with a save
# after every epoch, and reporting is set to "none".
training_args = TrainingArguments(
    report_to="none",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    output_dir="./blip_fancy",
)
# The custom collate function stacks the dataset's per-sample tensors.
trainer = Trainer(
    data_collator=collate_fn,
    train_dataset=dataset,
    args=training_args,
    model=model,
)

# 📌 10. Start Training
trainer.train()

# 📌 11. Save Model & Processor
# Both are written to the same folder.
model.save_pretrained("blip_fancy_model")
processor.save_pretrained("blip_fancy_model")

# 📌 12. Inference Test on New Image
from PIL import Image
import random

def make_fancy(c):
    """Wrap a plain caption in a random starter phrase, emoji and hashtag.

    Args:
        c: Caption text, e.g. the decoded output of the captioning model.
            May be empty.

    Returns:
        "<starter> <Caption> <emoji> <hashtag>" with the caption's first
        character upper-cased. An empty caption is simply left out rather
        than producing a double space.
    """
    # Lower-case local names: these are function-local choices and must not
    # be confused with the module-level EMOJIS list used for training data.
    emojis = ("✨", "🌟", "💫", "🔥", "💖")
    starters = ("Serving looks 😍 —", "Just vibes ✨", "Golden hour glory 🌅")
    enders = ("#mood", "#wanderlust", "#aesthetic")

    text = c.strip()
    # Slicing instead of indexing: text[0] would raise IndexError when the
    # model produced an empty caption.
    cap = text[:1].upper() + text[1:]
    parts = [random.choice(starters), cap, random.choice(emojis), random.choice(enders)]
    return " ".join(part for part in parts if part)

# The model may still be in training mode after trainer.train(); switch to
# eval mode so dropout does not perturb the generated caption.
model.eval()

# Load the test image and release the file handle as soon as it is decoded.
with Image.open("your_test_image.jpg") as img:
    test_img = img.convert('RGB')

# Encode the image only (no text prompt) on the model's device and generate.
inp = processor(images=test_img, return_tensors="pt").to(model.device)
out = model.generate(**inp)
plain = processor.decode(out[0], skip_special_tokens=True)
# This line previously carried a stray leading space, which made the whole
# cell fail with an IndentationError.
fancy = make_fancy(plain)
print("Plain:", plain)
print("Fancy:", fancy)


# In [None]:
# Install required packages (run once in a shell, or prefix with `!` in a notebook):
# pip install transformers datasets torch torchvision accelerate pandas pillow

from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from PIL import Image
import torch
import os

# Load the BLIP model and its processor from the same pretrained checkpoint
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Prefer CUDA when it is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Custom dataset: expects a CSV with the columns ['image_path', 'caption']
def load_custom_dataset(csv_path):
    """Read an image/caption CSV and return it as a `Dataset`.

    Every file listed in the 'image_path' column is opened and converted to
    RGB up front, so all images are held in memory when this returns.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A Dataset with an "image" column (RGB PIL images) and a "caption"
        column (the CSV's caption values, in file order).
    """
    import pandas as pd

    table = pd.read_csv(csv_path)

    images = []
    for path in table["image_path"]:
        images.append(Image.open(path).convert("RGB"))
    captions = table["caption"].tolist()

    return Dataset.from_dict({"image": images, "caption": captions})

# Preprocess dataset
def preprocess_function(examples, max_length=128):
    """Encode one image/caption example into model inputs and labels.

    Meant for non-batched use (one example per call): only element 0 of the
    processor output is kept.

    Args:
        examples: Mapping with an "image" and a "caption" entry.
        max_length: Token length the caption is padded / truncated to. Given
            explicitly so padding="max_length" does not depend on the
            tokenizer's own maximum length.

    Returns:
        Dict with "pixel_values", "input_ids", "attention_mask" and "labels".
        "labels" is a copy of "input_ids" in which padding positions are set
        to -100 (the default ignore_index of the cross-entropy loss) so that
        padding does not contribute to the training loss.
    """
    inputs = processor(
        images=examples["image"],
        text=examples["caption"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"][0]
    attention_mask = inputs["attention_mask"][0]

    # Clone before masking so input_ids keep their real padding token ids.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return {
        "pixel_values": inputs["pixel_values"][0],
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Collator used below; imported here so this cell section is self-contained.
from transformers import default_data_collator

# Load and preprocess dataset
dataset = load_custom_dataset("fancy_captions.csv")
# Drop the raw "image"/"caption" columns once they are encoded: with
# remove_unused_columns=False they would otherwise be handed to the
# collator, which cannot turn PIL images or strings into tensors.
processed_dataset = dataset.map(preprocess_function, remove_columns=dataset.column_names)

# evaluation_strategy="epoch" needs data to evaluate on, so hold out 10%.
# NOTE: train_test_split needs at least two rows to produce both parts.
split_dataset = processed_dataset.train_test_split(test_size=0.1, seed=42)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — adjust to the installed version if needed.
training_args = TrainingArguments(
    output_dir="./blip-finetuned-instagram",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_steps=10,
    learning_rate=5e-5,
    # Mixed precision is only usable on a GPU; the model above falls back to
    # the CPU when CUDA is missing, so fp16 must follow the same condition.
    fp16=torch.cuda.is_available(),
    # Only the loss is needed per epoch; this avoids collecting the full
    # logits of every evaluation sample in memory.
    prediction_loss_only=True,
    remove_unused_columns=False
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    # Rows are already padded to a fixed length, so plain stacking is all
    # that is needed. Passing the collator explicitly also keeps Trainer
    # from choosing a tokenizer-padding collator for the processor, which
    # some transformers versions do when `tokenizer` is given.
    data_collator=default_data_collator,
    tokenizer=processor,
)

# Train the model
trainer.train()

# Save fine-tuned model
model.save_pretrained("./blip-finetuned-instagram")
processor.save_pretrained("./blip-finetuned-instagram")
