In [1]:
# Step 1: Install required packages
!pip install transformers datasets torchvision tqdm --quiet



[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    AdamW, get_linear_schedule_with_warmup
)
from datasets import load_dataset
from PIL import Image
import requests
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
class ImageCaptionDataset(Dataset):
    """Fixed Dataset class with proper tokenization"""
    
    def __init__(self, data, processor, max_length=128):
        self.data = data
        self.processor = processor
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        try:
            # Load image
            if isinstance(item['image'], str) and item['image'].startswith('http'):
                response = requests.get(item['image'], stream=True, timeout=10)
                response.raise_for_status()
                image = Image.open(response.raw)
            else:
                image = item['image'] if hasattr(item['image'], 'convert') else Image.open(item['image'])
            
            image = image.convert('RGB')
            
            # Process image
            pixel_values = self.processor(image, return_tensors="pt")["pixel_values"].squeeze(0)
            
            # Process text - FIXED: Proper tokenization
            text = item['text']
            
            # Tokenize text with proper parameters
            text_inputs = self.processor.tokenizer(
                text,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            
            return {
                "pixel_values": pixel_values,
                "input_ids": text_inputs["input_ids"].squeeze(0),  # Remove batch dimension
                "attention_mask": text_inputs["attention_mask"].squeeze(0)  # Remove batch dimension
            }
            
        except Exception as e:
            print(f"Error processing item {idx}: {e}")
            # Return a dummy sample to avoid crashing
            dummy_image = Image.new('RGB', (224, 224), color='white')
            pixel_values = self.processor(dummy_image, return_tensors="pt")["pixel_values"].squeeze(0)
            
            dummy_text = "a photo"
            text_inputs = self.processor.tokenizer(
                dummy_text,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            
            return {
                "pixel_values": pixel_values,
                "input_ids": text_inputs["input_ids"].squeeze(0),
                "attention_mask": text_inputs["attention_mask"].squeeze(0)
            }


In [4]:
class ImageCaptionGenerator:
    """Fixed Image Caption Generator with proper training"""
    
    def __init__(self, model_name="Salesforce/blip-image-captioning-base"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Load processor and model
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)
        
        print(f"Model loaded: {model_name}")
    
    def load_dataset(self, dataset_name="nlphuji/flickr30k", split="test", sample_size=100):
        """Load and prepare dataset - reduced sample size for stability"""
        print(f"Loading dataset: {dataset_name}")
        
        try:
            # Load dataset
            dataset = load_dataset(dataset_name, split=split)
            
            # Sample smaller subset for stability
            if len(dataset) > sample_size:
                indices = np.random.choice(len(dataset), sample_size, replace=False)
                dataset = dataset.select(indices)
            
            # Convert to list for easier manipulation
            data = []
            for item in tqdm(dataset, desc="Processing dataset"):
                try:
                    # Handle different dataset formats
                    if 'caption' in item:
                        caption = item['caption']
                    elif 'text' in item:
                        caption = item['text']
                    elif 'captions' in item:
                        # Take first caption if multiple exist
                        caption = item['captions'][0] if isinstance(item['captions'], list) else item['captions']
                    else:
                        caption = "A photo"
                    
                    # Clean caption text - IMPORTANT FIX
                    if isinstance(caption, list):
                        caption = caption[0]
                    caption = str(caption).strip()
                    
                    if len(caption) > 0:  # Only add if caption is not empty
                        data.append({
                            'image': item['image'],
                            'text': caption
                        })
                        
                except Exception as e:
                    print(f"Skipping problematic item: {e}")
                    continue
            
            print(f"Dataset loaded with {len(data)} valid samples")
            return data
            
        except Exception as e:
            print(f"Error loading dataset {dataset_name}: {e}")
            print("Creating sample data for demonstration...")
            
            # Create sample data if dataset loading fails
            sample_urls = [
                "https://images.unsplash.com/photo-1518717758536-85ae29035b6d?w=400",
                "https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=400", 
                "https://images.unsplash.com/photo-1571019613454-1cb2f99b2d8b?w=400",
                "https://images.unsplash.com/photo-1552053831-71594a27632d?w=400",
                "https://images.unsplash.com/photo-1574158622682-e40e69881006?w=400"
            ]
            
            sample_captions = [
                "A dog sitting on grass",
                "Mountain landscape with snow",
                "City skyline at night",
                "A cute puppy playing",
                "Beautiful nature scene"
            ]
            
            data = []
            for url, caption in zip(sample_urls, sample_captions):
                data.append({'image': url, 'text': caption})
            
            print(f"Created sample dataset with {len(data)} samples")
            return data
    
    def split_data(self, data, train_ratio=0.8, random_state=42):
        """Split data into train and test sets"""
        train_data, test_data = train_test_split(
            data, 
            train_size=train_ratio, 
            random_state=random_state,
            shuffle=True
        )
        
        print(f"Train samples: {len(train_data)}")
        print(f"Test samples: {len(test_data)}")
        
        return train_data, test_data
    
    def create_dataloaders(self, train_data, test_data, batch_size=2):
        """Create data loaders with smaller batch size for stability"""
        train_dataset = ImageCaptionDataset(train_data, self.processor)
        test_dataset = ImageCaptionDataset(test_data, self.processor)
        
        train_loader = DataLoader(
            train_dataset, 
            batch_size=batch_size, 
            shuffle=True,
            num_workers=0,  # Set to 0 to avoid multiprocessing issues
            pin_memory=False,  # Disable for stability
            drop_last=True  # Drop incomplete batches
        )
        
        test_loader = DataLoader(
            test_dataset, 
            batch_size=batch_size, 
            shuffle=False,
            num_workers=0,
            pin_memory=False,
            drop_last=True
        )
        
        return train_loader, test_loader
    
    def train(self, train_loader, test_loader, epochs=2, lr=5e-5):
        """Fixed training function with proper error handling"""
        print("Starting training...")
        
        # Put model in training mode
        self.model.train()
        
        # Optimizer and scheduler
        optimizer = AdamW(self.model.parameters(), lr=lr, weight_decay=0.01)
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps
        )
        
        # Training loop
        train_losses = []
        
        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            successful_batches = 0
            
            progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')
            
            for batch_idx, batch in enumerate(progress_bar):
                try:
                    # Move to device
                    pixel_values = batch['pixel_values'].to(self.device)
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    
                    # Debug: Print shapes
                    if batch_idx == 0:
                        print(f"Pixel values shape: {pixel_values.shape}")
                        print(f"Input IDs shape: {input_ids.shape}")
                        print(f"Attention mask shape: {attention_mask.shape}")
                    
                    # Forward pass - FIXED: Use proper parameters for BLIP
                    outputs = self.model(
                        pixel_values=pixel_values,
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=input_ids  # Use input_ids as labels for generation
                    )
                    
                    loss = outputs.loss
                    
                    if torch.isnan(loss):
                        print(f"NaN loss detected at batch {batch_idx}, skipping...")
                        continue
                    
                    total_loss += loss.item()
                    successful_batches += 1
                    
                    # Backward pass
                    optimizer.zero_grad()
                    loss.backward()
                    
                    # Gradient clipping for stability
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    
                    optimizer.step()
                    scheduler.step()
                    
                    # Update progress bar
                    progress_bar.set_postfix({
                        'loss': loss.item(),
                        'avg_loss': total_loss / max(successful_batches, 1)
                    })
                    
                except Exception as e:
                    print(f"Error in batch {batch_idx}: {e}")
                    continue
            
            if successful_batches > 0:
                avg_loss = total_loss / successful_batches
                train_losses.append(avg_loss)
                print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f} ({successful_batches} successful batches)")
            else:
                print(f"Epoch {epoch+1} - No successful batches!")
                
            # Evaluate on test set
            if test_loader and successful_batches > 0:
                try:
                    test_loss = self.evaluate(test_loader)
                    print(f"Epoch {epoch+1} - Test Loss: {test_loss:.4f}")
                except Exception as e:
                    print(f"Evaluation error: {e}")
        
        return train_losses
    
    def evaluate(self, test_loader):
        """Evaluate the model"""
        self.model.eval()
        total_loss = 0
        successful_batches = 0
        
        with torch.no_grad():
            for batch in tqdm(test_loader, desc='Evaluating'):
                try:
                    pixel_values = batch['pixel_values'].to(self.device)
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    
                    outputs = self.model(
                        pixel_values=pixel_values,
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=input_ids
                    )
                    
                    if not torch.isnan(outputs.loss):
                        total_loss += outputs.loss.item()
                        successful_batches += 1
                        
                except Exception as e:
                    print(f"Evaluation batch error: {e}")
                    continue
        
        return total_loss / max(successful_batches, 1)
    
    def generate_caption(self, image_path_or_url, max_length=50):
        """Generate caption for a single image"""
        self.model.eval()
        
        try:
            # Load image
            if isinstance(image_path_or_url, str) and image_path_or_url.startswith('http'):
                response = requests.get(image_path_or_url, stream=True, timeout=10)
                response.raise_for_status()
                image = Image.open(response.raw)
            else:
                image = Image.open(image_path_or_url)
            
            image = image.convert('RGB')
            
            # Process image
            inputs = self.processor(image, return_tensors="pt").to(self.device)
            
            # Generate caption
            with torch.no_grad():
                generated_ids = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    num_beams=4,
                    early_stopping=True,
                    do_sample=False,
                    temperature=1.0
                )
            
            # Decode caption
            caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)
            
            return caption
            
        except Exception as e:
            return f"Error generating caption: {str(e)}"
    
    def save_model(self, path="./fine_tuned_caption_model"):
        """Save the fine-tuned model"""
        try:
            self.model.save_pretrained(path)
            self.processor.save_pretrained(path)
            print(f"Model saved to {path}")
        except Exception as e:
            print(f"Error saving model: {e}")
    
    def load_model(self, path="./fine_tuned_caption_model"):
        """Load a fine-tuned model"""
        try:
            self.model = BlipForConditionalGeneration.from_pretrained(path)
            self.processor = BlipProcessor.from_pretrained(path)
            self.model.to(self.device)
            print(f"Model loaded from {path}")
        except Exception as e:
            print(f"Error loading model from {path}: {e}")

In [5]:
def test_on_new_images():
    """Example function showing how to test on new images"""
    
    # Load the trained model
    caption_gen = ImageCaptionGenerator()
    
    # Option 1: Load your fine-tuned model (if available)
    try:
        caption_gen.load_model("./fine_tuned_caption_model")
        print("Loaded fine-tuned model")
    except:
        print("Using pre-trained model (fine-tuned model not found)")
    
    # Test on different types of images
    
    # Example 1: Test on a local image file
    print("\n" + "="*50)
    print("Testing on local image:")
    try:
        local_image_path = "image_transformations.png"  # Replace with your image path
        caption = caption_gen.generate_caption(local_image_path)
        print(f"Caption: {caption}")
    except Exception as e:
        print(f"Error with local image: {e}")
        print("Make sure to provide a valid image path")
    
    # Example 2: Test on an image URL
    print("\n" + "="*50)
    print("Testing on image from URL:")
    try:
        image_url = "https://images.unsplash.com/photo-1518717758536-85ae29035b6d?w=400"
        caption = caption_gen.generate_caption(image_url)
        print(f"Image URL: {image_url}")
        print(f"Caption: {caption}")
    except Exception as e:
        print(f"Error with URL image: {e}")
    
    # Example 3: Test multiple images
    print("\n" + "="*50)
    print("Testing on multiple images:")
    
    test_images = [
        "https://images.unsplash.com/photo-1552053831-71594a27632d?w=400",  # Dog
        "https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=400",  # Mountain
        "https://images.unsplash.com/photo-1571019613454-1cb2f99b2d8b?w=400",  # City
    ]
    
    for i, img_url in enumerate(test_images, 1):
        try:
            caption = caption_gen.generate_caption(img_url, max_length=30)
            print(f"Image {i}: {caption}")
        except Exception as e:
            print(f"Error with image {i}: {e}")

def interactive_caption_generator():
    """Interactive function to test images with user input"""
    
    caption_gen = ImageCaptionGenerator()
    
    # Try to load fine-tuned model
    try:
        caption_gen.load_model("./fine_tuned_caption_model")
        print("‚úì Loaded fine-tuned model")
    except:
        print("‚Ñπ Using pre-trained model")
    
    print("\n" + "="*60)
    print("üñºÔ∏è  INTERACTIVE IMAGE CAPTION GENERATOR")
    print("="*60)
    print("Enter image paths or URLs (type 'quit' to exit)")
    print("Examples:")
    print("  - Local file: /path/to/image.jpg")
    print("  - URL: https://example.com/image.jpg")
    print("="*60)
    
    while True:
        user_input = input("\nEnter image path or URL: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("Goodbye! üëã")
            break
        
        if not user_input:
            print("Please enter a valid path or URL")
            continue
        
        try:
            print("üîÑ Generating caption...")
            caption = caption_gen.generate_caption(user_input)
            print(f"üìù Caption: {caption}")
            
            # Ask for caption length preference
            custom_length = input("Want different length? Enter max words (or press Enter): ").strip()
            if custom_length.isdigit():
                caption = caption_gen.generate_caption(user_input, max_length=int(custom_length))
                print(f"üìù New Caption: {caption}")
                
        except Exception as e:
            print(f"‚ùå Error: {e}")
            print("Make sure the image path/URL is valid and accessible")

def batch_caption_generator(image_folder_path):
    """Generate captions for all images in a folder"""
    import os
    from pathlib import Path
    
    caption_gen = ImageCaptionGenerator()
    
    # Load model
    try:
        caption_gen.load_model("./fine_tuned_caption_model")
        print("‚úì Loaded fine-tuned model")
    except:
        print("‚Ñπ Using pre-trained model")
    
    # Get all image files
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(Path(image_folder_path).glob(f'*{ext}'))
        image_files.extend(Path(image_folder_path).glob(f'*{ext.upper()}'))
    
    if not image_files:
        print(f"No images found in {image_folder_path}")
        return
    
    print(f"Found {len(image_files)} images")
    
    # Generate captions
    results = []
    for img_path in tqdm(image_files, desc="Generating captions"):
        try:
            caption = caption_gen.generate_caption(str(img_path))
            results.append({
                'image': img_path.name,
                'caption': caption
            })
        except Exception as e:
            print(f"Error with {img_path.name}: {e}")
    
    # Save results
    output_file = Path(image_folder_path) / "captions.txt"
    with open(output_file, 'w') as f:
        for result in results:
            f.write(f"{result['image']}: {result['caption']}\n")
    
    print(f"Captions saved to {output_file}")
    
    # Display results
    print("\nGenerated Captions:")
    print("-" * 50)
    for result in results:
        print(f"{result['image']}: {result['caption']}")


In [6]:
def main():
    """Fixed main training function"""
    print("=== Image Caption Generator Training ===")
    
    # Initialize
    caption_gen = ImageCaptionGenerator()
    
    # Load smaller dataset for stability
    data = caption_gen.load_dataset(sample_size=200)  # Very small for testing
    
    if len(data) < 4:
        print("Not enough data for training!")
        return
    
    # Split data
    train_data, test_data = caption_gen.split_data(data, train_ratio=0.8)
    
    # Create data loaders with small batch size
    train_loader, test_loader = caption_gen.create_dataloaders(
        train_data, test_data, batch_size=1  # Start with batch size 1
    )
    
    print(f"Training batches: {len(train_loader)}")
    print(f"Test batches: {len(test_loader)}")
    
    # Train the model
    try:
        train_losses = caption_gen.train(
            train_loader, test_loader, 
            epochs=2,  # Reduced epochs for testing
            lr=5e-5
        )
        
        # Save the model
        caption_gen.save_model("./fine_tuned_caption_model")
        
        # Test generation
        print("\n=== Testing Caption Generation ===")
        test_url = "https://images.unsplash.com/photo-1518717758536-85ae29035b6d?w=400"
        caption = caption_gen.generate_caption(test_url)
        print(f"Generated caption: {caption}")
        
    except Exception as e:
        print(f"Training error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

=== Image Caption Generator Training ===
Using device: cpu
Model loaded: Salesforce/blip-image-captioning-base
Loading dataset: nlphuji/flickr30k


Processing dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:01<00:00, 192.66it/s]


Dataset loaded with 200 valid samples
Train samples: 160
Test samples: 40
Training batches: 160
Test batches: 40
Starting training...


Epoch 1/2:   0%|                                                                               | 0/160 [00:00<?, ?it/s]

Pixel values shape: torch.Size([1, 3, 384, 384])
Input IDs shape: torch.Size([1, 128])
Attention mask shape: torch.Size([1, 128])


Epoch 1/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 160/160 [18:15<00:00,  6.84s/it, loss=0.833, avg_loss=4.25]


Epoch 1 - Average Loss: 4.2512 (160 successful batches)


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [01:13<00:00,  1.85s/it]


Epoch 1 - Test Loss: 0.6390


Epoch 2/2:   0%|                                                                               | 0/160 [00:00<?, ?it/s]

Pixel values shape: torch.Size([1, 3, 384, 384])
Input IDs shape: torch.Size([1, 128])
Attention mask shape: torch.Size([1, 128])


Epoch 2/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 160/160 [-5:28:00<00:00, -0.01it/s, loss=0.158, avg_loss=0.373]


Epoch 2 - Average Loss: 0.3734 (160 successful batches)


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [01:16<00:00,  1.90s/it]


Epoch 2 - Test Loss: 0.4687
Model saved to ./fine_tuned_caption_model

=== Testing Caption Generation ===
Generated caption: a brown dog sticks out from under a tree while looking at the camera


In [7]:
test_on_new_images()

Using device: cpu
Model loaded: Salesforce/blip-image-captioning-base
Model loaded from ./fine_tuned_caption_model
Loaded fine-tuned model

Testing on local image:
Caption: four different colored squares are shown on a black background with white dots and red, green, blue, yellow

Testing on image from URL:
Image URL: https://images.unsplash.com/photo-1518717758536-85ae29035b6d?w=400
Caption: a brown dog sticks out from under a tree while looking at the camera

Testing on multiple images:
Image 1: a brown and white dog is holding a yellow stick in its mouth while sitting on a stone path
Image 2: a mountain range with snow and mountains in the background
Image 3: a woman in white tank top and orange shorts is sitting on the floor and using her cellphone


In [8]:
interactive_caption_generator()

Using device: cpu
Model loaded: Salesforce/blip-image-captioning-base
Model loaded from ./fine_tuned_caption_model
‚úì Loaded fine-tuned model

üñºÔ∏è  INTERACTIVE IMAGE CAPTION GENERATOR
Enter image paths or URLs (type 'quit' to exit)
Examples:
  - Local file: /path/to/image.jpg
  - URL: https://example.com/image.jpg



Enter image path or URL:  https://onlinepngtools.com/images/examples-onlinepngtools/red-petaled-flower.png


üîÑ Generating caption...
üìù Caption: a pink flowered tree with many pink flowers on it


Want different length? Enter max words (or press Enter):  https://themeisle.com/blog/wp-content/uploads/2024/06/Online-Image-Optimizer-Test-Image-PNG-Version.png

Enter image path or URL:  https://tinypng.com/static/images/boat-compressed.jpg


üîÑ Generating caption...
üìù Caption: a boat is in the water with the sun setting behind it


Want different length? Enter max words (or press Enter):  quit

Enter image path or URL:  quit


Goodbye! üëã


In [15]:
batch_caption_generator(r"C:\Users\InaequoSolutions-PC\Downloads\images_G-AI")

Using device: cpu
Model loaded: Salesforce/blip-image-captioning-base
Model loaded from ./fine_tuned_caption_model
‚úì Loaded fine-tuned model
Found 2 images


Generating captions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:24<00:00, 12.39s/it]

Captions saved to C:\Users\InaequoSolutions-PC\Downloads\images_G-AI\captions.txt

Generated Captions:
--------------------------------------------------
boat-compressed.jpg: a boat is in the water with the sun setting behind it
boat-compressed.jpg: a boat is in the water with the sun setting behind it



