In [21]:
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
import numpy as np
from tqdm import tqdm

# Load the dataset that lists the images to download.
data_path = 'train.csv'
data = pd.read_csv(data_path)

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print(data.head())

def process_image(image_link):
    """Download one image and return it as a normalized 224x224 RGB array.

    Args:
        image_link: URL of the image to fetch.

    Returns:
        A float array of shape (224, 224, 3) with values in [0, 1], or
        ``None`` if the download returned a non-200 status or anything went
        wrong while fetching/decoding (the failure is printed, not raised).
    """
    try:
        # With stream=True the connection is only handed back to the pool once
        # the body is consumed or the response is closed; the context manager
        # guarantees that, including on the non-200 path where the body is
        # never read.
        with requests.get(image_link, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {image_link}: HTTP status {response.status_code}")
                return None
            payload = response.content

        img = Image.open(BytesIO(payload))
        # Force three channels: grayscale, palette, RGBA or CMYK sources would
        # otherwise yield arrays of differing shapes that cannot be stacked
        # into a single batch array.
        img = img.convert("RGB")
        img = img.resize((224, 224))  # Resize to a fixed size
        return np.array(img) / 255.0  # Normalize pixel values to [0, 1]
    except Exception as e:
        # Deliberate best-effort boundary: one bad link must not abort the run.
        print(f"Error processing {image_link}: {e}")
        return None

def process_images_in_batches(image_links, batch_size=32):
    """Lazily yield arrays of processed images, one array per batch of links.

    Args:
        image_links: Sliceable sequence of image URLs.
        batch_size: Number of links attempted per batch.

    Yields:
        ``np.array`` built from the images of the batch that were processed
        successfully. Links for which ``process_image`` returns ``None`` are
        dropped, so a batch may hold fewer than ``batch_size`` images.
    """
    batch_starts = range(0, len(image_links), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        chunk = image_links[start:start + batch_size]
        progress = tqdm(chunk, desc=f"Processing batch {batch_number}")
        # Run every link through process_image, keeping only the successes.
        attempted = (process_image(url) for url in progress)
        succeeded = [pixels for pixels in attempted if pixels is not None]
        yield np.array(succeeded)

# Process all images in batches and report per-batch statistics.
image_links = data['image_link'].values
batch_generator = process_images_in_batches(image_links)

total_images = 0
total_batches = 0

for batch_num, batch in enumerate(batch_generator, 1):
    print(f"Batch {batch_num}:")
    print(f"  Shape: {batch.shape}")
    print(f"  Number of images: {batch.shape[0]}")
    if batch.size:
        print(f"  Min value: {batch.min():.4f}")
        print(f"  Max value: {batch.max():.4f}")
        print(f"  Mean value: {batch.mean():.4f}")
        print(f"  Standard deviation: {batch.std():.4f}")
    else:
        # Every download in this batch failed: min()/max() raise ValueError on
        # an empty array, which would abort the remaining batches.
        print("  No images were processed successfully; statistics skipped.")
    print()

    total_images += batch.shape[0]
    total_batches += 1

print("Processing complete.")
print(f"Total number of batches processed: {total_batches}")
print(f"Total number of images processed: {total_images}")
print(f"Expected number of images: {len(image_links)}")
if total_images != len(image_links):
    print(f"Warning: {len(image_links) - total_images} images were not processed successfully.")

Processing batch 8: 100%|██████████| 32/32 [00:45<00:00,  1.43s/it]


Batch 8:
  Shape: (32, 224, 224, 3)
  Number of images: 32
  Min value: 0.0000
  Max value: 1.0000
  Mean value: 0.7710
  Standard deviation: 0.2814



Processing batch 9:  34%|███▍      | 11/32 [00:14<00:19,  1.09it/s]