In [None]:
# utils.py has to be importable (same directory or on sys.path).
import sys
# Adjust path if needed
import pandas as pd
from utils import download_images

# Destination folder on the mounted Google Drive.
IMAGE_FOLDER_ON_DRIVE = '/content/drive/MyDrive/amazon_ml_images/testing'

# Read the CSV holding the image links. The frame keeps the name `train_df`,
# but the file read here is test.csv.
train_df = pd.read_csv('/content/drive/MyDrive/test.csv')

# Fetch every linked image straight into the Drive folder.
print(f"Downloading images directly to your Google Drive at: {IMAGE_FOLDER_ON_DRIVE}")
download_images(train_df['image_link'], IMAGE_FOLDER_ON_DRIVE)
print("Download to Drive complete!")

Downloading images directly to your Google Drive at: /content/drive/MyDrive/amazon_ml_images/testing
Using 2 parallel workers for download.


 42%|████▏     | 31532/75000 [02:07<16:10, 44.78it/s]



 56%|█████▌    | 42044/75000 [05:12<08:32, 64.30it/s]

ERROR: Could not download https://m.media-amazon.com/images/I/813CjSgHj0S.jpg after multiple attempts.


100%|██████████| 75000/75000 [17:59<00:00, 69.50it/s]

Download to Drive complete!





In [None]:
import os
# List every entry in the download folder and show how many there are.
# The count can be compared with the number of links passed to download_images
# to see how many distinct files actually landed on Drive.
images=os.listdir(IMAGE_FOLDER_ON_DRIVE)
len(images)

72221

In [None]:
import os
from PIL import Image
from tqdm import tqdm

# Folder to scan and whether files that fail the check should be removed.
IMAGE_FOLDER = IMAGE_FOLDER_ON_DRIVE
DELETE_BAD_FILES = True
# Only files with these (case-insensitive) extensions are inspected.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# --- SCRIPT ---
bad_images = []
# List the folder once: the original cell listed it twice, and the first call
# ran before this cell's own `import os`.
image_files = os.listdir(IMAGE_FOLDER)
images = image_files  # the original cell also bound this name to the listing
print(f"Scanning {len(image_files)} files in '{IMAGE_FOLDER}'...")

# Use tqdm for a progress bar
for filename in tqdm(image_files):
    # We only care about image files
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    file_path = os.path.join(IMAGE_FOLDER, filename)

    try:
        # An empty (0-byte) file can never be a valid image; no need to open it.
        if os.path.getsize(file_path) == 0:
            bad_images.append(filename)
            continue

        # Image.verify() checks the file for corruption without decoding the
        # pixel data, so it is a fast check rather than an exhaustive one.
        with Image.open(file_path) as img:
            img.verify()

    except Exception as e:
        # Any failure (not an image, corrupted, unreadable) marks the file as bad.
        print(f"\nFound bad image: {filename}, Error: {e}")
        bad_images.append(filename)

print("\n--- Scan Complete ---")
print(f"Found {len(bad_images)} corrupted or empty image files.")

# --- Optional: Delete the bad files ---
if not bad_images:
    # Nothing to delete; do not tell the user to flip a flag that may already be set.
    print("No bad files found; nothing to delete.")
elif DELETE_BAD_FILES:
    print(f"Deleting {len(bad_images)} bad files...")
    for filename in bad_images:
        os.remove(os.path.join(IMAGE_FOLDER, filename))
    print("Deletion complete.")
else:
    print("To delete bad files, set DELETE_BAD_FILES = True and re-run the cell.")

KeyboardInterrupt: 

In [None]:
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import keras as k
from keras.models import Sequential
from keras.layers import Conv2D,BatchNormalization,Activation,GlobalAveragePooling2D,Dropout,Dense,MaxPooling2D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.applications.efficientnet import EfficientNetB0, preprocess_input, decode_predictions

In [None]:
# EfficientNetB0 backbone with ImageNet weights and no classification head
# (include_top=False), declared for 224x224 three-channel inputs.
# NOTE: the feature-extraction cell further down builds `base_model` again with
# the same arguments, so this cell is redundant when that one is run.
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
# Freeze the backbone so its weights are not trainable.
base_model.trainable = False

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model


# --- 1. Paths ---
# Despite its name, TRAIN_CSV_PATH points at test.csv: this notebook embeds the test images.
TRAIN_CSV_PATH = '/content/drive/MyDrive/test.csv'
# Folder the images are read from.
IMAGE_FOLDER_ON_DRIVE = '/content/drive/MyDrive/amazon_ml_images/testing'
# Where the resulting embedding matrix is written with np.save.
SAVE_PATH_FOR_VECTORS = '/content/drive/MyDrive/test_efficientnet_vectors.npy' # output .npy file

# --- 2. Build the Complete Feature Extractor Model ---
# Load the base model without its top classification layers
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
# Freeze the layers so their weights don't change
base_model.trainable = False

# Global average pooling over the backbone's output feature map, so each image
# yields a single 1D feature vector instead of a spatial map.
x = base_model.output
pooling_layer = GlobalAveragePooling2D()(x)

# Final model: backbone input -> pooled feature vector.
model = Model(inputs=base_model.input, outputs=pooling_layer)
print("EfficientNetB0 feature extractor model built successfully.")
model.summary() # Optional: print the layer-by-layer structure

# --- 3. Load your data and create the full image paths ---
df = pd.read_csv(TRAIN_CSV_PATH)

# List the folder once and resolve names in memory; per-file existence checks
# against a mounted Drive folder are slow.
available_files = set(os.listdir(IMAGE_FOLDER_ON_DRIVE))


def _pick_image_filename(sample_id, image_link, available):
    """Return the file name to use for one CSV row.

    ``<sample_id>.jpg`` is preferred (the original naming). If that file is not
    in ``available`` and ``image_link`` is a string, the last path component of
    the link (query string stripped) is used when a file with that name exists.
    Otherwise the id-based name is returned unchanged.
    """
    by_id = f"{sample_id}.jpg"
    if by_id in available or not isinstance(image_link, str):
        return by_id
    # NOTE(review): assumes download_images may store files under the URL's
    # basename -- confirm against utils.py.
    by_url = os.path.basename(image_link.split('?', 1)[0])
    return by_url if by_url in available else by_id


image_links = df['image_link'] if 'image_link' in df.columns else [None] * len(df)
image_paths = [
    os.path.join(IMAGE_FOLDER_ON_DRIVE, _pick_image_filename(sample_id, link, available_files))
    for sample_id, link in zip(df['sample_id'], image_links)
]
print(f"Found {len(image_paths)} image paths to process.")

# Building a path does not mean the file exists: report how many really do, and
# stop early when none do, since every embedding would then come from the
# blank placeholder image.
images_on_disk = sum(os.path.basename(path) in available_files for path in image_paths)
print(f"{images_on_disk} of {len(image_paths)} image paths exist in '{IMAGE_FOLDER_ON_DRIVE}'.")
if image_paths and images_on_disk == 0:
    raise FileNotFoundError(
        f"None of the {len(image_paths)} expected image files exist in "
        f"'{IMAGE_FOLDER_ON_DRIVE}'. Check how the downloader names its files."
    )

# --- 4. Function to process images in batches ---
def create_image_embeddings(paths, batch_size=64):
    """Embed images with the module-level ``model``, one output row per path.

    Parameters
    ----------
    paths : sequence of str
        Image file paths; output row ``i`` corresponds to ``paths[i]``.
    batch_size : int, default 64
        Number of images loaded and passed to ``model.predict`` at a time.

    Returns
    -------
    numpy.ndarray
        The ``model.predict`` outputs of all batches stacked along axis 0.
        An empty ``paths`` returns an empty array without touching the model.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.

    Notes
    -----
    A path that does not exist, or whose file cannot be loaded, is fed to the
    model as an all-zero 224x224x3 image. Its output row is therefore the
    model's response to a blank image, not a zero vector. Because such rows
    cannot be told apart afterwards, a summary of how many images were
    replaced is printed once all batches are done.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(paths) == 0:
        return np.array([])

    input_shape = (224, 224, 3)  # matches the input_shape the model was built with
    missing_paths = []
    unreadable = []  # (path, exception) pairs
    batch_outputs = []

    for start in tqdm(range(0, len(paths), batch_size), desc="Processing Batches"):
        batch_paths = paths[start:start + batch_size]
        # Pre-filled with zeros, so a skipped image stays a blank placeholder.
        # float32 matches img_to_array's output and avoids upcasting the batch.
        batch_inputs = np.zeros((len(batch_paths),) + input_shape, dtype=np.float32)

        for row, img_path in enumerate(batch_paths):
            if not os.path.exists(img_path):
                missing_paths.append(img_path)
                continue

            try:
                img = image.load_img(img_path, target_size=input_shape[:2])
                batch_inputs[row] = image.img_to_array(img)
            except Exception as e:
                # Best effort: keep going, but remember what failed and why.
                unreadable.append((img_path, e))
                batch_inputs[row] = 0.0

        # Preprocess the entire batch at once
        batch_to_predict = tf.keras.applications.efficientnet.preprocess_input(batch_inputs)

        # Get feature vectors for the entire batch
        batch_outputs.append(model.predict(batch_to_predict, verbose=0))

    replaced = len(missing_paths) + len(unreadable)
    if replaced:
        print(
            f"\nWARNING: {replaced} of {len(paths)} images were replaced by a blank "
            f"placeholder ({len(missing_paths)} missing, {len(unreadable)} unreadable)."
        )
        for example in missing_paths[:3]:
            print(f"  missing: {example}")
        for example, error in unreadable[:3]:
            print(f"  unreadable: {example} ({error})")

    return np.concatenate(batch_outputs, axis=0)

# --- 5. Generate and Save the Embeddings ---
print("Starting EfficientNet vector generation...")
test_image_vectors = create_image_embeddings(image_paths)

# Persist the embedding matrix so it does not have to be recomputed.
np.save(SAVE_PATH_FOR_VECTORS, test_image_vectors)

# Final report: banner, array shape, and where the file was written.
print(
    "\n--- EfficientNet Vector Generation Complete! ---",
    f"Shape of saved vectors: {test_image_vectors.shape}",
    f"Vectors saved successfully to: {SAVE_PATH_FOR_VECTORS}",
    sep="\n",
)

EfficientNetB0 feature extractor model built successfully.


Found 75000 image paths to process.
Starting EfficientNet vector generation...


Processing Batches: 100%|██████████| 1172/1172 [11:27<00:00,  1.71it/s]



--- EfficientNet Vector Generation Complete! ---
Shape of saved vectors: (75000, 1280)
Vectors saved successfully to: /content/drive/MyDrive/test_efficientnet_vectors.npy


In [None]:
import numpy as np

# Read the saved embedding matrix back from Drive. The path is the same literal
# that the extraction cell stores in SAVE_PATH_FOR_VECTORS.
test_image_vectors = np.load('/content/drive/MyDrive/test_efficientnet_vectors.npy')

# Print the shape to confirm the array loaded with the expected dimensions.
print("Shape of loaded vectors:", test_image_vectors.shape)

Shape of loaded vectors: (75000, 1280)


In [None]:
# Display the loaded array (NumPy abbreviates large arrays in its repr).
# NOTE(review): in the saved output the rows shown are numerically identical,
# which suggests the images were replaced by the blank placeholder -- verify
# that the image files were actually found before using these vectors.
test_image_vectors

array([[-0.070159  , -0.09937076, -0.13313173, ..., -0.19447373,
        -0.0501325 ,  0.05287945],
       [-0.070159  , -0.09937076, -0.13313173, ..., -0.19447373,
        -0.0501325 ,  0.05287945],
       [-0.070159  , -0.09937076, -0.13313173, ..., -0.19447373,
        -0.0501325 ,  0.05287945],
       ...,
       [-0.07015891, -0.09937071, -0.13313189, ..., -0.19447377,
        -0.05013247,  0.05287931],
       [-0.07015891, -0.09937071, -0.13313189, ..., -0.19447377,
        -0.05013247,  0.05287931],
       [-0.07015891, -0.09937071, -0.13313189, ..., -0.19447377,
        -0.05013247,  0.05287931]], dtype=float32)

In [None]:
# Count the entries of the test embedding matrix that are exactly zero.
# The original compared with `> 0`, which counts strictly positive entries,
# and its message named train_image_vectors.
zero_count = np.count_nonzero(test_image_vectors == 0)

print(f"Number of zero values in test_image_vectors: {zero_count}")

Number of zero values in train_image_vectors: 7875000


In [None]:
# Report the on-disk size of the saved embedding file.
file_path = '/content/drive/MyDrive/test_efficientnet_vectors.npy'
# st_size is the file size in bytes.
file_size_bytes = os.stat(file_path).st_size
print(f"The size of the file is: {file_size_bytes} bytes")

The size of the file is: 384000128 bytes


In [None]:
# Display the train embedding array.
# NOTE(review): train_image_vectors is not assigned in any cell visible in this
# notebook, so this line relies on kernel state from elsewhere and raises
# NameError on a fresh kernel unless the array is loaded first.
train_image_vectors

array([[-0.07015897, -0.0993707 , -0.13313167, ..., -0.19447377,
        -0.05013252,  0.05287931],
       [-0.07015897, -0.0993707 , -0.13313167, ..., -0.19447377,
        -0.05013252,  0.05287931],
       [-0.07015897, -0.0993707 , -0.13313167, ..., -0.19447377,
        -0.05013252,  0.05287931],
       ...,
       [-0.07015891, -0.09937071, -0.13313189, ..., -0.19447377,
        -0.05013247,  0.05287931],
       [-0.07015891, -0.09937071, -0.13313189, ..., -0.19447377,
        -0.05013247,  0.05287931],
       [-0.07015891, -0.09937071, -0.13313189, ..., -0.19447377,
        -0.05013247,  0.05287931]], dtype=float32)

In [None]:
from google.colab import files

# Ask Colab to send the saved embedding file to the user's machine as a download.
files.download('/content/drive/MyDrive/test_efficientnet_vectors.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>