In [None]:
# ✅ Feature Extraction Function
def extract_biomedclip_features(image_paths, batch_size=4):
    """Encode images in mini-batches via ``model.encode_image``.

    Relies on names defined elsewhere in the notebook: ``Image`` (PIL),
    ``preprocess``, ``model`` and ``device``.

    Args:
        image_paths: Sequence of image file paths that ``Image.open`` can read.
        batch_size: Number of images sent through the model per forward pass.
            Must be a positive integer.

    Returns:
        A ``(features, processed_files)`` tuple. ``features`` is the NumPy
        concatenation (along axis 0) of every batch's encoder output, and
        ``processed_files`` lists the image paths in the order they were
        encoded. For an empty ``image_paths`` the result is an empty
        ``(0, 0)`` float32 array and an empty list, because the embedding
        width cannot be known without running the model.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Nothing to encode: avoid np.concatenate([]) failing with an opaque error.
    if len(image_paths) == 0:
        return np.empty((0, 0), dtype=np.float32), []

    features, processed_files = [], []

    for start in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[start:start + batch_size]
        batch_images = []

        for img_path in batch_paths:
            # Use a context manager so the underlying file handle is closed
            # right away; convert() returns an independent in-memory image.
            with Image.open(img_path) as img:
                img_rgb = img.convert("RGB")

            # Add a leading batch axis so the samples can be concatenated.
            batch_images.append(preprocess(img_rgb).unsqueeze(0))
            processed_files.append(img_path)

        # Assemble the batch and move it to the compute device.
        batch_tensor = torch.cat(batch_images).to(device)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            image_features = model.encode_image(batch_tensor)

        # Keep results on the host so device memory does not accumulate.
        features.append(image_features.cpu().numpy())

        # Release cached GPU blocks and unreachable Python objects per batch.
        torch.cuda.empty_cache()
        gc.collect()

    return np.concatenate(features, axis=0), processed_files


# ✅ Save features for train, valid, and test sets

def save_clip_features(split_name, image_list):
    """Extract features for one dataset split and write them to an ``.npz`` file.

    The archive is named ``biomedclip_TN3K_<split_name>_features.npz`` (a
    relative path) and holds two arrays: ``features`` and ``image_filenames``.

    Args:
        split_name: Label of the split; used in the log messages and file name.
        image_list: Image paths forwarded to ``extract_biomedclip_features``.
    """
    print(f"⏳ Extracting features for {split_name} set...")

    embeddings, source_paths = extract_biomedclip_features(image_list)

    # Store the paths as a string array alongside the feature array.
    np.savez_compressed(
        f"biomedclip_TN3K_{split_name}_features.npz",
        features=embeddings,
        image_filenames=np.array(source_paths, dtype=str),
    )

    print(f"✅ Saved {split_name} features! Shape: {embeddings.shape}")


# Extract and save features for every dataset split.
# full_train_x, valid_x and test_x must each hold image paths; they are
# defined elsewhere in the notebook.
save_clip_features("train", full_train_x)
save_clip_features("valid", valid_x)
save_clip_features("test", test_x)


# Optional: show a download link for each saved feature archive.
from IPython.display import FileLink, display

for split in ("train", "valid", "test"):
    display(FileLink(f"biomedclip_TN3K_{split}_features.npz"))