In [1]:
import torch
import numpy as np
from transformers import AutoFeatureExtractor, ViTModel
from PIL import Image, ImageEnhance
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm
from torchvision import transforms
from torchvision.transforms import RandomResizedCrop


In [2]:
# Read the competition metadata tables
train_df = pd.read_csv("/kaggle/input/visual-taxonomy/train.csv")
test_df = pd.read_csv("/kaggle/input/visual-taxonomy/test.csv")

# Directory prefixes that the per-row image file names are appended to
train_images = "/kaggle/input/visual-taxonomy/train_images/"
test_images = "/kaggle/input/visual-taxonomy/test_images/"

# Each file name is the row id left-padded with zeros to six characters, plus ".jpg"
train_df["image_path"] = train_df["id"].map(
    lambda image_id: f"{train_images}{str(image_id).zfill(6)}.jpg"
)
test_df["image_path"] = test_df["id"].map(
    lambda image_id: f"{test_images}{str(image_id).zfill(6)}.jpg"
)

In [3]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image, ImageEnhance
import torch
import pandas as pd
from tqdm import tqdm

# Run on the GPU when torch reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the pretrained CLIP checkpoint, move it to the chosen device, then
# load the matching preprocessor from the same checkpoint name
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Helper function to transform the image
def transform_image(image, crop_fraction=0.15, brightness=1.1, contrast=1.1):
    """Crop an equal margin off every side of an image, then enhance it.

    With the default arguments this reproduces the original fixed pipeline:
    crop 15% from each side, then apply a brightness factor of 1.1 followed
    by a contrast factor of 1.1.

    Args:
        image: Image to transform. It must expose ``size`` as
            ``(width, height)`` and a ``crop(box)`` method, and be accepted
            by ``ImageEnhance.Brightness`` / ``ImageEnhance.Contrast``.
        crop_fraction: Fraction of the width (and of the height) removed
            from each side. Must satisfy ``0 <= crop_fraction < 0.5`` so
            that a non-empty region remains.
        brightness: Factor handed to the brightness enhancer's ``enhance``.
        contrast: Factor handed to the contrast enhancer's ``enhance``.

    Returns:
        The cropped image after brightness and then contrast enhancement.

    Raises:
        ValueError: If ``crop_fraction`` is outside ``[0, 0.5)``.
    """
    if not 0 <= crop_fraction < 0.5:
        raise ValueError(
            f"crop_fraction must be in [0, 0.5), got {crop_fraction!r}"
        )

    # Get image dimensions
    width, height = image.size

    # Crop box as (left, top, right, bottom). The coordinates are passed on
    # as computed (possibly fractional); rounding is left to ``crop``.
    left = width * crop_fraction
    top = height * crop_fraction
    right = width * (1 - crop_fraction)
    bottom = height * (1 - crop_fraction)
    image = image.crop((left, top, right, bottom))

    # Brightness first, then contrast on the already-brightened image
    image = ImageEnhance.Brightness(image).enhance(brightness)
    image = ImageEnhance.Contrast(image).enhance(contrast)

    return image

# Helper function to get embeddings from image paths with transformations
def get_embeddings_from_df(df, batch_size=32):
    """Compute one CLIP image embedding per row of ``df``.

    Every path in ``df['image_path']`` is opened, converted to RGB, passed
    through ``transform_image`` and embedded with the module-level
    ``model`` / ``processor`` on ``device``. Images are embedded in batches
    rather than one forward pass per image; results are returned in the
    same order as the rows of ``df``.

    Args:
        df: Table with an ``image_path`` column of image file paths.
        batch_size: Number of images sent through the model per forward
            pass. Must be at least 1. Use 1 to embed images one at a time.

    Returns:
        A list with one 1-D NumPy array per input row (empty list for an
        empty ``df``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    image_paths = list(df['image_path'])
    embeddings = []

    # Progress is still reported per image, not per batch.
    with tqdm(total=len(image_paths), desc="Processing images") as progress:
        for start in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[start:start + batch_size]

            # Load and transform the images of this batch
            batch_images = []
            for image_path in batch_paths:
                # The context manager releases the underlying file as soon as
                # the RGB copy exists, instead of relying on garbage collection.
                with Image.open(image_path) as source:
                    batch_images.append(transform_image(source.convert("RGB")))

            # Process and get embeddings for the whole batch at once
            inputs = processor(images=batch_images, return_tensors="pt").to(device)
            with torch.no_grad():
                image_features = model.get_image_features(**inputs)

            # Iterating the (batch, dim) array yields one 1-D row per image
            embeddings.extend(image_features.cpu().numpy())
            progress.update(len(batch_paths))

    return embeddings

# Example usage with train_df and test_df.
# Each split is written to CSV as soon as its embeddings are ready, so an error
# during the later (separate, long-running) test pass cannot discard the
# already-finished training embeddings.
df1_embeddings = get_embeddings_from_df(train_df)
df1_embeddings_df = pd.DataFrame(df1_embeddings)
df1_embeddings_df.to_csv("train_clip.csv", index=False)

df2_embeddings = get_embeddings_from_df(test_df)
df2_embeddings_df = pd.DataFrame(df2_embeddings)
df2_embeddings_df.to_csv("test_clip.csv", index=False)


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Processing images: 100%|██████████| 70213/70213 [26:50<00:00, 43.59it/s]
Processing images: 100%|██████████| 30205/30205 [12:51<00:00, 39.15it/s]
