In [1]:
import os
import pandas as pd
from tqdm import tqdm
import torch
torch.__version__

'2.4.1+cu121'

In [2]:
# !pip install open-clip-torch --index-url https://pypi.org/simple/
!pip install open_clip_torch

Collecting open_clip_torch
  Downloading open_clip_torch-2.29.0-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-2.29.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-2.29.0


In [3]:
# Kaggle input locations: the captions CSV and the image directory share one root.
DATA_PATH = '/kaggle/input/'
IMAGE_PATH = f'{DATA_PATH}flickr30k_images'

df = pd.read_csv(f'{DATA_PATH}captions.txt', sep=",")
# Strip leading whitespace from every comment.
df = df.assign(comment=df['comment'].str.lstrip())

# Spot-check a single row: its comment number and its comment text.
print(f"{df.loc[19999, 'comment_number']} | {df.loc[19999, 'comment']}")

4 | A dog runs across the grass .


In [4]:
# Rename the three columns in place; the dataset class below reads the
# 'image' and 'caption' columns by these names.
df.columns = ['image', 'caption_number', 'caption']
df

Unnamed: 0,image,caption_number,caption
0,1000092795.jpg,0,Two young guys with shaggy hair look at their ...
1,1000092795.jpg,1,Two young White males are outside near many b...
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .
4,1000092795.jpg,4,Two friends enjoy time spent together .
...,...,...,...
158910,998845445.jpg,0,A man in shorts and a Hawaiian shirt leans ove...
158911,998845445.jpg,1,A young man hanging over the side of a boat w...
158912,998845445.jpg,2,A man is leaning off of the side of a blue and...
158913,998845445.jpg,3,A man riding a small boat in a harbor with fo...


In [5]:
from PIL import Image
import open_clip
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer
from open_clip.factory import create_model_and_transforms
import pickle

In [6]:
# Макс
# df1 = df.iloc[0:80000]
# df1

In [7]:
# Ilyukha's share of the work: every row from position 80000 to the end of `df`
# (the commented-out cell above gives rows 0..79999 to Max).
df2 = df.iloc[80000:]
df2

Unnamed: 0,image,caption_number,caption
80000,3642088668.jpg,0,A stewardess on an airplane pushes a cart down...
80001,3642088668.jpg,1,A brunette flight attendant in a red uniform i...
80002,3642088668.jpg,2,A flight attendant is pushing a beverage cart ...
80003,3642088668.jpg,3,An airline stewardess is carefully rolling her...
80004,3642088668.jpg,4,Flight attendant in red pushes drink cart thro...
...,...,...,...
158910,998845445.jpg,0,A man in shorts and a Hawaiian shirt leans ove...
158911,998845445.jpg,1,A young man hanging over the side of a boat w...
158912,998845445.jpg,2,A man is leaning off of the side of a blue and...
158913,998845445.jpg,3,A man riding a small boat in a harbor with fo...


In [8]:
class Flickr30kImagesDataset(Dataset):
    """Captioning dataset pairing GPT-2 caption tokens with CLIP image embeddings.

    Each item is a ``(tokens, mask, prefix_embedding)`` triple:

    * ``tokens`` - caption token ids, zero-padded or truncated to ``max_seq_len``;
    * ``mask`` - float mask of length ``prefix_length + max_seq_len`` that is 1 over
      the prefix slots and the real caption tokens and 0 over the padding;
    * ``prefix_embedding`` - the CLIP embedding stored for that caption's image.
    """

    # The CLIP model and its preprocessing transform are shared by every instance.
    # They are populated on first use by `_ensure_clip_loaded` instead of in the class
    # body, so that merely defining the class or calling `from_pickle` (e.g. in a
    # merge-only session) does not load the multi-gigabyte checkpoint.
    clip_model = None
    clip_preprocess = None
    _CLIP_DEVICE = 'cuda'

    @classmethod
    def _ensure_clip_loaded(cls):
        """Load the shared CLIP model/transform once and return ``(model, preprocess)``."""
        # Stored on Flickr30kImagesDataset itself (not `cls`) so subclasses share one copy.
        if Flickr30kImagesDataset.clip_model is None:
            model, _, preprocess = create_model_and_transforms(
                'ViT-bigG-14-quickgelu',
                pretrained='metaclip_fullcc',
                device=Flickr30kImagesDataset._CLIP_DEVICE,
            )
            model.eval()  # Inference only
            Flickr30kImagesDataset.clip_model = model
            Flickr30kImagesDataset.clip_preprocess = preprocess
        return Flickr30kImagesDataset.clip_model, Flickr30kImagesDataset.clip_preprocess

    def __init__(self, image_dir: str, captions_df: pd.DataFrame, prefix_length: int, normalize_prefix=False):
        """
        image_dir: Path to the directory containing images.
        captions_df: DataFrame with 'image' and 'caption' columns. Rows that belong
            to the same image are expected to be adjacent, so the image is embedded
            only once per run of identical filenames.
        prefix_length: Length of the prefix for the embeddings.
        normalize_prefix: Whether to L2-normalize the CLIP embeddings.
        """
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.prefix_length = prefix_length
        self.normalize_prefix = normalize_prefix

        # Precompute tokens and CLIP embeddings
        self.caption_tokens = []
        self.caption2embedding = []
        self.max_seq_len = 0
        self._precompute_embeddings_and_tokens(image_dir, captions_df)

    def _embed_image(self, image_path):
        """Return the CLIP embedding (batch dimension removed) of the image at `image_path`."""
        clip_model, clip_preprocess = Flickr30kImagesDataset._ensure_clip_loaded()
        # The context manager closes the file handle once the pixels have been read.
        with Image.open(image_path) as image:
            image_tensor = clip_preprocess(image).unsqueeze(0)  # Add batch dimension
        image_tensor = image_tensor.to(Flickr30kImagesDataset._CLIP_DEVICE)

        with torch.no_grad():
            embedding = clip_model.encode_image(image_tensor).squeeze(0)
        if self.normalize_prefix:
            embedding = embedding / embedding.norm(2, -1)
        # NOTE(review): the embedding stays on the CLIP device, so a pickle written by
        # `to_pickle` presumably needs that device to be available when it is loaded.
        return embedding

    def _precompute_embeddings_and_tokens(self, image_dir, captions_df):
        """Precomputes tokens for captions and CLIP embeddings for images."""
        previous_filename = None
        previous_embedding = None
        for _, row in tqdm(captions_df.iterrows(), total=len(captions_df)):
            image_filename = row['image']
            caption = row['caption']

            # Tokenize caption
            tokens = torch.tensor(self.tokenizer.encode(caption), dtype=torch.int64)
            self.caption_tokens.append(tokens)
            self.max_seq_len = max(self.max_seq_len, tokens.shape[0])

            # Embed the image only when the filename changes. Keying on the filename
            # (rather than on the DataFrame index label) keeps captions and embeddings
            # paired even if a slice does not start on an image boundary or the index
            # has been reset, filtered or reordered.
            if image_filename != previous_filename:
                previous_embedding = self._embed_image(os.path.join(image_dir, image_filename))
                previous_filename = image_filename
                self.caption2embedding.append(previous_embedding)
            else:
                self.caption2embedding.append(previous_embedding.clone())

    def pad_tokens(self, idx):
        """Pads/truncates caption `idx` to `max_seq_len` and creates its mask.

        Returns ``(tokens, mask)``: `tokens` is an int64 tensor of length
        `max_seq_len` with zeros after the caption; `mask` is a float tensor of
        length ``prefix_length + max_seq_len`` holding 1 for the prefix slots and
        real tokens and 0 for padding.

        The stored tokens are left untouched. Writing the zero-filled padding back
        into `caption_tokens` would erase the information about where the caption
        ends, and every later call for the same index would then yield an all-ones
        mask.
        """
        stored = self.caption_tokens[idx]
        length = min(stored.shape[0], self.max_seq_len)

        tokens = torch.zeros(self.max_seq_len, dtype=torch.int64)
        tokens[:length] = stored[:length]

        mask = torch.zeros(self.prefix_length + self.max_seq_len)
        mask[:self.prefix_length + length] = 1.0  # Prefix slots + real tokens
        return tokens, mask

    def __len__(self):
        """Number of captions in the dataset."""
        return len(self.caption_tokens)

    def __getitem__(self, idx):
        """Return ``(tokens, mask, prefix_embedding)`` for caption `idx`."""
        tokens, mask = self.pad_tokens(idx)
        prefix_embedding = self.caption2embedding[idx]
        return tokens, mask, prefix_embedding

    def to_pickle(self, pickle_path: str):
        """Saves the dataset information, including caption tokens, embeddings, and parameters, to a pickle file."""
        with open(pickle_path, 'wb') as f:
            pickle.dump({
                'caption_tokens': self.caption_tokens,
                'caption2embedding': self.caption2embedding,
                'max_seq_len': self.max_seq_len,
                'prefix_length': self.prefix_length,
                'normalize_prefix': self.normalize_prefix
            }, f)

    @classmethod
    def from_pickle(cls, pickle_path: str):
        """Loads the dataset information, including caption tokens, embeddings, and parameters, from a pickle file.

        Only the five fields written by `to_pickle` are restored; the tokenizer is
        not, and the CLIP model is not loaded.
        """
        # SECURITY: pickle.load can execute arbitrary code. Only open files that
        # were produced by `to_pickle` from a trusted source.
        with open(pickle_path, 'rb') as f:
            data = pickle.load(f)
        dataset = cls.__new__(cls)  # Create an uninitialized instance
        dataset.caption_tokens = data['caption_tokens']
        dataset.caption2embedding = data['caption2embedding']
        dataset.max_seq_len = data['max_seq_len']
        dataset.prefix_length = data['prefix_length']
        dataset.normalize_prefix = data['normalize_prefix']
        return dataset


open_clip_model.safetensors:   0%|          | 0.00/10.2G [00:00<?, ?B/s]

In [9]:
def merge(first: Flickr30kImagesDataset, second: Flickr30kImagesDataset):
    """Build a new dataset holding `first`'s samples followed by `second`'s.

    The per-sample lists are concatenated, `max_seq_len` becomes the larger of the
    two, and `prefix_length` / `normalize_prefix` are copied from `first`
    (`second`'s values for those two settings are not consulted).
    """
    # Bypass __init__: nothing needs to be recomputed for a merge.
    merged = Flickr30kImagesDataset.__new__(Flickr30kImagesDataset)

    # Scalar settings.
    merged.prefix_length, merged.normalize_prefix = first.prefix_length, first.normalize_prefix
    merged.max_seq_len = max(first.max_seq_len, second.max_seq_len)

    # Per-sample lists, in first-then-second order.
    for attr in ('caption_tokens', 'caption2embedding'):
        setattr(merged, attr, getattr(first, attr) + getattr(second, attr))
    return merged

In [10]:
# Макс
# dataset1 = Flickr30kImagesDataset(
#     IMAGE_PATH,
#     df1,
#     5,
# )

In [11]:
# Ilyukha's share: tokenize and embed the df2 rows, using a prefix length of 5.
dataset2 = Flickr30kImagesDataset(image_dir=IMAGE_PATH, captions_df=df2, prefix_length=5)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

78915it [51:28, 25.55it/s]


In [12]:
# dataset1.to_pickle("Flickr30kImagesDatasetSave1") # Макс

In [13]:
# Write Ilyukha's share to the working directory.
dataset2.to_pickle("Flickr30kImagesDatasetSave2") # Ilyukha

## Теперь в Output есть файл, скачай его

# А дальше уже слияние, не запускаем пока

In [14]:
# check1 = Flickr30kImagesDataset.from_pickle("/kaggle/working/Flickr30kImagesDatasetSave1")

In [15]:
# check2 = Flickr30kImagesDataset.from_pickle("/kaggle/working/Flickr30kImagesDatasetSave2")

In [16]:
# final = merge(check1, check2)

In [17]:
# final.to_pickle("Flickr30kImagesDatasetSaveFinal")