In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.insert(0, os.path.abspath('..'))

In [6]:
from argparse import ArgumentParser
from functools import partial
from multiprocessing.pool import ThreadPool
from pathlib import Path
from urllib.request import urlretrieve
import requests

import pandas as pd
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from time import perf_counter

from db_client import Searcher

In [3]:
DOWNLOAD_URL = "https://unsplash-datasets.s3.amazonaws.com/lite/latest/unsplash-research-dataset-lite-latest.zip"
DATASET_PATH = Path(os.path.abspath('.')) / "unsplash-dataset"
DOWNLOADED_PHOTOS_PATH = DATASET_PATH / "photos"
DOWNLOADED_PHOTOS_PATH.mkdir(parents=True, exist_ok=True)

IMAGE_WIDTH = 480
THREADS_COUNT = 32

CLIP_URL = "http://127.0.0.1:8000"

### Download photo metadata

In [10]:
zip_filename = "unsplash-dataset.zip"
print(f"Downloading metadata file {zip_filename}...")
os.system(f"curl -o {zip_filename} {DOWNLOAD_URL}")
print(f"Extracting {zip_filename}...")
os.system(f"unzip {zip_filename} -d {str(DATASET_PATH)}")

Downloading metadata file unsplash-dataset.zip...


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  471M  100  471M    0     0  6632k      0  0:01:12  0:01:12 --:--:-- 7364k


Extracting unsplash-dataset.zip...
Archive:  unsplash-dataset.zip
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/collections.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/__MACOSX/._collections.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/colors.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/__MACOSX/._colors.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/conversions.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/__MACOSX/._conversions.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/DOCS.md  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/keywords.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset2/__MACOSX/._keywords.tsv000  
  inflating: /home/tobijes/dev/poc_opensearch/datasets/unsplash-dataset

0

### Download actual photos

In [4]:
exceptions = []
def download_photo(image_width, photo):
    photo_id = photo[0]
    photo_url = photo[1] + f"?w={image_width}"
    photo_path = DOWNLOADED_PHOTOS_PATH / f"{photo_id}.jpg"
    if not photo_path.exists():
        try:
            urlretrieve(photo_url, photo_path)
        except Exception as e:
            exceptions.append(f"Cannot download {photo_url}: {e}")

In [5]:
df = pd.read_csv(DATASET_PATH / "photos.tsv000", sep="\t", usecols=["photo_id", "photo_image_url"])
photos = df.values.tolist()

print("Photo downloading begins...")
with ThreadPool(THREADS_COUNT) as pool:
    exceptions = []
    # An enclosing list() statement waits for the iterator to end
    list(tqdm(pool.imap(partial(download_photo, IMAGE_WIDTH), photos), total=len(photos)))
print("Photo downloading finished!")
print("Exceptions:")
for exception in exceptions:
    print(exception)

Photo downloading begins...


 21%|██▏       | 5341/25000 [00:55<03:31, 92.89it/s]  

Cannot download https://images.unsplash.com-grass-sun.jpg?w=480: <urlopen error [Errno -2] Name or service not known>


 39%|███▉      | 9788/25000 [03:18<10:34, 23.96it/s] 

Cannot download https://images.unsplash.com/photo-1556015174-ac6f87f53456?w=480: HTTP Error 500: Internal Server Error


 56%|█████▌    | 13959/25000 [05:45<06:36, 27.84it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1668883188879-3a7acd2bec58?w=480: HTTP Error 404: Not Found


 58%|█████▊    | 14416/25000 [06:01<05:51, 30.10it/s]

Cannot download https://images.unsplash.com/photo-1454313882001-e79de1a9b00d?w=480: HTTP Error 500: Internal Server Error


 60%|█████▉    | 14960/25000 [06:19<05:25, 30.82it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1675446536649-e0d90add63bb?w=480: HTTP Error 404: Not Found


 60%|█████▉    | 14985/25000 [06:20<04:44, 35.20it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1675826725982-e8508781c558?w=480: HTTP Error 404: Not Found


 69%|██████▊   | 17152/25000 [07:35<03:35, 36.41it/s]  

Cannot download https://images.unsplash.com_TheBeach.jpg?w=480: <urlopen error [Errno -2] Name or service not known>


 71%|███████   | 17713/25000 [07:55<07:46, 15.61it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1696839602315-4bf9599635f2?w=480: HTTP Error 404: Not Found


 71%|███████▏  | 17871/25000 [08:00<03:08, 37.73it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1664547606960-008623079291?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1669842336797-4dfc03afa025?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1678382343928-825f71ac5856?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1675826638532-09d5c136dd7a?w=480: HTTP Error 404: Not Found


 72%|███████▏  | 17906/25000 [08:01<04:10, 28.28it/s]

Cannot download https://images.unsplash.com/photo-1547595175-1abe606baafd?w=480: HTTP Error 500: Internal Server Error


 73%|███████▎  | 18257/25000 [08:13<03:55, 28.67it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1676660359441-c620089f798a?w=480: HTTP Error 404: Not Found


 73%|███████▎  | 18299/25000 [08:14<03:23, 32.99it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1695635230516-e69891d27488?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1700984292461-fa2d83c28c6b?w=480: HTTP Error 404: Not Found


 74%|███████▎  | 18378/25000 [08:17<03:45, 29.31it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1700391373098-cd9acd1b7e7c?w=480: HTTP Error 404: Not Found


 89%|████████▉ | 22357/25000 [10:47<01:16, 34.45it/s]  

Cannot download https://images.unsplash.company?w=480: <urlopen error [Errno -2] Name or service not known>


 92%|█████████▏| 22964/25000 [11:09<01:18, 25.89it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1698420085881-1b840c138a47?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1695219820032-34cfa7950b09?w=480: HTTP Error 404: Not Found


 93%|█████████▎| 23319/25000 [11:20<00:43, 38.62it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1677636665443-2276041868e7?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1700567963303-1b83673c52a4?w=480: HTTP Error 404: Not Found


 96%|█████████▋| 24087/25000 [11:47<00:29, 30.77it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1676667573119-40081df5d920?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1673980029859-9e5ddaee1577?w=480: HTTP Error 404: Not Found


 96%|█████████▋| 24102/25000 [11:50<01:13, 12.27it/s]

Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1701192455496-a095e6e5f792?w=480: HTTP Error 404: Not Found
Cannot download https://images.unsplash.com/unsplash-premium-photos-production/premium_photo-1690325099431-1e536ba176bf?w=480: HTTP Error 404: Not Found


100%|██████████| 25000/25000 [12:30<00:00, 33.33it/s]

Photo downloading finished!





### Ingest photos

In [4]:
class UnsplashDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.photo_files = list(DOWNLOADED_PHOTOS_PATH.glob("*.jpg"))
        df = pd.read_csv(DATASET_PATH / "photos.tsv000", sep="\t", usecols=["photo_id", "photo_image_url"])
        self.id_to_url = {photo_id: photo_image_url for photo_id, photo_image_url in df.values.tolist()}

    def __len__(self):
        return len(self.photo_files)

    def __getitem__(self, idx):
        photo_file = self.photo_files[idx]
        photo_id = photo_file.name.split(".")[0]
        # image = utils.pil_loader(photo_file)
        photo_image_url = self.id_to_url[photo_id]
        return photo_id, photo_image_url, photo_file


def collate(batch):
    return zip(*batch)

def get_features(photo_files):
    photo_files = map(str, photo_files)
    multiple_files = [
        ("images", (photo_file, open(photo_file, "rb"), "image/jpg")) for photo_file in photo_files
    ]
    r = requests.post(CLIP_URL + "/images", files=multiple_files, verify=None)
    data = r.json()
    return data["feature_vectors"]

def generate_data():
    dataset = UnsplashDataset()
    dataloader = DataLoader(dataset, batch_size=64, shuffle=False, collate_fn=collate)

    for batch in tqdm(dataloader):
        photo_ids, photo_image_urls, photo_files = batch
        image_features = get_features(photo_files)
        batch_size = len(photo_ids)
        for i in range(batch_size):
            yield {
                "_index": "image",
                "_id": photo_ids[i],
                "url": photo_image_urls[i],
                "feature_vector": image_features[i],
            }

In [9]:
searcher = Searcher("image")

print("Creating an index...")
searcher.create_index()

print("Indexing images...")
start = perf_counter()
searcher.bulk_ingest(generate_data(), chunk_size=128)
end = perf_counter()

print(f"Ingestion took {end-start}s")



Creating an index...
Indexing images...


100%|██████████| 391/391 [07:19<00:00,  1.12s/it]

Ingestion took 440.02710314799697s



