##### `Image Search Using Pinecone and ConvBase For Feature Extraction`

In [1]:
# Import Main Library
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
# import qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Batch, Filter, MatchValue, FieldCondition 


# model
import timm
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

##### `Prepare Dataset`

In [2]:
pathes = [os.path.join('data-sample', image) for image in os.listdir('data-sample')]
df = pd.DataFrame({'path': pathes,
                   'id': np.arange(3054, 3054 + len(pathes), 1),
                   'class': ['class-a', 'class-b'] * int(len(pathes)/2)}
                  )

df.head()

Unnamed: 0,path,id,class
0,data-sample\00357563a7.jpg,3054,class-a
1,data-sample\003bd60fa9.jpg,3055,class-b
2,data-sample\01c6b7230c.jpg,3056,class-a
3,data-sample\024a037366.jpg,3057,class-b
4,data-sample\029c926ce9.jpg,3058,class-a


* `Load Envirnoment`

In [3]:
_ = load_dotenv(override=True)
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')

* `Model: VGG19 for Feature Extraction`

In [4]:
model = timm.create_model('vgg19', pretrained=True)
model = nn.Sequential(*list(model.children())[:-1])
_ = model.eval()

model

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

* `Extract Image Feature`

In [5]:
def extract_image_feature(image_paths: list):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    batch_fearures = []
    for image_path in image_paths:
        # Convert Image Path to Pillow
        image = Image.open(image_path).convert('RGB')
        image = transform(image).unsqueeze(0)
        
        # Pass The Image to Model to Extract Feature.
        with torch.no_grad():
            conv_feature = model(image)
            
            image_features = conv_feature.view(conv_feature.size(0), -1).tolist()[0]
        
        # append Feature
        batch_fearures.append(image_features)
    
    return batch_fearures
        

In [6]:
results = extract_image_feature(image_paths=['data-sample/0a73823599.jpg', 'data-sample/866a4779a7.jpg'])

vect_length = len(results[0])
print(f'Vectore Feature Length: {vect_length}')

Vectore Feature Length: 4096


* `Upserting to Pinecone`

In [9]:
# connect to Qdrant
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Collecting configration
collection_config = VectorParams(
    size=vect_length,
    distance=Distance.COSINE, 
    on_disk=True
)


## Create a collection
try:
    collec_name = 'Image-Search'
    client.create_collection(collection_name=collec_name, vectors_config=collection_config)
    print('Collection Created Successfuly')
except:
    print(f'Collection {collec_name} Already Exist.')

Collection Image-Search Already Exist.


In [10]:
df

Unnamed: 0,path,id,class
0,data-sample\00357563a7.jpg,3054,class-a
1,data-sample\003bd60fa9.jpg,3055,class-b
2,data-sample\01c6b7230c.jpg,3056,class-a
3,data-sample\024a037366.jpg,3057,class-b
4,data-sample\029c926ce9.jpg,3058,class-a
...,...,...,...
235,data-sample\9d21019336.jpg,3289,class-b
236,data-sample\9e020b77ac.jpg,3290,class-a
237,data-sample\9efd18dd6c.jpg,3291,class-b
238,data-sample\9f5fc65189.jpg,3292,class-a


In [11]:
## Function for upserting data to Qdrant
def upsert_to_qdrant(df, batch_size=32):

    ## A list for failed_ids
    failed_ids = []

    for batch_start in tqdm(range(0, len(df), batch_size)):

        try:
            ## Prepare batches
            batch_end = min(batch_start+batch_size, len(df))
            pathes_batch = df['path'][batch_start: batch_end].tolist()
            ids_batch = df['id'][batch_start: batch_end].tolist()     ## No need to be converted to string (Qdrant need integer)
            
            ## Payload
            payload_batch = [{'class': cls} for cls in df['class'][batch_start: batch_end].tolist()]

            ## Get Embeddings using HuggingFace model
            embeds_batch = extract_image_feature(image_paths=pathes_batch)

            ## Prepare to Qdrant
            to_upsert = Batch(ids=ids_batch, vectors=embeds_batch, payloads=payload_batch)

            ## Upsert to Qdrant
            _ = client.upsert(collection_name=collec_name, wait=True, points=to_upsert)


        except Exception as e:
            print(f'Error in upserting: {e}')
            failed_ids.append(ids_batch)

    return failed_ids


## Apply the function
failed_ids = upsert_to_qdrant(df=df, batch_size=32)

 12%|█▎        | 1/8 [00:16<01:55, 16.44s/it]

Error in upserting: The write operation timed out


100%|██████████| 8/8 [02:01<00:00, 15.21s/it]


In [12]:
## Check Status of Collection after upserting
collection_status = client.get_collection(collection_name=collec_name).status
collection_count_vectors = client.get_collection(collection_name=collec_name).points_count

print(f'Status is: {collection_status}')
print(f'Vectors Count is: {collection_count_vectors}')

Status is: green
Vectors Count is: 208


In [16]:
image_new_path = df['path'].iloc[-1]
image_feats_new = extract_image_feature(image_paths=[image_new_path])[0]

client.search(collection_name=collec_name, query_vector=image_feats_new, limit=10, 
              score_threshold=0.4, 
              query_filter=Filter(must=[FieldCondition(key='class', match=MatchValue(value='class-a'))]))

[ScoredPoint(id=3160, version=2, score=0.77637136, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3224, version=4, score=0.76000416, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3164, version=2, score=0.72962016, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3130, version=1, score=0.6861493, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3220, version=4, score=0.6848402, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3246, version=5, score=0.681923, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3222, version=4, score=0.6789307, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3166, version=2, score=0.6692039, payload={'class': 'class-a'}, vector=None, shard_key