##### `Image Search Using Pinecone and ConvBase For Feature Extraction`

In [1]:
# Import Main Library
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
# import pinecone
from pinecone import Pinecone, ServerlessSpec

# model
import timm
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

##### `Prepare Dataset`

In [2]:
# Folder holding the sample images that will be indexed.
DATA_DIR = 'data-sample'

# os.listdir returns entries in arbitrary order; sorting keeps the
# path -> id assignment identical across runs and machines.
pathes = [os.path.join(DATA_DIR, image) for image in sorted(os.listdir(DATA_DIR))]

# Placeholder labels alternate a, b, a, b, ... and are generated per row so the
# column always has exactly len(pathes) entries, including for an odd file count
# (repeating the pair len/2 times would be one short and make DataFrame raise).
class_labels = [('class-a', 'class-b')[position % 2] for position in range(len(pathes))]

df = pd.DataFrame({'path': pathes,
                   'id': np.arange(3054, 3054 + len(pathes), 1),
                   'class': class_labels}
                  )

df.head()

Unnamed: 0,path,id,class
0,data-sample\00357563a7.jpg,3054,class-a
1,data-sample\003bd60fa9.jpg,3055,class-b
2,data-sample\01c6b7230c.jpg,3056,class-a
3,data-sample\024a037366.jpg,3057,class-b
4,data-sample\029c926ce9.jpg,3058,class-a


* `Load Environment`

In [3]:
# Load variables from a .env file; override=True lets values from the file
# replace ones already present in the process environment.
load_dotenv(override=True)

# The Pinecone key is read from the environment so it never appears in the notebook.
# The value is None when the variable is not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

* `Model: VGG19 for Feature Extraction`

In [4]:
# Build the feature extractor: take the pretrained VGG19 from timm and keep every
# top-level child module except the last one (presumably the classification head),
# then switch the truncated network to evaluation mode.
pretrained_vgg = timm.create_model('vgg19', pretrained=True)
model = nn.Sequential(*tuple(pretrained_vgg.children())[:-1]).eval()

# The full network is no longer needed once the truncated copy exists.
del pretrained_vgg

model

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

* `Extract Image Feature`

In [5]:
def extract_image_feature(image_paths: list):
    """Turn image files into flat feature vectors using the module-level ``model``.

    Every image is opened, converted to RGB, resized to 224x224, turned into a
    tensor and normalized with the mean/std constants below. The preprocessed
    images are stacked and sent through ``model`` in a single forward pass with
    gradient tracking disabled.

    Args:
        image_paths: Paths of the image files to embed.

    Returns:
        A list with one entry per input path, in input order. Each entry is the
        model output for that image flattened into a list of Python floats.
        An empty ``image_paths`` yields an empty list.
    """
    # Nothing to embed: return early (torch.stack cannot handle an empty list).
    if len(image_paths) == 0:
        return []

    # Per-channel mean/std are presumably the ImageNet statistics the pretrained
    # weights expect -- confirm if the backbone is ever swapped.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    image_tensors = []
    for image_path in image_paths:
        # The context manager closes the underlying file handle as soon as the
        # pixels are converted, instead of leaving it open until garbage collection.
        with Image.open(image_path) as image_file:
            image_tensors.append(transform(image_file.convert('RGB')))

    # One batched forward pass instead of one pass per image.
    with torch.no_grad():
        conv_features = model(torch.stack(image_tensors))

    # Collapse everything after the batch dimension so each image becomes one flat vector.
    return conv_features.flatten(start_dim=1).tolist()
        

In [6]:
# Embed two sample images to find out how long one feature vector is;
# that length is used later as the dimension of the Pinecone index.
sample_paths = ['data-sample/0a73823599.jpg', 'data-sample/866a4779a7.jpg']
results = extract_image_feature(image_paths=sample_paths)

vect_length = len(results[0])
print('Vectore Feature Length: {}'.format(vect_length))

Vectore Feature Length: 4096


* `Upserting to Pinecone`

In [7]:
# Client for the Pinecone control plane, authenticated with the key loaded from the environment.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# WARNING: destructive. This removes EVERY index in the Pinecone project tied to
# the API key, not only the one used by this notebook. Failures are reported and
# the notebook continues (best effort).
try:
    print('Deleting all indexes')
    for existing_name in pinecone.list_indexes().names():
        pinecone.delete_index(name=existing_name)
except Exception as e:
    print('Error In Deleting Indexes: {}'.format(e))


index_name = 'image-search-live'

# list_indexes() yields index description objects, so the membership test must be
# made against .names(); comparing the plain string with those objects is always
# "not in" and would attempt to re-create an index that already exists.
if index_name not in pinecone.list_indexes().names():
    print('Creating Index: {}'.format(index_name))
    pinecone.create_index(
        name=index_name,
        dimension=vect_length,  # must equal the length of the extracted feature vectors
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    print('Done Creating Index: {}'.format(index_name))


# Handle used by the following cells for upsert / query / delete / fetch.
index = pinecone.Index(index_name)
index

Deleting all indexes
Creating Index: image-search-live
Done Creating Index: image-search-live


<pinecone.data.index.Index at 0x1d5e566b2b0>

In [8]:
def upserting_to_pinecone(df_images, batch_size=32):
    """Embed the images listed in ``df_images`` and upsert them into the module-level ``index``.

    Rows are processed in consecutive batches of ``batch_size``. For each batch
    the images are embedded with ``extract_image_feature`` and sent to Pinecone
    as ``(id, vector, {'class': ...})`` tuples. A batch that fails is reported
    and skipped; the remaining batches are still processed.

    Args:
        df_images: DataFrame with the columns ``path``, ``id`` and ``class``.
        batch_size: Number of rows embedded and upserted per request.

    Returns:
        A list with one entry per failed batch; each entry is the list of
        string ids belonging to that batch. Empty when every batch succeeded.
    """
    faild_ids = []

    for batch_start in tqdm(range(0, len(df_images), batch_size)):
        # Positional slice; .iloc clips at the end of the frame, so the last
        # batch may simply be shorter.
        batch = df_images.iloc[batch_start:batch_start + batch_size]

        # Computed before the try block so the except handler always refers to
        # the ids of the *current* batch (never an unbound or stale name).
        ids_batch = batch['id'].astype(str).tolist()

        try:
            paths_batch = batch['path'].tolist()
            metadata_classes = batch['class'].tolist()

            # Call to Extract Image Feature.
            batch_extracted = extract_image_feature(image_paths=paths_batch)

            # Prepare Data To Upserting
            to_upsert = [
                (vector_id, features, {'class': cls})
                for vector_id, features, cls in zip(ids_batch, batch_extracted, metadata_classes)
            ]

            # To Upserting in Pinecone
            index.upsert(vectors=to_upsert)

        except Exception as e:
            # Best effort: report the failure and remember which ids were not stored.
            print(f'Faild upserting {e}')
            faild_ids.append(ids_batch)

    return faild_ids


# Upload the whole dataset in batches of 32 and keep whatever the function
# reports as failed so it can be inspected afterwards.
faild_ids = upserting_to_pinecone(df, batch_size=32)

100%|██████████| 8/8 [01:57<00:00, 14.64s/it]


##### `Query In Pinecone`

In [10]:
# Get the most similar images for a query image.
# The query image is the last row of df, which has already been upserted,
# so the best match is expected to be that same image.
image_path_new = df['path'].iloc[-1]

# extract_image_feature returns one vector per path; take the only one.
image_feaures = extract_image_feature(image_paths=[image_path_new])[0]

# Query In Pinecone: `vector` takes a single flat list of floats, so the feature
# vector is passed as-is rather than wrapped in another list.
result = index.query(vector=image_feaures, top_k=5, include_metadata=True)
result['matches']

[{'id': '3293',
  'metadata': {'class': 'class-b'},
  'score': 0.999999881,
  'values': []},
 {'id': '3160',
  'metadata': {'class': 'class-a'},
  'score': 0.776371419,
  'values': []},
 {'id': '3224',
  'metadata': {'class': 'class-a'},
  'score': 0.760004163,
  'values': []},
 {'id': '3164',
  'metadata': {'class': 'class-a'},
  'score': 0.729620099,
  'values': []},
 {'id': '3135',
  'metadata': {'class': 'class-b'},
  'score': 0.701402307,
  'values': []}]

##### `Deleting In Pinecone`

In [16]:
# Remove two vectors from the index by id; the response is discarded so the
# cell shows no output.
ids_to_delete = ['3327', '3152']
_ = index.delete(ids=ids_to_delete)

In [17]:
# Show the index statistics (dimension and vector counts) to confirm the deletion.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 239}},
 'total_vector_count': 239}

##### `Inserting In Pinecone`

In [26]:
# Fetch one stored vector by id and preview its first five values.
vector_id = '3293'
fetched = index.fetch(ids=[vector_id])

fetched['vectors'][vector_id]['values'][:5]

[0.0, 0.0, 0.0, 0.0, 0.0]