In [2]:
from chromadb.utils.embedding_functions.ollama_embedding_function import (
    OllamaEmbeddingFunction,
)
import chromadb
from tqdm import tqdm
from datetime import datetime
import requests
import json
from typing import List, Union
import httpx
from tenacity import retry, stop_after_attempt, wait_exponential

class CustomOllamaEmbeddingFunction(OllamaEmbeddingFunction):
    def __init__(self, url: str = "http://localhost:11434", model_name: str = "nomic-embed-text:latest"):
        super().__init__(url=url, model_name=model_name)
        # Create a new session with increased timeout
        self._session = httpx.Client(timeout=180.0)  # 60 seconds timeout
    
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def _get_embedding(self, text: str) -> List[float]:
        """Get embedding for a single text with retry logic."""
        try:
            response = self._session.post(
                f"{self._api_url}/api/embeddings",
                json={"model": self._model_name, "prompt": text},
                timeout=180.0  # Explicit timeout for this request
            )
            response.raise_for_status()
            
            result = response.json()
            if 'embedding' in result:
                return result['embedding']
            else:
                raise ValueError(f"No embedding found in response: {result}")
                
        except httpx.TimeoutException as e:
            raise RuntimeError(f"Timeout while connecting to Ollama API. Please check if Ollama is running and responsive: {str(e)}")
        except httpx.RequestError as e:
            raise RuntimeError(f"Failed to connect to Ollama API. Please check if Ollama is running: {str(e)}")
        except Exception as e:
            raise RuntimeError(f"Unexpected error while getting embedding: {str(e)}")

    def __call__(self, input: Union[str, List[str]]) -> List[List[float]]:
        texts = input if isinstance(input, list) else [input]
        embeddings = []
        
        for text in texts:
            try:
                embedding = self._get_embedding(text)
                embeddings.append(embedding)
            except Exception as e:
                raise RuntimeError(f"Failed to get embedding for text '{text[:50]}...': {str(e)}")
            
        return embeddings


class VectorStore:
    def __init__(self, model_name, collection_name, collection_description, path, hnsw_space = 'cosine', hnsw_construction = 100, hnsw_search_ef = 100, hnsw_M = 100):
        self.model = CustomOllamaEmbeddingFunction(
            url = "http://localhost:11434", 
            model_name = model_name
        )
        self.client = chromadb.PersistentClient(path=path)
        self.collection = self.client.get_or_create_collection(
            name = collection_name, 
            embedding_function = self.model, 
            metadata = {
                "description": collection_description, 
                "created": str(datetime.now()),
                "hnsw:space": hnsw_space,
                "hnsw:construction_ef": hnsw_construction,
                "hnsw:search_ef": hnsw_search_ef,
                "hnsw:M": hnsw_M
            }
        )    
    def add_data(self, ID: str, text: str):
        embedding = self.create_embedding(text)
        self.collection.add(
            ids = [ID],
            embeddings = [embedding],
            documents = [text]
        )
    def create_embedding(self, text: str):
        return self.model(text)[0]
    def query(self, embedding, search, n_results = 5):
        return self.collection.query(
            query_embeddings = [embedding],
            n_results = n_results
        )

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [4]:
df = pd.read_csv("train.csv")
df["Id"] = df["Id"].astype(str)

In [5]:
df

Unnamed: 0,Id,Key,Value
0,31719,NEW AVIATOR STD,AVIATOR DLX
1,32419,I20 MAGNA + VTVT M1 BS4,I20 (2018) ELITE MAGNA CVT
2,13202,SCORPIO S9,SCORPIO M HAWK S9 2WD 7 STR
3,110932,"SUPERB ELEGANCE (118 KW, TFSI)",OCTAVIA ELEGANCE 1.8 TSI AT 118 KW NEW
4,77876,575 DI S,COMMANDER 650 DI
...,...,...,...
75353,7604,FIGO 1.2 EXI,FIGO 1.2 DURATEC EXI
75354,72212,ZEN LX,ZEN LX
75355,103883,JUPITER ZX BSIV,JUPITER ZX
75356,994,ACTIVA,ACTIVA 3G


In [6]:
ids = np.array(df["Id"])
keys = np.array(df["Key"])
print(ids.shape)
print(type(ids[0]))

(75358,)
<class 'str'>


In [7]:
vector_store = VectorStore("nomic-embed-text:latest", "car_model", "Car Model Collection", "./vector-embeddings", 'cosine', 5000, 5000, 5000)

In [8]:
# vector_store.add_data("100000", "NEW AVIATOR STD")
# vector_store.add_data("100001", "SCORPIO S9")
# vector_store.add_data("100002", "I20 MAGNA + VTVT M1 BS4")
# vector_store.add_data("100003", "JUPITER ZX BSIV")

In [None]:
for i in  tqdm(range(ids.shape[0])):
    vector_store.add_data(ids[i], keys[i])

Add of existing embedding ID: 100000                         | 0/75358 [00:00<?, ?it/s]
Add of existing embedding ID: 100001
Add of existing embedding ID: 100002
Add of existing embedding ID: 100003
Add of existing embedding ID: 100000
Add of existing embedding ID: 100001
Add of existing embedding ID: 100002
Add of existing embedding ID: 100003
Add of existing embedding ID: 100000
Add of existing embedding ID: 100001
Add of existing embedding ID: 100002
Add of existing embedding ID: 100003
Add of existing embedding ID: 100000
Add of existing embedding ID: 100001
Add of existing embedding ID: 100002
Add of existing embedding ID: 100003
Add of existing embedding ID: 100000
Add of existing embedding ID: 100001
Add of existing embedding ID: 100002
Add of existing embedding ID: 100003
Add of existing embedding ID: 100000
Add of existing embedding ID: 100001
Add of existing embedding ID: 100002
Add of existing embedding ID: 100003
Add of existing embedding ID: 100000
Add of existing embeddin

In [138]:
embedding = vector_store.create_embedding("magna")

In [139]:
vector_store.query(embedding, 2)

Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4


{'ids': [['100002', '100003', '100001', '100000']],
 'embeddings': None,
 'documents': [['I20 MAGNA + VTVT M1 BS4',
   'JUPITER ZX BSIV',
   'SCORPIO S9',
   'NEW AVIATOR STD']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None, None]],
 'distances': [[0.3590888136341398,
   0.5876936383394125,
   0.623756718990987,
   0.6999479441600363]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}