In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import pandas as pd
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm, trange





In [None]:


!curl -O https://storage.googleapis.com/generall-shared-data/startups_demo.json


In [15]:
# Sentence-embedding model used to turn startup descriptions into vectors.
# It runs on the CPU here; pass device="cuda" instead if a GPU is available.
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

In [16]:
# The dataset is stored as JSON Lines (one record per line), hence lines=True.
df = pd.read_json(path_or_buf="./startups_demo.json", lines=True)

In [17]:
# Embed "<alt>. <description>" for every row of the dataframe.
vectors = model.encode(
    [
        alt + ". " + description
        for alt, description in zip(df["alt"], df["description"])
    ],
    show_progress_bar=True,
)

Batches: 100%|██████████| 1265/1265 [14:42<00:00,  1.43it/s]


In [19]:
# Sanity check: one row per encoded text, one column per embedding dimension.
vectors.shape
# > (40474, 384)

(40474, 384)

In [20]:
# Store the embeddings on disk; they are read back with np.load further down.
# allow_pickle=False makes np.save refuse to pickle object arrays.
np.save(file="startup_vectors.npy", arr=vectors, allow_pickle=False)

## Run Qdrant in Docker

In [None]:
## Shell commands: run these in a terminal, not in the notebook
# docker pull qdrant/qdrant
# mkdir my-project
# cd my-project
# mkdir qdrant_storage
# docker run --name qdrant_instance -p 6333:6333 -v "$(pwd)/qdrant_storage:/qdrant/storage" -d qdrant/qdrant


In [21]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# Connect to the Qdrant instance listening on localhost:6333, the port
# published by the `docker run -p 6333:6333` command shown earlier.
client = QdrantClient("http://localhost:6333")

In [22]:
# Create the "startups" collection only when it does not exist yet, so that
# re-running this cell is harmless. The vector size (384) matches the second
# dimension reported by `vectors.shape`; similarity is cosine distance.
if not client.collection_exists(collection_name="startups"):
    client.create_collection(
        vectors_config=VectorParams(distance=Distance.COSINE, size=384),
        collection_name="startups",
    )

In [33]:
# Parse the JSON Lines file eagerly inside a `with` block so the file handle is
# closed again, and decode it explicitly as UTF-8 (the records contain
# non-ASCII characters, so the platform default encoding is not safe).
# `payload` is a list of startup dicts: unlike a one-shot `map` iterator it can
# be previewed in one cell and still be uploaded, in full, by a later cell.
with open("./startups_demo.json", encoding="utf-8") as fd:
    payload = [json.loads(line) for line in fd]

# Load all vectors into memory; a NumPy array is iterable row by row.
# If holding everything in RAM is a problem, np.load(..., mmap_mode="r")
# memory-maps the file instead.
vectors = np.load("./startup_vectors.npy")

In [34]:
# Materialise the payload records and print the first few as a preview.
# Note: if `payload` is a one-shot iterator, this exhausts it.
payload_list = [*payload]
for i, data in zip(range(5), payload_list):  # raise the bound to see more items
    print(f"Item {i}: {data}")

Item 0: {'name': 'SaferCodes', 'images': 'https://safer.codes/img/brand/logo-icon.png', 'alt': 'SaferCodes Logo QR codes generator system forms for COVID-19', 'description': 'QR codes systems for COVID-19.\nSimple tools for bars, restaurants, offices, and other small proximity businesses.', 'link': 'https://safer.codes', 'city': 'Chicago'}
Item 1: {'name': 'Human Practice', 'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/373036-94d1e190f12f2c919c3566ecaecbda68-thumb_jpg.jpg?buster=1396498835', 'alt': 'Human Practice -  health care information technology', 'description': 'Point-of-care word of mouth\nPreferral is a mobile platform that channels physicians’ interest in networking with their peers to build referrals within a hospital system.\nHospitals are in a race to employ physicians, even though they lose billions each year ($40B in 2014) on employment. Why ...', 'link': 'http://humanpractice.com', 'city': 'Chicago'}
Item 2: {'name': 'StyleSeek', 'images': 'https://d1qb2nb

In [25]:
# Send the vectors together with their payload records to the "startups"
# collection, batch by batch.
client.upload_collection(
    collection_name="startups",
    payload=payload,
    vectors=vectors,
    batch_size=256,  # number of vectors sent in a single request
    ids=None,  # no explicit ids: they are assigned automatically
)

## Build the search API

In [47]:
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from qdrant_client.models import Filter
    
# Only return startups whose payload field "city" equals this value.
city_of_interest = "Berlin"
city_filter = Filter(
    must=[
        {
            "key": "city",  # payload field that stores the city name
            "match": {"value": city_of_interest},  # field must hold this value
        }
    ]
)

class NeuralSearcher:
    """Semantic search over a Qdrant collection of startup records.

    A query string is embedded with the same sentence-transformer model that
    was used to build the collection, and the closest stored vectors are
    looked up in Qdrant.
    """

    # Sentinel for "no filter argument given": `search` then falls back to the
    # notebook-level `city_filter`, which is what the method always used before
    # the filter became a parameter.
    _NOTEBOOK_FILTER = object()

    def __init__(self, collection_name):
        """Create the encoder and the Qdrant client.

        Args:
            collection_name: Name of the Qdrant collection to search in.
        """
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
        # Initialize Qdrant client
        self.qdrant_client = QdrantClient("http://localhost:6333")

    def search(self, text: str, query_filter=_NOTEBOOK_FILTER, limit: int = 5):
        """Return the payloads of the stored items closest to ``text``.

        Args:
            text: Free-text query.
            query_filter: Qdrant filter applied to the search. When omitted,
                the notebook-level ``city_filter`` is used; pass ``None`` to
                search without any filter.
            limit: Maximum number of results to return (default 5).

        Returns:
            A list with the payload of every hit, in the order Qdrant
            returned them.
        """
        if query_filter is self._NOTEBOOK_FILTER:
            # Looked up at call time, so re-assigning `city_filter` in the
            # notebook takes effect on the next search.
            query_filter = city_filter

        # Convert the text query into a plain list of floats
        vector = self.model.encode(text).tolist()

        # Look up the closest vectors in the collection.
        # NOTE(review): recent qdrant-client releases deprecate `search` in
        # favour of `query_points`; confirm against the installed version
        # before upgrading the client.
        search_result = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=vector,
            query_filter=query_filter,
            limit=limit,
        )
        # Each hit carries more than its payload, but only the stored payload
        # is of interest to the caller.
        return [hit.payload for hit in search_result]
    
        