<a href="https://colab.research.google.com/github/Rohit-78958/Qdrant/blob/master/qdrant_semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -U sentence-transformers



In [None]:
pip install -U qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.12.1-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client)
  Downloading protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting grpcio>=1.41.0 (from qdrant-client)
  Downloading grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant-client)
  Downloading h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting hyperframe<7,>=6.0 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client)
  Downloading hyperframe-6.0.1-py3-none-any.whl.metadata (2.7 kB)
Collecting hpack<5,>=4.0 (fro

In [None]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model used to turn book descriptions and queries into vectors.
encoder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Small science-fiction catalogue used as the demo corpus.
# Each row is (name, description, author, year); the rows are expanded into
# dictionaries below, with keys in the order given by _BOOK_FIELDS.
_BOOK_FIELDS = ("name", "description", "author", "year")

_BOOK_ROWS = [
    (
        "The Time Machine",
        "A man travels through time and witnesses the evolution of humanity.",
        "H.G. Wells",
        1895,
    ),
    (
        "Ender's Game",
        "A young boy is trained to become a military leader in a war against an alien race.",
        "Orson Scott Card",
        1985,
    ),
    (
        "Brave New World",
        "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.",
        "Aldous Huxley",
        1932,
    ),
    (
        "The Hitchhiker's Guide to the Galaxy",
        "A comedic science fiction series following the misadventures of an unwitting human and his alien friend.",
        "Douglas Adams",
        1979,
    ),
    (
        "Dune",
        "A desert planet is the site of political intrigue and power struggles.",
        "Frank Herbert",
        1965,
    ),
    (
        "Foundation",
        "A mathematician develops a science to predict the future of humanity and works to save civilization from collapse.",
        "Isaac Asimov",
        1951,
    ),
    (
        "Snow Crash",
        "A futuristic world where the internet has evolved into a virtual reality metaverse.",
        "Neal Stephenson",
        1992,
    ),
    (
        "Neuromancer",
        "A hacker is hired to pull off a near-impossible hack and gets pulled into a web of intrigue.",
        "William Gibson",
        1984,
    ),
    (
        "The War of the Worlds",
        "A Martian invasion of Earth throws humanity into chaos.",
        "H.G. Wells",
        1898,
    ),
    (
        "The Hunger Games",
        "A dystopian society where teenagers are forced to fight to the death in a televised spectacle.",
        "Suzanne Collins",
        2008,
    ),
    (
        "The Andromeda Strain",
        "A deadly virus from outer space threatens to wipe out humanity.",
        "Michael Crichton",
        1969,
    ),
    (
        "The Left Hand of Darkness",
        "A human ambassador is sent to a planet where the inhabitants are genderless and can change gender at will.",
        "Ursula K. Le Guin",
        1969,
    ),
    (
        "The Three-Body Problem",
        "Humans encounter an alien civilization that lives in a dying system.",
        "Liu Cixin",
        2008,
    ),
]

documents = [dict(zip(_BOOK_FIELDS, row)) for row in _BOOK_ROWS]

In [None]:
# Qdrant client created with the special ":memory:" location.
client = QdrantClient(location=":memory:")

In [None]:
# The vector size is dictated by the embedding model, so it is read from the
# encoder instead of being hard-coded; similarity is measured with cosine distance.
book_vector_params = models.VectorParams(
    distance=models.Distance.COSINE,
    size=encoder.get_sentence_embedding_dimension(),
)

client.create_collection(collection_name="my_books", vectors_config=book_vector_params)

True

In [None]:
# Build one point per book: the list index becomes the point id, the encoded
# description becomes the vector, and the whole record is stored as payload.
book_points = []
for point_id, book in enumerate(documents):
    embedding = encoder.encode(book["description"]).tolist()
    book_points.append(models.PointStruct(id=point_id, vector=embedding, payload=book))

client.upload_points(collection_name="my_books", points=book_points)

In [None]:
# Embed the free-text query and ask for the single closest book.
query_vector = encoder.encode("alien invasion").tolist()
response = client.query_points(collection_name="my_books", query=query_vector, limit=1)
hits = response.points

for hit in hits:
    print(hit.payload, "score:", hit.score)

{'name': 'The War of the Worlds', 'description': 'A Martian invasion of Earth throws humanity into chaos.', 'author': 'H.G. Wells', 'year': 1898} score: 0.5700933395837129


In [None]:
# Same semantic query, but a payload filter only lets through books whose
# "year" field is greater than or equal to 2000.
published_since_2000 = models.FieldCondition(key="year", range=models.Range(gte=2000))
response = client.query_points(
    collection_name="my_books",
    query=encoder.encode("alien invasion").tolist(),
    query_filter=models.Filter(must=[published_since_2000]),
    limit=2,
)
hits = response.points

for hit in hits:
    print(hit.payload, "score:", hit.score)

{'name': 'The Three-Body Problem', 'description': 'Humans encounter an alien civilization that lives in a dying system.', 'author': 'Liu Cixin', 'year': 2008} score: 0.4590293090137795
{'name': 'The Hunger Games', 'description': 'A dystopian society where teenagers are forced to fight to the death in a televised spectacle.', 'author': 'Suzanne Collins', 'year': 2008} score: 0.16074508691053407


## Neural Search Engine

In [None]:
# Download the startups demo dataset; wget saves it as startups_demo.json
# in the current working directory.
!wget https://storage.googleapis.com/generall-shared-data/startups_demo.json

--2024-11-07 04:56:24--  https://storage.googleapis.com/generall-shared-data/startups_demo.json
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.202.207, 173.194.203.207, 74.125.199.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.202.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22205751 (21M) [application/json]
Saving to: ‘startups_demo.json’


2024-11-07 04:56:25 (23.3 MB/s) - ‘startups_demo.json’ saved [22205751/22205751]



In [None]:
pip install sentence-transformers numpy pandas tqdm




In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Embedding model used to encode the startup descriptions.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# The dataset is read in line-delimited mode (lines=True): one JSON record per line.
df = pd.read_json(path_or_buf="./startups_demo.json", lines=True)

In [None]:
# Text to embed for each startup: its "alt" tagline followed by its description.
startup_texts = [
    alt + ". " + description
    for alt, description in zip(df["alt"], df["description"])
]

vectors = model.encode(startup_texts, show_progress_bar=True)

Batches:   0%|          | 0/1265 [00:00<?, ?it/s]

In [None]:
# Display the shape of the embedding array produced by model.encode above.
vectors.shape
# > (40474, 384)  -- value recorded from the original run

(40474, 384)

In [None]:
# Persist the embeddings to disk so they can be reloaded without re-encoding;
# pickling is disabled for the saved file.
np.save(file="startup_vectors.npy", arr=vectors, allow_pickle=False)

In [None]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance



In [None]:
# Create the "startups" collection only when it is missing, so re-running this
# cell does not fail on an already existing collection.
if not client.collection_exists("startups"):
    client.create_collection(
        collection_name="startups",
        vectors_config=VectorParams(
            # Take the vector size from the embedding model instead of a magic 384,
            # so the collection always matches the vectors that `model` produces.
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )

In [None]:
# Read the startup records: the file holds one JSON object per line.
# The context manager closes the file handle, and materialising the records as a
# list (instead of a lazy `map` over an open file) means the upload cell can be
# re-run without silently sending an already exhausted iterator.
with open("./startups_demo.json", encoding="utf-8") as fd:
    payload = [json.loads(line) for line in fd]

# Load all vectors into memory; a numpy array is iterable row by row.
# If the data should not be held fully in RAM, np.load(..., mmap_mode="r") is an alternative.
vectors = np.load("./startup_vectors.npy")

In [None]:
# Send every embedding together with its startup record to the "startups" collection.
# ids=None means the point ids are assigned automatically; batch_size is the
# number of vectors uploaded in a single request.
client.upload_collection(
    "startups",
    vectors=vectors,
    payload=payload,
    batch_size=256,
    ids=None,
)

In [None]:
# Find the three startups whose embedded text is closest to the query "fashion".
hits = client.query_points(
    collection_name="startups",
    query=encoder.encode("fashion").tolist(),
    query_filter=None,  # no payload filter: search the whole collection
    limit=3,
).points

for hit in hits:
    print(hit.payload, "score:", hit.score)

{'name': 'Stylesight', 'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/39623-ebef07b8fadfed7813ea596025cb4bce-thumb_jpg.jpg?buster=1408289603', 'alt': 'Stylesight -  fashion', 'description': '', 'link': 'http://www.stylesight.com/', 'city': 'New York'} score: 0.7719707483194118
{'name': 'Shoptiques', 'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/79468-058a052c27ed13baf8e509dcd1178880-thumb_jpg.jpg?buster=1407426708', 'alt': 'Shoptiques -  fashion', 'description': '', 'link': 'http://www.shoptiques.com', 'city': 'New York'} score: 0.7548952769362056
{'name': 'Fashion85', 'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/349642-8543683c8155c15bb6b152efcb96c34c-thumb_jpg.jpg?buster=1393298465', 'alt': 'Fashion85 -  mobile fashion', 'description': 'Fashion Crowd-Opinion App\nEvery day, millions of women have several questions of style, either about the combination of looks, makeup, accessories or even questions related to purchasing products. Sometimes i

In [None]:
#must filter technique

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer


class NeuralSearcher:
    """Semantic search over a Qdrant collection, restricted with a `must` filter.

    Queries are embedded with the "all-MiniLM-L6-v2" sentence-transformer on CPU
    and sent through the module-level `client`.
    """

    def __init__(self, collection_name):
        """Remember the target collection and load the encoder model."""
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

    def search(self, text: str, city: str = "Chicago", limit: int = 5):
        """Return the closest points whose payload field "city" equals `city`.

        Args:
            text: Free-text query to embed and search for.
            city: Value the "city" payload field must match (default "Chicago").
            limit: Maximum number of results to return (default 5).

        Returns:
            The `.points` of the query response; each hit exposes `.payload`
            and `.score`.
        """
        # Convert text query into vector
        vector = self.model.encode(text).tolist()

        # `must`: every listed condition has to hold for a point to be returned.
        city_filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="city",
                    match=models.MatchValue(value=city),
                )
            ]
        )

        # Use `vector` to search for the closest vectors in the collection.
        return client.query_points(
            collection_name=self.collection_name,
            query=vector,
            query_filter=city_filter,
            limit=limit,
        ).points

In [None]:
#must not filter technique

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer


class NeuralSearcher:
    """Semantic search over a Qdrant collection, restricted with a `must_not` filter.

    Queries are embedded with the "all-MiniLM-L6-v2" sentence-transformer on CPU
    and sent through the module-level `client`.
    """

    def __init__(self, collection_name):
        """Remember the target collection and load the encoder model."""
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

    def search(self, text: str, city: str = "Chicago", limit: int = 5):
        """Return the closest points whose payload field "city" is NOT `city`.

        Args:
            text: Free-text query to embed and search for.
            city: Value of the "city" payload field to exclude (default "Chicago").
            limit: Maximum number of results to return (default 5).

        Returns:
            The `.points` of the query response; each hit exposes `.payload`
            and `.score`.
        """
        # Convert text query into vector
        vector = self.model.encode(text).tolist()

        # `must_not`: points matching any listed condition are excluded.
        exclude_city_filter = models.Filter(
            must_not=[
                models.FieldCondition(
                    key="city",
                    match=models.MatchValue(value=city),
                )
            ]
        )

        # Use `vector` to search for the closest vectors in the collection.
        return client.query_points(
            collection_name=self.collection_name,
            query=vector,
            query_filter=exclude_city_filter,
            limit=limit,
        ).points

In [None]:
#should filter technique

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer


class NeuralSearcher:
    """Semantic search over a Qdrant collection, restricted with a `should` filter.

    Queries are embedded with the "all-MiniLM-L6-v2" sentence-transformer on CPU
    and sent through the module-level `client`.
    """

    def __init__(self, collection_name):
        """Remember the target collection and load the encoder model."""
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

    def search(self, text: str, cities=("Chicago", "New York"), limit: int = 5):
        """Return the closest points whose payload field "city" is one of `cities`.

        Args:
            text: Free-text query to embed and search for.
            cities: Accepted values for the "city" payload field
                (default: "Chicago" and "New York"). Intended to be non-empty.
            limit: Maximum number of results to return (default 5).

        Returns:
            The `.points` of the query response; each hit exposes `.payload`
            and `.score`.
        """
        # Convert text query into vector
        vector = self.model.encode(text).tolist()

        # `should`: one condition per accepted city.
        city_conditions = [
            models.FieldCondition(
                key="city",
                match=models.MatchValue(value=city),
            )
            for city in cities
        ]

        # Use `vector` to search for the closest vectors in the collection.
        return client.query_points(
            collection_name=self.collection_name,
            query=vector,
            query_filter=models.Filter(should=city_conditions),
            limit=limit,
        ).points

In [None]:
# Create a neural searcher instance
# Build a searcher bound to the "startups" collection and run one free-text query.
neural_searcher = NeuralSearcher("startups")
search_results = neural_searcher.search("pandora style fashion in new yoirk")

# Show each hit's stored payload next to its similarity score.
for hit in search_results:
    print(hit.payload, "score:", hit.score)

{'name': 'Gekks', 'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/597007-2cbe730f50e417d66e92fa339ff5ae7f-thumb_jpg.jpg?buster=1422897483', 'alt': 'Gekks -  retail fashion Designer footwear', 'description': 'Innovative no-show sock\nGekks are thin, breathable liner socks that grip to the inside of your boat shoes, loafers, drivers, and more. Our customers get to keep their sockless style by slipping in and out of their shoes barefoot, yet add comfort and anti-stench properties while inside ...', 'link': 'http://www.mygekks.com', 'city': 'Chicago'} score: 0.40567598066328314
{'name': 'MaBaker', 'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/706547-18cac9fa2ec094d004c07bb41eaa6a9e-thumb_jpg.jpg?buster=1432234428', 'alt': 'MaBaker -  mobile social media platforms art online dating', 'description': 'Your tattoo can chat to another\nNow your Tattoo can like and be liked by other Tattoos around it! TatChat’s aim is to attract and unite all people that bare body art, 

## Filtering

Collecting fastembed==0.3.6 (from qdrant-client[fastembed]>=1.8.2)
  Downloading fastembed-0.3.6-py3-none-any.whl.metadata (7.7 kB)
Collecting PyStemmer<3.0.0,>=2.2.0 (from fastembed==0.3.6->qdrant-client[fastembed]>=1.8.2)
  Downloading PyStemmer-2.2.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting loguru<0.8.0,>=0.7.2 (from fastembed==0.3.6->qdrant-client[fastembed]>=1.8.2)
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting mmh3<5.0,>=4.0 (from fastembed==0.3.6->qdrant-client[fastembed]>=1.8.2)
  Downloading mmh3-4.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting onnx<2.0.0,>=1.15.0 (from fastembed==0.3.6->qdrant-client[fastembed]>=1.8.2)
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime<2.0.0,>=1.17.0 (from fastembed==0.3.6->qdr