In [1]:
import pandas as pd
# Load the wine table; the path is relative to the notebook's working directory.
df = pd.read_csv('../../top_rated_wines.csv')
# Drop rows with a missing `variety` (NaN blows up serialization) and rows with
# missing `notes`: the notes are the text that gets embedded when the records
# are uploaded, and a NaN there is a float rather than text.
df = df.dropna(subset=['variety', 'notes'])
# One plain dict per wine, keyed by column name.
data = df.to_dict('records')
df

Unnamed: 0,name,region,variety,rating,notes
0,3 Rings Reserve Shiraz 2004,"Barossa Valley, Barossa, South Australia, Aust...",Red Wine,96.0,Vintage Comments : Classic Barossa vintage con...
1,Abreu Vineyards Cappella 2007,"Napa Valley, California",Red Wine,96.0,Cappella is a proprietary blend of two clones ...
2,Abreu Vineyards Cappella 2010,"Napa Valley, California",Red Wine,98.0,Cappella is one of the oldest vineyard sites i...
3,Abreu Vineyards Howell Mountain 2008,"Howell Mountain, Napa Valley, California",Red Wine,96.0,When David purchased this Howell Mountain prop...
4,Abreu Vineyards Howell Mountain 2009,"Howell Mountain, Napa Valley, California",Red Wine,98.0,"As a set of wines, it is hard to surpass the f..."
...,...,...,...,...,...
1360,Lewis Cellars Alec's Blend Red 2002,"Napa Valley, California",Red Wine,96.0,Number 12 on
1361,Lewis Cellars Cabernet Sauvignon 2002,"Napa Valley, California",Red Wine,96.0,Showcasing the unique personalities of small h...
1362,Lewis Cellars Cuvee L Cabernet Sauvignon 2015,"Napa Valley, California",Red Wine,96.0,"Straight from James Fenimore Cooper’s novel, L..."
1363,Lewis Cellars Reserve Cabernet Sauvignon 2010,"Napa Valley, California",Red Wine,96.0,


In [2]:
##imports the models submodule from the qdrant_client module. 
#This submodule likely contains model definitions used in the Qdrant Client library, such as definitions for records, vectors, collections, etc.
##imports the QdrantClient class from the qdrant_client module. 
#This class is the main interface for interacting with a Qdrant vector database.
##imports the SentenceTransformer class from the sentence_transformers module.
#The SentenceTransformer class is part of the sentence_transformers library, which is used for encoding text sentences into fixed-dimensional embeddings. 
#This class is commonly used to load pre-trained transformer models that can generate embeddings for input sentences.

from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [3]:
# Load the pre-trained sentence-transformer model called 'all-MiniLM-L6-v2'.
# The resulting SentenceTransformer object is kept in `encoder`; the cells below
# call encoder.encode(...) to turn text into embedding vectors and
# encoder.get_sentence_embedding_dimension() to size the collection.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

encoder = SentenceTransformer(EMBEDDING_MODEL_NAME)  # Model to create embeddings

In [4]:
# Create the vector database client.
# location=":memory:" asks QdrantClient for an in-memory instance instead of one
# stored on disk, so its contents last only as long as this Python process.
# The client is kept in `qdrant` and is used below to create the collection,
# upload records, and run searches.
qdrant = QdrantClient(location=":memory:")  # Create in-memory Qdrant instance

In [5]:
# Create the collection that will store the wines.
# recreate_collection() deletes any existing collection with the same name and
# creates a new, empty one -- here called "top_wines".
# vectors_config describes the vectors stored in the collection:
#   size     - dimensionality of each vector; read from the encoder so it always
#              matches the embeddings the model produces.
#   distance - metric used for similarity search; cosine compares the angle
#              between two vectors.
embedding_size = encoder.get_sentence_embedding_dimension()  # Vector size is defined by used model
wine_vector_params = models.VectorParams(
    size=embedding_size,
    distance=models.Distance.COSINE,
)

qdrant.recreate_collection(
    collection_name="top_wines",
    vectors_config=wine_vector_params,
)

True

In [6]:
# Upload one record per wine to the "top_wines" collection. Each record carries:
#   id      - the wine's position in `data` (from enumerate)
#   vector  - the embedding of the wine's notes, converted to a plain list
#   payload - the full wine dict (one row from df.to_dict('records')), so a
#             search hit can return the wine's details
# Note that we use an older way of Qdrant doing the uploads using Records instead of Points.

# Embed all notes in one batched encode() call rather than one call per wine;
# the result has one embedding row per entry of `data`, in the same order.
note_embeddings = encoder.encode([doc["notes"] for doc in data])

qdrant.upload_records(
    collection_name="top_wines",
    records=[
        models.Record(
            id=idx,
            vector=embedding.tolist(),
            payload=doc
        ) for idx, (doc, embedding) in enumerate(zip(data, note_embeddings)) # data is the variable holding all the wines
    ]
)

In [7]:
# Search time for awesome wines!
# The query sentence is embedded with the same `encoder` used for the wine notes,
# then qdrant.search() looks up its nearest neighbours in the "top_wines"
# collection. limit=3 caps the result at the three best matches.
mendoza_query = "A wine from Mendoza Argentina"
mendoza_query_vector = encoder.encode(mendoza_query).tolist()

hits = qdrant.search(
    collection_name="top_wines",
    query_vector=mendoza_query_vector,
    limit=3,
)

# Each hit exposes the stored wine dict (payload) and its score.
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'name': 'Catena Zapata Nicasia Vineyard Malbec 2004', 'region': 'Argentina', 'variety': 'Red Wine', 'rating': 96.0, 'notes': '"The single-vineyard 2004 Malbec Nicasia Vineyard is located in the Altamira district of Mendoza. It was aged for 18 months in new French oak. Opaque purple-colored, it exhibits a complex perfume of pain grille, scorched earth, mineral, licorice, blueberry, and black cherry. Thick on the palate, bordering on opulent, it has layers of fruit, silky tannins, and a long, fruit-filled finish. It will age effortlessly for another 6-8 years and provide pleasure through 2025. When all is said and done, Catena Zapata is the Argentina winery of reference – the standard of excellence for comparing all others. The brilliant, forward-thinking Nicolas Catena remains in charge, with his daughter, Laura, playing an increasingly large role. The Catena Zapata winery is an essential destination for fans of both architecture and wine in Mendoza. It is hard to believe, given the su

In [8]:
# Embed the Napa Valley query and fetch its three nearest wines from "top_wines".
napa_query = "A wine from Napa Valley, California"
napa_query_vector = encoder.encode(napa_query).tolist()

hits = qdrant.search(
    collection_name="top_wines",
    query_vector=napa_query_vector,
    limit=3,
)

# Print every match's payload together with its score.
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'name': 'Lewis Cellars Cabernet Sauvignon 2002', 'region': 'Napa Valley, California', 'variety': 'Red Wine', 'rating': 96.0, 'notes': 'Showcasing the unique personalities of small hillside vineyards from Pritchard Hill, Oakville and Rutherford, the 2002 Napa Valley Cabernet delivers compelling aromas of mocha, ripe berries, tobacco and sweet oak spice. The wine is 100% Cabernet Sauvignon, complex, rich and focused. With a deep core of black fruit and traces of briar and vanilla, it turns chocolaty and long on the palate with serious, integrated tannins.'} score: 0.6766242373750224
{'name': 'Bacio Divino Janzen Beckstoffer To Kalon Cabernet Sauvignon 2014', 'region': 'Napa Valley, California', 'variety': 'Red Wine', 'rating': 97.0, 'notes': "Andy Beckstoffer’s To Kalon Vineyard is considered perhaps the best vineyard in all of Napa Valley. Soils, aspect, drainage, meticulous growing practices- all have a part in the remarkability of this site. Wines from this vineyard are renowned for 