In [None]:
%pip install -q qdrant-client
%pip install -q nltk

In [None]:
%pip install -q fastembed no-deps

In [None]:
%pip install scipy

Testing out Qdrant for a future RAG set-up

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

# Name of the small sandbox collection used for the first experiments
collection_name = "demo_collection"

# In-memory instance for the demo; replace with actual endpoint for production
client = QdrantClient(":memory:")

# Create the collection only when it is missing, so re-running the cell is harmless
if client.collection_exists(collection_name):
    print(f"Collection '{collection_name}' already exists.")
else:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=3, distance=Distance.EUCLID),
    )

In [None]:
from qdrant_client.http.models import PointStruct

# Three toy 3-dimensional vectors, matching the demo collection's `size=3`
vectors = [
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
    [7.0, 8.0, 9.0],
]

# Each vector's list position is its point ID. All points go in a single
# upsert call instead of one call per point.
client.upsert(
    collection_name="demo_collection",
    points=[PointStruct(id=idx, vector=vector) for idx, vector in enumerate(vectors)],
)


In [None]:
query_vector = [1.1, 2.1, 3.1]

# Perform the search
search_results = client.search(
    collection_name="demo_collection",
    query_vector=query_vector,
    limit=2,  # Number of closest matches to return
    with_vectors=True,  # Stored vectors are omitted from results unless requested
)

# Display search results; the collection uses Distance.EUCLID, so the score is a distance
for result in search_results:
    print(f"ID: {result.id}, Distance: {result.score}, Vector: {result.vector}")


In [None]:
collection_name = "my_embeddings_collection"

# Drop any existing collection so this cell always starts from an empty one
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

# Create the collection with the typed config models, as the demo collection does.
# Adjust `size` to match your embedding dimension.
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

In [None]:
# Read the whole source document into memory. The encoding is explicit so the
# result does not depend on the platform's default locale encoding.
with open("general_advice.txt", "r", encoding="utf-8") as file:
    text_data = file.read()


In [None]:
from nltk.tokenize import sent_tokenize  # Example using NLTK

def chunk_text(text, max_tokens=512, sentence_splitter=None):
    """Group consecutive sentences of `text` into chunks of at most `max_tokens` words.

    "Tokens" here are whitespace-separated words, not model tokens. Sentences are
    never split: a chunk is closed *before* adding a sentence that would push it
    over the limit, so a chunk only exceeds `max_tokens` when a single sentence
    is longer than the limit on its own.

    Args:
        text: The document to split.
        max_tokens: Maximum number of whitespace-separated words per chunk.
        sentence_splitter: Optional callable mapping a string to a list of
            sentences. Defaults to NLTK's `sent_tokenize`.
            NOTE(review): `sent_tokenize` needs NLTK's Punkt data to be
            available; no `nltk.download(...)` call is visible in this
            notebook - confirm.

    Returns:
        A list of chunk strings, each made of whole sentences joined by single
        spaces, in document order. Empty input yields an empty list.
    """
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    chunks = []
    current_chunk = []
    current_length = 0  # running word count of current_chunk (avoids re-joining every time)

    for sentence in sentence_splitter(text):
        sentence_length = len(sentence.split())
        # Flush first, so the finished chunk stays within the limit
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Split the loaded document into sentence-aligned chunks for embedding
text_chunks = chunk_text(text=text_data)


In [None]:
from fastembed import TextEmbedding


class _ChunkEmbedder:
    """Small adapter exposing `embed_text` / `embed_texts` on top of fastembed.

    fastembed's public entry point is `TextEmbedding`, whose `embed` method
    lazily yields one NumPy vector per input document. The rest of this
    notebook calls `model.embed_text(...)` and `model.embed_texts(...)`, so the
    adapter provides those two methods.
    """

    def __init__(self, model_name):
        self._backend = TextEmbedding(model_name=model_name)

    def embed_texts(self, texts):
        """Return a list with one embedding vector per entry of `texts`, in order."""
        # Materialise the generator so the result can be indexed and re-used
        return list(self._backend.embed(list(texts)))

    def embed_text(self, text):
        """Return the embedding vector of a single string."""
        return self.embed_texts([text])[0]


# BAAI/bge-base-en-v1.5 produces 768-dimensional vectors, which matches the
# `size=768` the embeddings collection is created with.
model = _ChunkEmbedder(model_name="BAAI/bge-base-en-v1.5")


In [None]:
# Embed the chunks one at a time, keeping the order of `text_chunks`
embeddings = [model.embed_text(chunk) for chunk in text_chunks]


In [None]:
# Batch variant: embeds all chunks in one call and rebinds `embeddings`,
# overwriting any value assigned to it earlier in the notebook.
embeddings = model.embed_texts(text_chunks)

In [None]:
import numpy as np

# Rebinds `embeddings` to a NumPy array; later cells index it by row and pass it
# to scikit-learn (NearestNeighbors, PCA).
embeddings = np.array(embeddings)  # Convert list of embeddings to a NumPy array


Cosine Similarity Test: Similar texts should have a high cosine similarity (close to 1), while dissimilar texts should have a low cosine similarity (close to 0).

In [None]:
from scipy.spatial.distance import cosine

def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two vectors.

    SciPy's `cosine` computes the cosine *distance*; subtracting it from 1
    turns it back into a similarity.
    """
    cosine_distance = cosine(embedding1, embedding2)
    return 1 - cosine_distance

# Compare the embeddings of the first two chunks
first_embedding, second_embedding = embeddings[0], embeddings[1]
similarity = cosine_similarity(first_embedding, second_embedding)
print(f"Cosine Similarity between first two chunks: {similarity}")


Nearest Neighbor Search: To further test the embeddings, a nearest-neighbor search within the embedding space shows whether similar text chunks are clustered together.

In [None]:
from sklearn.neighbors import NearestNeighbors

# Fit a cosine-distance neighbour index over all chunk embeddings (2 neighbours per query)
nn_model = NearestNeighbors(n_neighbors=2, metric='cosine').fit(embeddings)

# Query with the first chunk's embedding (a batch of one row)
distances, indices = nn_model.kneighbors(embeddings[:1])

# Report column 1 of the result; column 0 is presumably the query chunk itself
closest_index = indices[0][1]
closest_distance = distances[0][1]
print(f"Closest neighbor to the first chunk is chunk at index: {closest_index} with distance: {closest_distance}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the embeddings onto two principal components so they can be drawn
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)
xs = reduced_embeddings[:, 0]
ys = reduced_embeddings[:, 1]

# One dot per chunk
plt.scatter(xs, ys)

# Tag each dot with the index of its chunk
for i in range(len(text_chunks)):
    plt.annotate(str(i), (xs[i], ys[i]))

plt.title('PCA of Text Chunk Embeddings')
plt.show()

In [None]:
# Show the first two chunks next to the similarity of their embeddings
chunk1, chunk2 = text_chunks[0], text_chunks[1]
similarity = cosine_similarity(embeddings[0], embeddings[1])

print(f"Chunk 1: {chunk1}\n")
print(f"Chunk 2: {chunk2}\n")
print(f"Cosine Similarity: {similarity}")

Qdrant Vector Database

In [None]:
# Safety net: create the embeddings collection if it does not exist yet.
# The config is spelled out here because no `vectors_config` variable is
# defined anywhere in the notebook.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        # Same size and distance as where the collection is first created
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

In [None]:
from qdrant_client.http.models import HnswConfigDiff

# Tune the HNSW index of the embeddings collection. `update_collection` takes
# the index settings through `hnsw_config`, not a generic `config` dict.
client.update_collection(
    collection_name=collection_name,
    hnsw_config=HnswConfigDiff(
        ef_construct=200,  # Construction time/space trade-off
        m=16,  # Higher value gives more accurate results but uses more memory
    ),
)

In [None]:
# Fetch the collection's description from Qdrant and print it to confirm the set-up
collection_info = client.get_collection(collection_name=collection_name)
print(f"Collection Info: {collection_info}")


In [None]:
import numpy as np

# Random 768-dimensional vector for a round-trip smoke test; seeded so the
# cell gives the same vector on every run
test_vector = np.random.default_rng(42).random(768).tolist()

# Insert the vector into the collection as a typed point, the same way the
# demo collection is populated
client.upsert(
    collection_name=collection_name,
    points=[
        PointStruct(id=1, vector=test_vector)  # Unique ID for this vector
    ],
)

# Read it back; stored vectors are only returned when explicitly requested
retrieved_vector = client.retrieve(
    collection_name=collection_name,
    ids=[1],
    with_vectors=True,
)
print(f"Retrieved Vector: {retrieved_vector}")


Load the Generated Vector Embeddings into the Qdrant Database

In [None]:
# One typed point per chunk: the chunk's position is its ID, its embedding is
# the vector, and the chunk text travels along as payload metadata.
points = [
    PointStruct(
        id=idx,  # Unique ID for each vector
        vector=np.asarray(embedding).tolist(),  # plain list of floats rather than a NumPy row
        payload={"text": text_chunk},  # metadata
    )
    for idx, (embedding, text_chunk) in enumerate(zip(embeddings, text_chunks))
]


In [None]:
# Write all prepared points into the embeddings collection with a single upsert call
client.upsert(collection_name=collection_name, points=points)

In [None]:
# Spot-check the upload by fetching the first three points by ID
first_three_ids = [0, 1, 2]
retrieved_vectors = client.retrieve(collection_name=collection_name, ids=first_three_ids)
print(f"Retrieved Vectors: {retrieved_vectors}")

Test the Retrieval Process

In [None]:
# Use the first chunk's own embedding as the query
query_vector = embeddings[0]

# Perform a search in the collection
search_results = client.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=3,  # Number of closest neighbors to return
)

# Output the search results. Hits are objects, not dicts, so fields are read
# as attributes; only `payload` itself is a dict.
print("Search Results:")
for result in search_results:
    print(f"ID: {result.id}, Score: {result.score}, Text: {result.payload['text']}")


In [None]:
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring short
# intervals; time.time() follows the wall clock and can be coarse or jump.
start_time = time.perf_counter()

# Perform the search
search_results = client.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=3,
)

end_time = time.perf_counter()
retrieval_time = end_time - start_time

print(f"Retrieval Time: {retrieval_time:.4f} seconds")

In [None]:
# Compare the retrieved text with the query text
query_text = text_chunks[0]
print(f"Query Text: {query_text}\n")

for result in search_results:
    # Hits are objects: `payload` is an attribute holding the metadata dict
    retrieved_text = result.payload['text']
    print(f"Retrieved Text: {retrieved_text}\n")


In [None]:
# Test with the first 5 vectors, or with all of them when fewer than 5 exist
for i in range(min(5, len(embeddings))):
    query_vector = embeddings[i]
    search_results = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=3,
    )
    print(f"\nSearch results for query {i}:")
    for result in search_results:
        # Hits are objects, not dicts: read fields as attributes
        print(f"ID: {result.id}, Score: {result.score}, Text: {result.payload['text']}")


In [None]:
from qdrant_client.http.models import SearchParams

# Optimise search by adjusting the 'ef' parameter. Per-query tuning is passed
# through `search_params`; `search` has no `params` argument.
search_results = client.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=3,
    search_params=SearchParams(hnsw_ef=200),  # Higher 'ef' values typically increase accuracy
)