### Qdrant Docker container with Python client

In [None]:
from qdrant_client import QdrantClient

# Connect to the Qdrant instance listening on localhost:6333
client = QdrantClient(host="localhost", port=6333)

# (Re)create the collection that will hold the embeddings.
# NOTE(review): recreate_collection presumably drops an existing collection
# with the same name first, so re-running this cell would discard any stored
# points — confirm against the qdrant-client documentation.
collection_name = "semantic_collection"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config={
        "size": 768,  # Example embedding dimension; must equal the embedding model's output size
        "distance": "Cosine"
    }
)
print(f"Collection '{collection_name}' is ready!")


### Loading the details_df Parquet file

In [None]:
from datasets import load_dataset

# Load the Parquet file from Hugging Face.
# The file is registered under the "train" key and that same split is
# requested below.
dataset = load_dataset(
    "parquet",
    data_files={
        "train": "https://huggingface.co/datasets/SathvikVeerapaneni7/CineAI_Dataset/resolve/main/parquet_files/details_df_clean.parquet"
    },
    split="train"
)

# Convert to a Pandas DataFrame and preview the first rows
df_details = dataset.to_pandas()
df_details.head()


In [None]:
# List the available columns to decide which ones carry free text
df_details.columns

### The heavy text columns are: title, overview, genres_str

In [None]:
# Combine title, overview and genres_str of the first row (label 0) into a
# single text, as a quick look at what the raw fields contain.

first_title = df_details.loc[0, "title"]
first_overview = df_details.loc[0, "overview"]
first_genres = df_details.loc[0, "genres_str"]

# genres_str is inserted as-is here (no bracket/quote clean-up yet)
combined_text = f"{first_title}. {first_overview}. Genres: {first_genres}"
print(combined_text)

### removing superfluous characters from text columns

In [None]:
import re

# Preview the cleaned, combined text for the rows labelled 0..4.
for i in range(5):
    title = df_details.loc[i, "title"]
    overview = df_details.loc[i, "overview"]
    genres_raw = df_details.loc[i, "genres_str"]

    # 1) Remove surrounding brackets [ ]
    cleaned = genres_raw.strip("[]")
    # 2) Remove any single or double quotes
    cleaned = cleaned.replace("'", "").replace('"', "")
    # 3) Split on whitespace to separate each genre
    # NOTE(review): a genre made of several words would be split into
    # separate entries here — confirm whether the data contains such genres.
    genres_list = re.split(r"\s+", cleaned.strip())

    # Join them with commas and a space
    genres_str = ", ".join(genres_list)

    # Three sentences: name, description, genres
    combined_text = (
        f"Movie name is {title}. "
        f"Its description is {overview}. "
        f"Movie genres are {genres_str}"
    )

    print(f"Sample {i}:\n{combined_text}\n")

### Chunking Overview text

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# 1) Define a function to chunk text by words
def chunk_text(overview: str, chunk_size: int = 30):
    """
    Break 'overview' into consecutive pieces of at most 'chunk_size'
    whitespace-separated words.
    Returns the pieces as a list of strings (empty list for empty text).
    """
    tokens = overview.split()
    return [
        " ".join(tokens[start : start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

# 2) Define a function to embed a list of text chunks
def embed_chunks(chunks, model):
    """
    Encode every entry of 'chunks' with the given SentenceTransformer 'model'.
    Returns whatever 'model.encode' produces for the list (one vector per chunk).
    """
    return model.encode(chunks)

# -----------------------------
# Example usage for the first 5 rows
# Assuming df_details is your DataFrame with 'overview' column
# -----------------------------

# Load a small SentenceTransformer model (for demonstration);
# run on Apple's MPS backend when PyTorch reports it as available, else CPU
model = SentenceTransformer("all-MiniLM-L6-v2", 
                            device="mps" if torch.backends.mps.is_available() else "cpu")

print("Simulating chunking & embedding for the first 5 movies...\n")

for i in range(5):
    title = df_details.loc[i, "title"]
    overview = df_details.loc[i, "overview"]

    # 1) Chunk the text (50 words per chunk, overriding the function default)
    text_chunks = chunk_text(overview, chunk_size=50)

    # 2) Embed the chunks (computed for demonstration; not printed below)
    chunk_embeddings = embed_chunks(text_chunks, model)

    print(f"Movie: {title}")
    print(f"Number of chunks: {len(text_chunks)}")

    # Show the chunk texts
    for idx, chunk in enumerate(text_chunks):
        print(f"  Chunk {idx}: {chunk}")  # prints the full chunk, untruncated
    print("-" * 60)


## New DataFrame: combined_text

In [None]:
import re
import pandas as pd

def clean_genres(raw_genres: str) -> str:
    """
    Converts a bracketed genre listing into a comma-separated string.

    Quoted entries are extracted whole, so multi-word genres stay together:
      "['Animation' 'Comedy' 'Family']" -> "Animation, Comedy, Family"
      "['Science Fiction' 'Drama']"     -> "Science Fiction, Drama"

    If the input contains no quoted entries, it falls back to removing
    brackets and stray quotes and splitting on whitespace.

    :param raw_genres: Raw genre string, e.g. "['Animation' 'Comedy']"
    :return: Genres joined with ", " (empty string when there are none)
    """
    # Each match is a (single_quoted, double_quoted) pair; exactly one is set
    quoted = re.findall(r"'([^']*)'|\"([^\"]*)\"", raw_genres)
    if quoted:
        genres_list = [single or double for single, double in quoted]
    else:
        # No complete quoted entries: split the bare text on whitespace
        cleaned = raw_genres.strip().strip("[]").replace("'", "").replace('"', "")
        genres_list = cleaned.split()

    # Drop empty entries so they never produce dangling commas
    return ", ".join(g.strip() for g in genres_list if g.strip())

def build_combined_text(row):
    """
    Composes the descriptive text for one movie row from its 'title',
    'overview' and cleaned 'genres_str' fields.
    """
    name = row["title"]
    synopsis = row["overview"]
    genres = clean_genres(row["genres_str"])

    # Three sentences, concatenated without any extra separator
    sentences = (
        f"Movie name is {name}. ",
        f"Its description is {synopsis}. ",
        f"Movie genres are {genres}",
    )
    return "".join(sentences)

# Create a new DataFrame (df_combined) with 'movie_id' and 'combined_text'.
# 'combined_text' is produced row by row (axis=1) by build_combined_text.
df_combined = pd.DataFrame()
df_combined["movie_id"] = df_details["movie_id"]
df_combined["combined_text"] = df_details.apply(build_combined_text, axis=1)

# Inspect the first few rows
print(df_combined.head())


In [None]:
# (rows, columns) of the combined-text frame
df_combined.shape

### Chunking approach, combined_df

In [None]:
import pandas as pd

def chunk_text(text: str, chunk_size: int = 50):
    """
    Splits 'text' into segments of at most 'chunk_size' whitespace-separated
    words each.

    :param text: The text to split
    :param chunk_size: Maximum number of words per chunk (must be positive)
    :return: A list of text chunks (empty list for empty text)
    :raises ValueError: If 'chunk_size' is zero or negative
    """
    # A negative step would make range() empty and silently drop all text,
    # so reject non-positive sizes explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    words = text.split()
    return [
        " ".join(words[start : start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Create a new list of rows, each with (movie_id, chunk_index, chunk_text).
# One output row is produced per chunk, so a movie with a long combined text
# contributes several rows sharing the same movie_id.
all_chunked_rows = []
for index, row in df_combined.iterrows():
    movie_id = row["movie_id"]
    combined_text = row["combined_text"]

    # Split into chunks (e.g., 50 words each)
    chunks = chunk_text(combined_text, chunk_size=50)

    # chunk_index restarts at 0 for every movie
    for chunk_idx, chunk_str in enumerate(chunks):
        all_chunked_rows.append({
            "movie_id": movie_id,
            "chunk_index": chunk_idx,
            "chunk_text": chunk_str
        })

# Build a new DataFrame from these chunked rows
df_chunked = pd.DataFrame(all_chunked_rows, columns=["movie_id", "chunk_index", "chunk_text"])

print(df_chunked.head(10))


In [None]:
# (rows, columns) of the chunk-level frame; rows = total number of chunks
df_chunked.shape

### Embedding and Storing

In [11]:
import ray
import torch
import numpy as np
from sentence_transformers import SentenceTransformer

In [12]:
# Shut down any existing Ray session so the init below starts from scratch
ray.shutdown()

# Start Ray with the dashboard enabled
ray.init(include_dashboard=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0,1
Python version:,3.10.4
Ray version:,2.40.0
Dashboard:,http://127.0.0.1:8265


In [None]:
import ray
import torch
from transformers import AutoTokenizer, AutoModel

# 1) Initialize Ray only when no session is active in this process.
# An earlier cell already calls ray.init(); calling it again without this
# guard fails because Ray refuses to initialize twice.
if not ray.is_initialized():
    ray.init(include_dashboard=True)  # or simply ray.init()

# 2) Choose a model that matches your Qdrant vector_size (768 or 1536, etc.)
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Use Apple's MPS backend when PyTorch reports it as available, else the CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
# Tokenizer and model are loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()  # not strictly required, but good practice

@ray.remote
def embed_text(text: str):
    """
    Ray task: tokenizes 'text' (truncated to 512 tokens), runs it through the
    module-level 'model' on 'device', and returns the token-averaged last
    hidden state as a Python list of floats.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradient tracking needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Drop the leading batch axis, then average over the sequence axis
    pooled = hidden_states.squeeze(0).mean(dim=0)

    # Plain list of floats (Qdrant-friendly)
    return pooled.cpu().tolist()


In [None]:
# Sanity-check the chunk-level frame before embedding: size, sample rows, columns
print("df_chunked shape:", df_chunked.shape)
print(df_chunked.head())
print(df_chunked.columns)

print("Step 3: Inspected df_chunked.")


In [None]:
def chunk_indices(total_len, chunk_size):
    """
    Yield consecutive lists of positions covering 0 .. total_len - 1,
    each list holding at most 'chunk_size' positions.
    """
    for start in range(0, total_len, chunk_size):
        stop = min(start + chunk_size, total_len)
        yield list(range(start, stop))

def chunk_data(data, chunk_size):
    """
    Yield consecutive slices of 'data', each at most 'chunk_size' items long.
    """
    starts = range(0, len(data), chunk_size)
    yield from (data[begin : begin + chunk_size] for begin in starts)

# Number of chunks sent to the embedder per call
batch_size = 5000

# Index batches and text batches are cut with the same batch_size, so batch k
# of one lines up with batch k of the other.
all_idx_batches_list = list(chunk_indices(len(df_chunked), batch_size))
text_list = df_chunked["chunk_text"].tolist()
text_batches_list = list(chunk_data(text_list, batch_size))

# Both counts should be equal
print("Number of index batches:", len(all_idx_batches_list))
print("Number of text batches:", len(text_batches_list))


In [35]:
import torch
from sentence_transformers import SentenceTransformer

@ray.remote
class EmbedActor:
    """Ray actor that loads one SentenceTransformer and encodes text batches with it."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        # Prefer Apple's MPS backend when PyTorch reports it, otherwise the CPU
        if torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"
        print(f"Loading model on {device}")
        self.model = SentenceTransformer(model_name, device=device)

    def embed_texts(self, texts):
        """Encode a list of strings and return the embeddings as a NumPy array."""
        encoded = self.model.encode(texts, convert_to_tensor=True)
        # Move to CPU before converting so the result is a plain NumPy array
        return encoded.cpu().numpy()


In [None]:
# Spin up the one actor that owns the embedding model
embed_actor = EmbedActor.remote()

# Queue one embedding call per batch in 'text_batches_list';
# each call returns a future immediately instead of the embeddings
futures = [embed_actor.embed_texts.remote(batch) for batch in text_batches_list]

print("Number of futures (embedding tasks):", len(futures))


In [None]:
# Block until every embedding task has finished; results are in the same
# order as 'futures'
results = ray.get(futures)
print("Number of embedding batches:", len(results))


# Peek at the array shape of the first three batches
for i, emb_array in enumerate(results[:3]):
    print(f"Batch {i} shape:", emb_array.shape)


In [None]:
# Connect to Qdrant
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "semantic_retrieval_basic"
dim = 384  # Make sure this matches your actual embedding size

# NOTE(review): recreate_collection presumably drops an existing collection of
# this name, so re-running this cell after an upsert would discard the stored
# points — confirm against the qdrant-client documentation.
client.recreate_collection(
    collection_name=collection_name,
    vectors_config={"size": dim, "distance": "Cosine"}
)

# TODO: insert code that pairs 'all_idx_batches_list' with 'results'
# to build points and call client.upsert(...)


In [41]:
from qdrant_client import QdrantClient

def test_search_and_retrieve(
    query_text, model, client, limit=5, collection_name="semantic_retrieval_basic"
):
    """
    Embeds 'query_text' using the provided 'model', then searches a Qdrant
    collection and prints the top 'limit' results with their scores and
    payload fields.

    :param query_text: The natural-language query to search for
    :param model: Object whose encode(list_of_str) returns one vector per string
    :param client: Qdrant client exposing search(...)
    :param limit: Maximum number of hits to retrieve
    :param collection_name: Collection to search (defaults to the notebook's
        'semantic_retrieval_basic')
    :return: The list of hits returned by client.search
    """
    snippet_len = 100

    # 1) Embed the query
    query_embedding = model.encode([query_text])[0].tolist()

    # 2) Perform a similarity search in Qdrant
    results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=limit
    )

    # 3) Print results
    print(f"Query: '{query_text}'")
    for i, hit in enumerate(results):
        print(f"\nHit {i}:")
        print("  Score:", hit.score)
        # A hit may carry no payload, or no chunk text; treat both as empty
        payload = hit.payload or {}
        print("  movie_id:", payload.get("movie_id"))
        print("  chunk_index:", payload.get("chunk_index"))
        # Show a snippet of the text; add "..." only when something was cut off
        full_text = payload.get("chunk_text") or ""
        was_truncated = len(full_text) > snippet_len
        print("  chunk_text:", full_text[:snippet_len] + ("..." if was_truncated else ""))

    return results

# Example usage:
# --------------------------------
# client = QdrantClient(host="localhost", port=6333)
# query = "A historical drama about gladiators"
# model = ...  # Your loaded SentenceTransformer
# test_search_and_retrieve(query, model, client, limit=3)


In [43]:
from sentence_transformers import SentenceTransformer
import torch

# Load the query encoder; use Apple's MPS backend when available, else CPU.
# This is the same model name EmbedActor uses by default for the stored chunks.
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)


In [None]:
# Smoke-test the search with one example query, asking for the top 3 hits
query = "A historical drama about gladiators"
test_search_and_retrieve(query, model, client, limit=3)

In [None]:
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
import torch

# 1) Initialize or reuse your Qdrant client
client = QdrantClient(host="localhost", port=6333)

# 2) Load a real SentenceTransformer model
#    (MPS backend when PyTorch reports it as available, otherwise CPU)
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# 3) Define your test function (if you haven't already)
def test_search_and_retrieve(
    query_text, model, client, limit=5, collection_name="semantic_retrieval_basic"
):
    """
    Embeds 'query_text' with 'model', searches a Qdrant collection, prints the
    top 'limit' hits (score, movie_id, chunk_index, text snippet) and returns
    them.

    :param query_text: The natural-language query to search for
    :param model: Object whose encode(list_of_str) returns one vector per string
    :param client: Qdrant client exposing search(...)
    :param limit: Maximum number of hits to retrieve
    :param collection_name: Collection to search (defaults to the notebook's
        'semantic_retrieval_basic')
    :return: The list of hits returned by client.search
    """
    snippet_len = 100

    query_embedding = model.encode([query_text])[0].tolist()
    results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=limit
    )
    print(f"Query: '{query_text}'")
    for i, hit in enumerate(results):
        print(f"\nHit {i}:")
        print("  Score:", hit.score)
        # A hit may carry no payload, or no chunk text; treat both as empty
        payload = hit.payload or {}
        print("  movie_id:", payload.get("movie_id"))
        print("  chunk_index:", payload.get("chunk_index"))
        # Add "..." only when the text was actually cut off
        full_text = payload.get("chunk_text") or ""
        was_truncated = len(full_text) > snippet_len
        print("  chunk_text:", full_text[:snippet_len] + ("..." if was_truncated else ""))

    # Return the hits so callers can inspect them programmatically
    return results

# 4) Run a test query against the collection, asking for the top 3 hits
query = "A historical drama about gladiators"
test_search_and_retrieve(query, model, client, limit=3)


In [None]:
# Fetch and print the collection's metadata, e.g. to check it after an upsert
info = client.get_collection(collection_name="semantic_retrieval_basic")
print(info)


In [None]:
# Upsert the prepared points and wait for the operation to complete.
# NOTE(review): 'points' is only built in a later cell of this notebook, so
# this cell depends on that cell having been executed first.
result = client.upsert(collection_name="semantic_retrieval_basic", points=points, wait=True)
print("Upsert result:", result)


In [None]:
from qdrant_client.http.models import PointStruct

# Build one Qdrant point per chunk of a single batch.
# NOTE(review): 'idx_batch' and 'embedding_batch' are not defined in any
# visible cell — presumably one entry of 'all_idx_batches_list' and the
# matching entry of 'results'; confirm before running.
points = []
for row_in_batch, row_index in enumerate(idx_batch):
    # Convert embedding from NumPy array to a plain Python list
    vector = embedding_batch[row_in_batch].tolist()

    # Cast NumPy integers to Python int
    movie_id = int(df_chunked.loc[row_index, "movie_id"])
    chunk_index = int(df_chunked.loc[row_index, "chunk_index"])
    # If you have floats, do float(...) e.g. float(df_chunked.loc[row_index, "popularity"])

    # Build payload with only Python-native types (int, float, str)
    payload = {
        "movie_id": movie_id,
        "chunk_index": chunk_index,
        "chunk_text": df_chunked.loc[row_index, "chunk_text"]  # remains a Python string
    }

    # Also ensure the Qdrant 'id' is a plain int if you're using row_index or an auto counter
    point_id = int(row_index)

    points.append(
        PointStruct(
            id=point_id,
            vector=vector,
            payload=payload
        )
    )

# Finally, upsert all points at once (a single request for the whole batch)
result = client.upsert(
    collection_name="semantic_retrieval_basic",
    points=points,
    wait=True
)
print("Upsert result:", result)


In [None]:
# Upsert 'points' in slices of 500 instead of one large request
sub_batch_size = 500
for i in range(0, len(points), sub_batch_size):
    mini_batch = points[i : i + sub_batch_size]
    result = client.upsert(
        collection_name="semantic_retrieval_basic",
        points=mini_batch,
        wait=True
    )
    # The printed upper bound is nominal: the last slice may be shorter
    print(f"Upsert sub-batch [{i}:{i+sub_batch_size}] ->", result)


In [50]:
from qdrant_client import QdrantClient

# Recreate the client with a longer request timeout (value in seconds)
client = QdrantClient(
    host="localhost",
    port=6333,
    timeout=600.0  # 10 minutes, for example
)


In [None]:
import numpy as np

# Convert df_chunked["embedding"] into a 2D NumPy array
# Each row i corresponds to df_chunked.iloc[i]
# NOTE(review): no visible cell adds an "embedding" column to df_chunked —
# presumably it is attached elsewhere; confirm before running.
emb_matrix = np.vstack(df_chunked["embedding"].values)

print("Embedding matrix shape:", emb_matrix.shape)
# e.g. (num_chunks, 384)


In [None]:
# vector_db_setup.py

from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

def setup_qdrant_collection(
    collection_name: str = "my_collection",
    vector_size: int = 768,
    distance: str = "Cosine",
    url: str = "http://localhost:6333",
):
    """
    Sets up (or recreates) a Qdrant collection for vector data.
    :param collection_name: Name of the Qdrant collection
    :param vector_size: Dimensionality of the vectors
    :param distance: Distance function ("Cosine", "Euclid", or "Dot")
    :param url: Base URL of the Qdrant server (defaults to the local instance)
    :return: A QdrantClient instance
    """

    # 1. Initialize the Qdrant client against the requested server
    client = QdrantClient(url=url)

    # 2. Create or recreate the collection (use recreate_collection
    #    to drop it if it already exists)
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size,
            distance=distance
        )
    )

    print(f"Collection '{collection_name}' created (or reset) successfully!")
    return client

# Example usage (runs only when executed as the main module)
if __name__ == "__main__":
    # Run the setup function with its defaults to ensure the collection is ready
    qdrant_client = setup_qdrant_collection()


In [None]:
# embedding_generation.py

import torch
from transformers import AutoTokenizer, AutoModel

# Attempt to use the MPS device if available, otherwise fall back to the CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# For demonstration, we'll use a well-known sentence transformer model
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Load the tokenizer and model from the same checkpoint; the model is moved
# to the selected device
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

def generate_embedding(text: str):
    """
    Embeds 'text' with the module-level tokenizer and model on 'device' and
    returns the token-averaged last hidden state as a Python list.
    """
    # Tokenize (truncated to 512 tokens) and move every tensor to the device
    encoded = {
        name: tensor.to(device)
        for name, tensor in tokenizer(
            text, return_tensors="pt", truncation=True, max_length=512
        ).items()
    }

    # Forward pass without gradient tracking
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Drop the batch axis, then mean-pool over the sequence axis
    pooled = hidden_states.squeeze(0).mean(dim=0)

    # Move to CPU and convert via NumPy to a plain list (easier to pass to Qdrant)
    return pooled.cpu().numpy().tolist()

# Quick self-check: embed one sentence and show the vector length and first values
if __name__ == "__main__":
    sample_text = "This is an example sentence to generate an embedding."
    emb = generate_embedding(sample_text)
    print(f"Embedding length: {len(emb)}")
    print(f"Sample embedding snippet: {emb[:5]}")  # print first 5 values


In [None]:
# upsert_embeddings.py

# import uuid
# from qdrant_client import QdrantClient
# from qdrant_client.http.models import PointStruct
# from vector_db_setup import setup_qdrant_collection
# from embedding_generation import generate_embedding

def upsert_document(
    qdrant_client: QdrantClient,
    text: str,
    doc_id: str = None,
    collection_name: str = "my_collection",
):
    """
    Upserts (inserts or updates) a single document's embedding into Qdrant.
    :param qdrant_client: An initialized QdrantClient.
    :param text: The raw text we want to embed and store.
    :param doc_id: Optional unique ID. If None, a UUID will be generated.
        NOTE(review): Qdrant reportedly accepts only unsigned integers or
        UUID strings as point IDs — confirm before passing arbitrary strings.
    :param collection_name: Target collection; must already exist and match
        the embedding size (defaults to "my_collection").
    """
    if doc_id is None:
        # Imported here because the file-level `import uuid` is commented out;
        # without it this branch raised NameError.
        import uuid

        doc_id = str(uuid.uuid4())

    # 1. Generate the embedding
    embedding = generate_embedding(text)

    # 2. Create a PointStruct payload with both the vector and the original text
    point = PointStruct(
        id=doc_id,
        vector=embedding,
        payload={"text": text}  # Add any metadata you want here
    )

    # 3. Upsert into Qdrant
    qdrant_client.upsert(
        collection_name=collection_name,
        points=[point]
    )
    print(f"Document upserted with ID: {doc_id}")

if __name__ == "__main__":
    # Step A: Ensure the collection is set up (this recreates it, so an
    # existing "my_collection" is reset)
    qdrant_client = setup_qdrant_collection(collection_name="my_collection")

    # Step B: Example usage: upsert a single piece of text (ID auto-generated)
    sample_text = "Hello world, this is a sample text for Qdrant embedding."
    upsert_document(qdrant_client, sample_text)
