In [1]:
from datasets import load_dataset

# Load the Parquet file from Hugging Face
dataset = load_dataset(
    "parquet",
    data_files={
        "train": "https://huggingface.co/datasets/SathvikVeerapaneni7/CineAI_Dataset/resolve/main/parquet_files/details_df_clean.parquet"
    },
    split="train"
)

# Convert to a Pandas DataFrame
df_details = dataset.to_pandas()
df_details.head()


Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


In [6]:
#combining title,overview,genres_str into a single text

first_title = df_details.loc[0, "title"]
first_overview = df_details.loc[0, "overview"]
first_genres = df_details.loc[0, "genres_str"]

combined_text = f"{first_title}. {first_overview}. Genres: {first_genres}"
print(combined_text)

Gladiator. In the year 180, the death of Emperor Marcus Aurelius throws the Roman Empire into chaos. Maximus is one of the Roman army's most capable and trusted generals, as well as a key advisor to the emperor. As Marcus' devious son Commodus ascends to the throne, Maximus is sentenced to execution. He escapes but is captured by slave traders. Renamed "Spaniard" and forced to become a gladiator, Maximus must battle to the death against other men for the amusement of paying audiences.. Genres: ['Action' 'Drama' 'Adventure']


In [7]:
df_details.columns

Index(['movie_id', 'title', 'overview', 'release_date', 'runtime',
       'original_language', 'popularity', 'genres_str'],
      dtype='object')

In [10]:
import pandas as pd

def chunk_text(overview: str, chunk_size: int = 50):
    """
    Splits the 'overview' text into chunks of 'chunk_size' words each.
    Returns a list of text chunks.
    """
    words = overview.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk_words = words[i : i + chunk_size]
        chunk_str = " ".join(chunk_words)
        chunks.append(chunk_str)
    return chunks

# Example DataFrame: df_details with 'movie_id', 'title', 'overview'
# We'll simulate chunking the first 5 rows for demonstration

# Suppose df_details has columns: 
#  - "movie_id" (int or string identifier)
#  - "title" (string)
#  - "overview" (string)

df_test = df_details.head(5).copy()  # just first 5 for example

for i in range(len(df_test)):
    movie_id = df_test.loc[i, "movie_id"]
    title = df_test.loc[i, "title"]
    overview = df_test.loc[i, "overview"]
    
    # Chunk the text (50 words per chunk by default)
    chunks = chunk_text(overview, chunk_size=50)
    
    print(f"Movie: {title} (ID: {movie_id})")
    print(f"Number of chunks: {len(chunks)}")
    for idx, chunk in enumerate(chunks):
        print(f"  Chunk {idx}: {chunk}")
    print("-" * 60)


Movie: Gladiator (ID: 98.0)
Number of chunks: 2
  Chunk 0: In the year 180, the death of Emperor Marcus Aurelius throws the Roman Empire into chaos. Maximus is one of the Roman army's most capable and trusted generals, as well as a key advisor to the emperor. As Marcus' devious son Commodus ascends to the throne, Maximus is sentenced to
  Chunk 1: execution. He escapes but is captured by slave traders. Renamed "Spaniard" and forced to become a gladiator, Maximus must battle to the death against other men for the amusement of paying audiences.
------------------------------------------------------------
Movie: How the Grinch Stole Christmas (ID: 8871.0)
Number of chunks: 1
  Chunk 0: The Grinch decides to rob Whoville of Christmas - but a dash of kindness from little Cindy Lou Who and her family may be enough to melt his heart...
------------------------------------------------------------
Movie: Chicken Run (ID: 7443.0)
Number of chunks: 2
  Chunk 0: The creators of Wallace & Gromit br

In [11]:
chunk_rows = []

for i, row in df_test.iterrows():
    movie_id = row["movie_id"]
    title = row["title"]
    overview = row["overview"]
    
    # Get the list of chunks for this movie
    chunks = chunk_text(overview, chunk_size=50)
    
    # Build a list of dicts, one per chunk
    for chunk_index, chunk_str in enumerate(chunks):
        chunk_rows.append({
            "movie_id": movie_id,
            "title": title,
            "chunk_index": chunk_index,
            "chunk_text": chunk_str
        })

# Create a new DataFrame that has separate rows for each chunk
df_chunked = pd.DataFrame(chunk_rows)
print(df_chunked.head(10))  # see first 10 rows
print(f"Total chunked rows: {len(df_chunked)}")


   movie_id                           title  chunk_index  \
0      98.0                       Gladiator            0   
1      98.0                       Gladiator            1   
2    8871.0  How the Grinch Stole Christmas            0   
3    7443.0                     Chicken Run            0   
4    7443.0                     Chicken Run            1   
5    9532.0               Final Destination            0   
6      77.0                         Memento            0   
7      77.0                         Memento            1   

                                          chunk_text  
0  In the year 180, the death of Emperor Marcus A...  
1  execution. He escapes but is captured by slave...  
2  The Grinch decides to rob Whoville of Christma...  
3  The creators of Wallace & Gromit bring you an ...  
4  bit o’ cluck, the fearless flock plots one las...  
5  After a teenager has a terrifying vision of hi...  
6  Leonard Shelby is tracking down the man who ra...  
7  remember what ha

In [14]:
df_chunked['chunk_text'].head(5)

0    In the year 180, the death of Emperor Marcus A...
1    execution. He escapes but is captured by slave...
2    The Grinch decides to rob Whoville of Christma...
3    The creators of Wallace & Gromit bring you an ...
4    bit o’ cluck, the fearless flock plots one las...
Name: chunk_text, dtype: object

In [17]:
import re
import pandas as pd

def clean_genres(raw_genres: str) -> str:
    """
    Removes brackets, quotes, and splits on whitespace to produce
    a comma-separated string of genres.
    Example: "['Animation' 'Comedy' 'Family']" -> "Animation, Comedy, Family"
    """
    # 1) Strip leading/trailing square brackets [ ], and remove ' or "
    cleaned = raw_genres.strip("[]").replace("'", "").replace('"', "")
    # 2) Split on any whitespace -> list of genres
    genres_list = re.split(r"\s+", cleaned.strip())
    # 3) Join them with ", " -> "Animation, Comedy, Family"
    genres_str = ", ".join(genres_list)
    return genres_str

def build_combined_text(row):
    """
    Builds a descriptive string using title, overview, and cleaned genres.
    Example outcome:
    "Movie name is <title>. Its description is <overview>. Movie genres are <genre_1, genre_2, ...>"
    """
    title = row["title"]
    overview = row["overview"]
    genres_str = clean_genres(row["genres_str"])
    
    return (
        f"Movie name is {title}. "
        f"Its description is {overview}. "
        f"Movie genres are {genres_str}"
    )

# Create a new DataFrame (df_combined) with 'movie_id' and 'combined_text'
df_combined = pd.DataFrame()
df_combined["movie_id"] = df_details["movie_id"]
df_combined["combined_text"] = df_details.apply(build_combined_text, axis=1)

# Inspect the first few rows
print(df_combined.head())


   movie_id                                      combined_text
0      98.0  Movie name is Gladiator. Its description is In...
1    8871.0  Movie name is How the Grinch Stole Christmas. ...
2    7443.0  Movie name is Chicken Run. Its description is ...
3    9532.0  Movie name is Final Destination. Its descripti...
4      77.0  Movie name is Memento. Its description is Leon...


In [18]:
df_combined.shape

(230586, 2)

In [19]:
def chunk_text(text, chunk_size=50):
    """
    Splits a text into chunks of 'chunk_size' words each.
    Returns a list of text chunks.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk_words = words[i : i + chunk_size]
        chunk_str = " ".join(chunk_words)
        chunks.append(chunk_str)
    return chunks

In [20]:
import pandas as pd

chunk_rows = []

for i, row in df_combined.iterrows():
    movie_id = row["movie_id"]
    combined = row["combined_text"]

    # Get the list of chunks for this movie
    chunks = chunk_text(combined, chunk_size=50)

    # Build a list of dicts, one per chunk
    for idx, chunk_str in enumerate(chunks):
        chunk_rows.append({
            "movie_id": movie_id,
            "chunk_index": idx,
            "chunk_text": chunk_str
        })

# Create the new DataFrame
df_chunked = pd.DataFrame(chunk_rows)

print(df_chunked.head(10))
print(f"df_chunked shape: {df_chunked.shape}")


   movie_id  chunk_index                                         chunk_text
0      98.0            0  Movie name is Gladiator. Its description is In...
1      98.0            1  to the throne, Maximus is sentenced to executi...
2    8871.0            0  Movie name is How the Grinch Stole Christmas. ...
3    7443.0            0  Movie name is Chicken Run. Its description is ...
4    7443.0            1  to fly…but, with teamwork, determination and a...
5    9532.0            0  Movie name is Final Destination. Its descripti...
6      77.0            0  Movie name is Memento. Its description is Leon...
7      77.0            1  of life before his accident, Leonard cannot re...
8    8358.0            0  Movie name is Cast Away. Its description is Ch...
9    8358.0            1  desolate island. With no way to escape, Chuck ...
df_chunked shape: (386526, 3)


In [22]:
#Qdrant Testing

from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

# Connect to Qdrant
qdrant_client = QdrantClient(url="http://localhost:6333")

# Create or reset a test collection
collection_name = "movie_collection_test"

qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance="Cosine")
)

print(f"Collection '{collection_name}' ready for 384-dim embeddings.")


Collection 'movie_collection_test' ready for 384-dim embeddings.


  qdrant_client.recreate_collection(


Ray Franework

In [23]:
import ray
import torch
from sentence_transformers import SentenceTransformer

In [24]:
# Shut down any existing Ray session
ray.shutdown()

# Start Ray (optionally with a dashboard)
ray.init(include_dashboard=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0,1
Python version:,3.10.4
Ray version:,2.40.0
Dashboard:,http://127.0.0.1:8265


In [25]:
device = torch.device("mps")

In [26]:
@ray.remote
class EmbeddingActor:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Load the SentenceTransformer on MPS
        self.model = SentenceTransformer(model_name, device="mps")

    def embed_texts(self, texts):
        # Return Python list of lists for Qdrant
        embeddings = self.model.encode(
            texts, 
            convert_to_numpy=True,  # -> NumPy array
            show_progress_bar=False
        )
        # Convert to list of floats per row
        return [emb.tolist() for emb in embeddings]

In [30]:
# Rows upsert to Qdrant

import uuid
from qdrant_client.http.models import PointStruct

def upsert_batch(df_subset, embed_actor, qdrant_client, collection_name):
    """
    Embeds df_subset['chunk_text'] via the Ray actor, then upserts each row into Qdrant.
    """
    # 1) Prepare texts
    text_list = df_subset["chunk_text"].tolist()

    # 2) Call the Ray actor to embed
    embedding_ref = embed_actor.embed_texts.remote(text_list)
    embeddings = ray.get(embedding_ref)

    # 3) Build Qdrant points
    points = []
    for i, row in df_subset.iterrows():
        # Instead of "98-0", we use a real UUID
        doc_id = str(uuid.uuid4())

        points.append(PointStruct(
            id=doc_id,                 # Must be a valid int or a UUID string
            vector=embeddings[i],
            payload={
                "movie_id": row["movie_id"],
                "chunk_index": row["chunk_index"],
                "chunk_text": row["chunk_text"]
            }
        ))

    # 4) Upsert into Qdrant
    qdrant_client.upsert(collection_name=collection_name, points=points)
    print(f"Upserted {len(df_subset)} rows into Qdrant collection '{collection_name}'.")


In [31]:
# Create the Ray actor
embed_actor = EmbeddingActor.remote()

# Take first 100 rows
df_test = df_chunked.head(100).copy()

# Upsert the test batch
upsert_batch(
    df_subset=df_test,
    embed_actor=embed_actor,
    qdrant_client=qdrant_client,
    collection_name=collection_name
)

Upserted 100 rows into Qdrant collection 'movie_collection_test'.


In [38]:
# Query Test
# Step 1: If using Ray actor, embed the query
test_query = "Man with super powers"
query_embedding_ref = embed_actor.embed_texts.remote([test_query])
query_embedding = ray.get(query_embedding_ref)[0]

# Step 2: Qdrant search
results = qdrant_client.search(
    collection_name="movie_collection_test",
    query_vector=query_embedding,
    limit=2
)

# Step 3: Display results
for i, r in enumerate(results):
    print(f"Rank {i+1}, Score: {r.score}")
    print(f"  ID: {r.id}")
    print(f"  Payload: {r.payload}")
    print()


Rank 1, Score: 0.3628593
  ID: 31229eb9-1ea9-4704-b3a2-52a61f80c568
  Payload: {'movie_id': 9383.0, 'chunk_index': 0, 'chunk_text': 'Movie name is Hollow Man. Its description is Cocky researcher Sebastian Caine is working on a project to make living creatures invisible. Determined to achieve the ultimate breakthrough, Caine pushes his team to move to the next phase — using himself as the subject. The test is a success, but'}

Rank 2, Score: 0.29293725
  ID: 26b468a9-f185-47f7-8c1c-9486ad6b14ae
  Payload: {'movie_id': 9383.0, 'chunk_index': 1, 'chunk_text': "when the process can't be reversed and Caine seems doomed to future without flesh, he starts to turn increasingly dangerous.. Movie genres are Action, Science, Fiction, Thriller"}



In [40]:
from collections import defaultdict

# 2) Example Query
####################################################################################

# Suppose we have some data upserted in Qdrant already (100 rows or more).
# Let's do a simple query:
query_text = "Movies with more action"

# Embed the query using the actor
query_embedding_ref = embed_actor.embed_texts.remote([query_text])
query_embedding = ray.get(query_embedding_ref)[0]  # single vector


# 3) Search Qdrant: chunk-level results
####################################################################################

# We retrieve top 50 chunk hits. Qdrant will return chunk-level results.
search_results = qdrant_client.search(
    collection_name=collection_name,
    query_vector=query_embedding,
    limit=50  # or some number
)


# Step A: Group all chunk hits by their movie_id
aggregated = defaultdict(list)

for r in search_results:
    # Each r is a ScoredPoint
    # We'll assume you stored "movie_id" in the payload
    movie_id = r.payload["movie_id"]
    aggregated[movie_id].append(r)

# Step B: For each movie, pick the chunk with the highest score
final_movie_results = []
for movie_id, chunk_list in aggregated.items():
    # Sort chunk_list in descending order by score
    chunk_list.sort(key=lambda x: x.score, reverse=True)
    best_chunk = chunk_list[0]  # the single best chunk
    final_movie_results.append(best_chunk)

# Step C: Sort the final list of best-chunk-per-movie by score again
final_movie_results.sort(key=lambda x: x.score, reverse=True)

# 5) Display Aggregated Results
####################################################################################

for i, result in enumerate(final_movie_results[:10]):
    # top 10 movie-level results
    movie_id = result.payload["movie_id"]
    chunk_text = result.payload["chunk_text"]
    score = result.score

    print(f"Rank {i+1}, Movie ID: {movie_id}, Score: {score:.3f}")
    print(f"  Chunk snippet: {chunk_text[:150]}...")
    print("-" * 60)



Rank 1, Movie ID: 2085.0, Score: 0.668
  Chunk snippet: are Action, Crime, Thriller...
------------------------------------------------------------
Rank 2, Movie ID: 12107.0, Score: 0.552
  Chunk snippet: action. And when Buddy gets loose, things get seriously nutty.. Movie genres are Fantasy, Comedy, Romance, Science, Fiction...
------------------------------------------------------------
Rank 3, Movie ID: 8584.0, Score: 0.522
  Chunk snippet: to spring the princess from her imprisonment.. Movie genres are Adventure, Action, Comedy, Western...
------------------------------------------------------------
Rank 4, Movie ID: 1907.0, Score: 0.505
  Chunk snippet: Romance, Thriller...
------------------------------------------------------------
Rank 5, Movie ID: 9532.0, Score: 0.496
  Chunk snippet: Movie name is Final Destination. Its description is After a teenager has a terrifying vision of him and his friends dying in a plane crash, he prevent...
----------------------------------------

Qdrant

In [24]:
# from qdrant_client import QdrantClient
# from qdrant_client.http.models import VectorParams, Distance

# def setup_collection(
#     collection_name: str = "movie_collection_semantic_retrieval",
#     vector_size: int = 768,  # match your embedding dimension
#     distance: str = "Cosine"
# ):
#     """
#     Creates or recreates a Qdrant collection with the specified size and distance metric.
#     """
#     client = QdrantClient(url="http://localhost:6333")  # Qdrant is in Docker
#     client.recreate_collection(
#         collection_name=collection_name,
#         vectors_config=VectorParams(size=vector_size, distance=distance)
#     )
#     print(f"Collection '{collection_name}' created/reset.")
#     return client


Main

In [None]:
# if __name__ == "__main__":
#     import pandas as pd
    
#     # Suppose your DataFrame is already in memory
#     # df_chunked has columns: 'movie_id', 'chunk_index', 'chunk_text'
    
#     # Step A: Create or reset the collection
#     from qdrant_client.http.models import VectorParams, Distance
#     qdrant_client = QdrantClient(url="http://localhost:6333")

#     # For sentence-transformers/all-mpnet-base-v2 => size=768
#     qdrant_client.recreate_collection(
#         collection_name="movie_collection_semantic_retrieval",
#         vectors_config=VectorParams(size=768, distance="Cosine")
#     )

#     # Step B: Upsert in batches (with Ray embedding)
#     batch_upsert(df_chunked, qdrant_client, "movie_collection_semantic_retrieval", batch_size=1000)
