In [10]:
# !pip install huggingface_hub

In [15]:
# !pip install datasets

In [3]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import chromadb

In [12]:
# Checking MPS availability for Apple Silicon GPU acceleration
mps_available = torch.backends.mps.is_built() and torch.backends.mps.is_available()
device = "mps" if mps_available else "cpu"
print("Using device:", device)

Using device: mps


In [17]:
from huggingface_hub import HfApi

dataset_url = "SathvikVeerapaneni7/CineAI_Dataset"
api = HfApi()

In [18]:
# List files in the dataset repository
files = api.list_repo_files(repo_id=dataset_url, repo_type="dataset")
print(f"Files in the Hugging Face dataset '{dataset_url}':")
for f in files:
    print(f)

Files in the Hugging Face dataset 'SathvikVeerapaneni7/CineAI_Dataset':
.gitattributes
README.md
parquet_files/cast_df_clean.parquet
parquet_files/crew_df_clean.parquet
parquet_files/details_df_clean.parquet
parquet_files/recommendations_df_clean.parquet
parquet_files/watch_providers_df_clean.parquet


In [None]:
from datasets import load_dataset

# sample Loading a single Parquet file
dataset = load_dataset(
    "parquet",
    data_files={"train": "https://huggingface.co/datasets/SathvikVeerapaneni7/CineAI_Dataset/resolve/main/parquet_files/details_df_clean.parquet"},
    split="train"
)

# Convert to DataFrame for processing if needed
import pandas as pd
details_df = pd.DataFrame(dataset)
print("Loaded details_df from HF:", details_df.shape)
details_df.head()

details_df_clean.parquet:   0%|          | 0.00/51.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loaded details_df from HF: (230586, 8)


Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


# Semantic Retrieval Pipeline Overview

Our goal is to build a Retrieval-Augmented Generation (RAG) component that allows quick, semantic-level access to movie data.  
We expect the outcome to be a system that, given a natural language query, retrieves relevant movie entries with thematic and narrative similarity.  


Embed textual fields like `title`, `overview`, and optionally `genres` because they contain rich narrative and thematic information.  

Not embedding purely numeric fields like `runtime` or `popularity` since they do not add semantic context. 

The final outcome should be a searchable index enabling users to discover related movies by concept rather than keyword.  
This improves upon keyword search by returning results semantically aligned with the user’s intent.  


Limitations include potential model bias if overviews are sparse or genres are missing.  
High-level concepts are captured, but rare details may be overlooked.  


Future improvements can involve more diverse embeddings or integrating metadata filters.  


Overall, this pipeline lays the foundation for robust semantic retrieval within the CineAI ecosystem.

In [25]:
details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/details_df_clean.parquet"}
details_dataset = load_dataset("parquet", data_files=details_data_files, split='train')
details_df = pd.DataFrame(details_dataset)


In [26]:
details_df.shape

(230586, 8)

In [27]:
details_df.head(5)

Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


Embedding

In [47]:
# Prepare text for embedding, including genres_str
if 'genres' in details_df.columns:
    details_df['genres_str'] = details_df['genres'].apply(lambda g: " ".join(g) if isinstance(g, list) else "")
    fields_to_embed = ['title', 'overview', 'genres_str']
else:
    fields_to_embed = ['title', 'overview']

details_df['text_for_embedding'] = details_df[fields_to_embed].fillna('').agg(' '.join, axis=1)

In [62]:
# Adjust display settings to show more rows and columns
pd.set_option('display.max_rows', 10)  # Set to desired number of rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 10)  # Adjust width to fit content

In [69]:
for i in range(5):
    print(details_df['text_for_embedding'][i])


Gladiator In the year 180, the death of Emperor Marcus Aurelius throws the Roman Empire into chaos. Maximus is one of the Roman army's most capable and trusted generals, as well as a key advisor to the emperor. As Marcus' devious son Commodus ascends to the throne, Maximus is sentenced to execution. He escapes but is captured by slave traders. Renamed "Spaniard" and forced to become a gladiator, Maximus must battle to the death against other men for the amusement of paying audiences.
How the Grinch Stole Christmas The Grinch decides to rob Whoville of Christmas - but a dash of kindness from little Cindy Lou Who and her family may be enough to melt his heart...
Chicken Run The creators of Wallace & Gromit bring you an exciting and original story about a group of chickens determined to fly the coop–even if they can’t fly! It’s hardly poultry in motion when Rocky attempts to teach Ginger and her feathered friends to fly…but, with teamwork, determination and a little bit o’ cluck, the fear

In [29]:
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device=device)
print(f"Model {model_name} initialized on {device}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model all-MiniLM-L6-v2 initialized on mps


In [30]:
texts = details_df['text_for_embedding'].tolist()
embeddings = model.encode(texts, convert_to_numpy=True)
print("Generated embeddings shape:", embeddings.shape)

Generated embeddings shape: (230586, 384)


In [46]:
# Re-create the collection if deleted
collection = client.create_collection(collection_name)
print(f"Collection '{collection_name}' has been created.")

Collection 'movies_semantic' has been created.


In [32]:
collection = client.create_collection(collection_name)
print("Created collection:", collection_name)

Created collection: movies_semantic


In [34]:
movie_id_str = str(doc_id) if doc_id is not None else ""
title_str = str(row['title']) if pd.notnull(row['title']) else ""

metadata = {"movie_id": movie_id_str, "title": title_str}

In [35]:
metadata = {}
metadata['movie_id'] = str(doc_id) if doc_id else ""
metadata['title'] = str(row['title']) if pd.notnull(row['title']) else ""

In [36]:
for i, row in details_df.iterrows():
    doc_id = str(int(row['movie_id'])) if pd.notnull(row['movie_id']) else str(i)
    text = row['text_for_embedding']
    embedding = embeddings[i]

    # Ensure metadata values are strings and not None
    movie_id_str = str(doc_id)
    title_str = str(row['title']) if pd.notnull(row['title']) else ""

    metadata = {"movie_id": movie_id_str, "title": title_str}

    collection.add(documents=[text], embeddings=[embedding], metadatas=[metadata], ids=[doc_id])

Insert of existing embedding ID: 98
Add of existing embedding ID: 98
Insert of existing embedding ID: 8871
Add of existing embedding ID: 8871
Insert of existing embedding ID: 7443
Add of existing embedding ID: 7443
Insert of existing embedding ID: 9532
Add of existing embedding ID: 9532
Insert of existing embedding ID: 77
Add of existing embedding ID: 77
Insert of existing embedding ID: 8358
Add of existing embedding ID: 8358
Insert of existing embedding ID: 22705
Add of existing embedding ID: 22705
Insert of existing embedding ID: 10867
Add of existing embedding ID: 10867
Insert of existing embedding ID: 4247
Add of existing embedding ID: 4247
Insert of existing embedding ID: 11688
Add of existing embedding ID: 11688
Insert of existing embedding ID: 10567
Add of existing embedding ID: 10567
Insert of existing embedding ID: 2123
Add of existing embedding ID: 2123
Insert of existing embedding ID: 2024
Add of existing embedding ID: 2024
Insert of existing embedding ID: 1359
Add of existi

In [38]:
def semantic_search(query, top_k=5):
    query_embedding = model.encode([query])[0]
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results

In [42]:
test_query = "films packed with action and comedy"
search_results = semantic_search(test_query, top_k=3)

print(f"Top 3 results for query: '{test_query}'")
for doc, meta in zip(search_results['documents'][0], search_results['metadatas'][0]):
    print("Title:", meta['title'])
    print("Snippet:", doc[:200], "...")
    print("-"*40)

InvalidCollectionException: Collection cb2f0574-7503-4dbe-b99a-b35b9314b165 does not exist.

### To delete exisited embeddgins

In [None]:
# client.delete_collection(name=collection_name)
# print(f"Collection '{collection_name}' has been deleted.")