In [5]:
# !pip install huggingface_hub

In [6]:
# !pip install datasets

In [7]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import chromadb

In [8]:
# Pick the compute device: Apple's Metal (MPS) backend when this PyTorch build
# includes it and the machine exposes it, otherwise the CPU.
mps_available = torch.backends.mps.is_built() and torch.backends.mps.is_available()
if mps_available:
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


In [9]:
from huggingface_hub import HfApi

# Hub client used below to list the files of the dataset repository.
api = HfApi()
# Despite its name this holds the Hub repository id ("<owner>/<dataset>"), not a
# full URL; later cells interpolate it into the download URLs.
dataset_url = "SathvikVeerapaneni7/CineAI_Dataset"

In [10]:
# Ask the Hub which files the dataset repository contains and print one per line.
files = api.list_repo_files(repo_type="dataset", repo_id=dataset_url)
print(f"Files in the Hugging Face dataset '{dataset_url}':")
for repo_file in files:
    print(repo_file)

Files in the Hugging Face dataset 'SathvikVeerapaneni7/CineAI_Dataset':
.gitattributes
README.md
parquet_files/cast_df_clean.parquet
parquet_files/crew_df_clean.parquet
parquet_files/details_df_clean.parquet
parquet_files/recommendations_df_clean.parquet
parquet_files/watch_providers_df_clean.parquet


### Sample loading of details_df from the Hugging Face Parquet files

In [11]:
from datasets import load_dataset

# Sample load of a single Parquet file. The URL is built from `dataset_url`, the
# same way the later loading cells build theirs, so the repository id is written
# in one place only.
dataset = load_dataset(
    "parquet",
    data_files={"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/details_df_clean.parquet"},
    split="train"
)

In [12]:
# Dataset.to_pandas() converts the underlying Arrow table directly, instead of
# having pandas iterate over the dataset one row at a time.
details_df = dataset.to_pandas()
print("Loaded details_df from HF:", details_df.shape)
details_df.head()

Loaded details_df from HF: (230586, 8)


Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


In [13]:
# Confirm the (rows, columns) shape of the loaded details table.
details_df.shape

(230586, 8)

# Semantic Retrieval Pipeline Overview

Our goal is to build a Retrieval-Augmented Generation (RAG) component that allows quick, semantic-level access to movie data.  
We expect the outcome to be a system that, given a natural language query, retrieves relevant movie entries based on narrative similarity.  


Embed textual fields like `title`, `overview`, and optionally `genres` because they contain rich narrative and thematic information.  

We do not embed purely numeric fields such as `runtime` or `popularity`, since they do not add semantic context. 

The final outcome should be a searchable index enabling users to discover related movies by concept rather than keyword.  
This improves upon keyword search by returning results semantically aligned with the user’s intent.  


Limitations include potential model bias if overviews are sparse or genres are missing.  
High-level concepts are captured, but rare details may be overlooked.  


Future improvements can involve more diverse embeddings or integrating metadata filters.  


Overall, this pipeline lays the foundation for robust semantic retrieval within the CineAI ecosystem.

### Loading all the files into their respective DataFrames

In [14]:
# Movie details table. Dataset.to_pandas() converts the Arrow table directly
# rather than letting pandas iterate over the dataset row by row.
details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/details_df_clean.parquet"}
details_dataset = load_dataset("parquet", data_files=details_data_files, split='train')
details_df = details_dataset.to_pandas()


In [15]:
# Cast table. The file mapping gets its own name instead of reusing the one that
# belongs to the details table, and Dataset.to_pandas() converts the Arrow table
# directly rather than letting pandas iterate over ~2.2M rows in Python.
cast_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/cast_df_clean.parquet"}
cast_df_dataset = load_dataset("parquet", data_files=cast_data_files, split='train')
cast_df = cast_df_dataset.to_pandas()

In [16]:


# Crew table. The file mapping gets its own name instead of reusing the one that
# belongs to the details table, and Dataset.to_pandas() converts the Arrow table
# directly rather than letting pandas iterate over ~2.5M rows in Python.
crew_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/crew_df_clean.parquet"}
crew_dataset = load_dataset("parquet", data_files=crew_data_files, split='train')
crew_df = crew_dataset.to_pandas()

In [17]:
# Recommendations table. The file mapping gets its own name instead of reusing
# the one that belongs to the details table, and Dataset.to_pandas() converts the
# Arrow table directly rather than letting pandas iterate over ~2.8M rows in Python.
# NOTE(review): list-valued columns (e.g. recommended_genre_ids, if it is a list
# column) may come back as NumPy arrays rather than Python lists — confirm that
# downstream code does not depend on the cell type.
recommendations_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/recommendations_df_clean.parquet"}
recommendations_dataset = load_dataset("parquet", data_files=recommendations_data_files, split='train')
recommendations_df = recommendations_dataset.to_pandas()

In [18]:
# Watch-providers table. The file mapping gets its own name instead of reusing
# the one that belongs to the details table, and Dataset.to_pandas() converts the
# Arrow table directly rather than letting pandas iterate over it row by row.
watch_providers_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/watch_providers_df_clean.parquet"}
watch_providers_dataset = load_dataset("parquet", data_files=watch_providers_data_files, split='train')
watch_providers_df = watch_providers_dataset.to_pandas()

In [19]:
# All five tables, collected in the order they were loaded, for the summary
# loops that follow.
dataframes = [
    details_df,
    cast_df,
    crew_df,
    recommendations_df,
    watch_providers_df,
]

In [20]:
# Print the (rows, columns) shape of every table.
for frame in dataframes:
    print(frame.shape)

(230586, 8)
(2183780, 5)
(2477256, 6)
(2790336, 5)
(412381, 4)


In [21]:
# Print each table's column index, numbered from 1 and separated by a rule.
for position, frame in enumerate(dataframes, start=1):
    print(f"DataFrame {position} Columns:")
    print(frame.columns)
    print("-" * 50)


DataFrame 1 Columns:
Index(['movie_id', 'title', 'overview', 'release_date', 'runtime',
       'original_language', 'popularity', 'genres_str'],
      dtype='object')
--------------------------------------------------
DataFrame 2 Columns:
Index(['movie_id', 'cast_id', 'name', 'character', 'credit_id'], dtype='object')
--------------------------------------------------
DataFrame 3 Columns:
Index(['movie_id', 'crew_id', 'name', 'department', 'job', 'credit_id'], dtype='object')
--------------------------------------------------
DataFrame 4 Columns:
Index(['movie_id', 'recommended_movie_id', 'recommended_title',
       'recommended_popularity', 'recommended_genre_ids'],
      dtype='object')
--------------------------------------------------
DataFrame 5 Columns:
Index(['movie_id', 'provider_type', 'provider_name', 'link'], dtype='object')
--------------------------------------------------


### Embeddings Handling

In [22]:
from sentence_transformers import SentenceTransformer
import torch


In [23]:
# Preview the first rows of details_df before building the text to embed.
details_df.head()

Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


In [24]:
# Build the text columns used for embedding.
# `text_for_embedding` is only created when the frame does not already carry it:
# title and overview joined by a space, with missing values treated as ''.
if 'text_for_embedding' not in details_df.columns:
    title_text = details_df['title'].fillna('')
    overview_text = details_df['overview'].fillna('')
    details_df['text_for_embedding'] = title_text + ' ' + overview_text

# `final_text` appends the genres string when that column exists; otherwise it
# is simply the same series as `text_for_embedding`.
has_genres = 'genres_str' in details_df.columns
details_df['final_text'] = (
    details_df['text_for_embedding'] + ' ' + details_df['genres_str'].fillna('')
    if has_genres
    else details_df['text_for_embedding']
)


In [25]:
# Choose the device the embedding model will run on: "mps" when PyTorch reports
# the Metal backend as available, otherwise "cpu". The flag is printed first.
import torch

mps_ready = torch.backends.mps.is_available()
print(mps_ready)
device = "mps" if mps_ready else "cpu"

True


### Model Selection

In [26]:
# Load the all-MiniLM-L6-v2 sentence-transformer checkpoint and place it on the
# device chosen earlier ("mps" when available, otherwise "cpu").
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

In [27]:
# Check how many combined text entries there are (one per row of details_df).
details_df['final_text'].shape

(230586,)

In [28]:
# Display details_df to inspect the text_for_embedding and final_text columns.
details_df

Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str,text_for_embedding,final_text
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure'],"Gladiator In the year 180, the death of Empero...","Gladiator In the year 180, the death of Empero..."
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy'],How the Grinch Stole Christmas The Grinch deci...,How the Grinch Stole Christmas The Grinch deci...
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family'],Chicken Run The creators of Wallace & Gromit b...,Chicken Run The creators of Wallace & Gromit b...
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror'],Final Destination After a teenager has a terri...,Final Destination After a teenager has a terri...
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller'],Memento Leonard Shelby is tracking down the ma...,Memento Leonard Shelby is tracking down the ma...
...,...,...,...,...,...,...,...,...,...,...
230581,1183860.0,Precognito,A premonition inducing drug could cost a group...,2023-08-18,78.0,en,1.196,['Science Fiction'],Precognito A premonition inducing drug could c...,Precognito A premonition inducing drug could c...
230582,1141821.0,Giddh (The Scavenger),An old man is compelled to choose an unlikely ...,2023-12-01,25.0,hi,1.196,['Drama'],Giddh (The Scavenger) An old man is compelled ...,Giddh (The Scavenger) An old man is compelled ...
230583,1138889.0,Gulaam Chor,Twelve people who gather to gamble at a house ...,2023-06-11,113.0,gu,1.196,['Drama'],Gulaam Chor Twelve people who gather to gamble...,Gulaam Chor Twelve people who gather to gamble...
230584,1211829.0,Due battiti,,2023-11-29,0.0,it,1.196,['Animation'],Due battiti,Due battiti ['Animation']


In [38]:
# Single-column frame holding only the combined text from final_text.
final_text_df = details_df[['final_text']]

In [None]:
# Display the single-column frame for a quick visual check.
final_text_df

KeyboardInterrupt: 

In [None]:
# Compute the sentence embeddings. `embeddings` is read here and again by the
# ChromaDB ingestion cell, but no cell assigned it, so a fresh "Restart & Run
# All" stopped with a NameError.
# NOTE(review): `final_text` (title + overview + genres) is assumed to be the
# column that was meant to be embedded — confirm against the original run.
embeddings = model.encode(
    details_df['final_text'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True,  # the ingestion cell calls embeddings.tolist()
)
print(type(embeddings))

### ChromaDB, a vector database, to store the embeddings

In [31]:
import chromadb
from chromadb.config import Settings

In [32]:
# Open a client that stores its data on disk under `path`. In Chroma 0.4+,
# chromadb.Client(Settings(persist_directory=...)) does not turn persistence on
# by itself (Settings.is_persistent defaults to False), so the collection would
# only have lived in memory; PersistentClient is the documented way to persist.
# NOTE(review): the path is an absolute location on one machine — consider
# making it configurable.
client = chromadb.PersistentClient(
    path="/Users/HVMS/Desktop/CineAI/CineAI/embeddings_db"
)

In [35]:
# Fetch the "movies" collection, creating it if it does not exist yet. No
# metadata or embedding function is passed, so the collection defaults apply.
collection = client.get_or_create_collection(name="movies")

In [36]:
# Chunking the data
def chunk_data(data, chunk_size):
    """Yield consecutive slices of ``data`` holding at most ``chunk_size`` items.

    Args:
        data: A sequence that supports ``len()`` and slicing (e.g. a list).
        chunk_size: Maximum number of items per chunk; must be positive.

    Yields:
        ``data[start:start + chunk_size]`` for each successive ``start``, in
        order. The last chunk may be shorter; empty ``data`` yields nothing.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every record.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(data), chunk_size):
        yield data[start:start + chunk_size]


In [None]:
# Parallel lists for ChromaDB, one entry per row of details_df.
# NOTE(review): movie_id is displayed as a float (e.g. 98.0), so astype(str)
# produces ids such as "98.0" — confirm this is the id format lookups expect.
ids = details_df['movie_id'].astype(str).tolist()
# NOTE(review): the stored document text is `text_for_embedding` (no genres);
# verify that this matches the text the embeddings were computed from.
docs = details_df['text_for_embedding'].tolist()
embs = embeddings.tolist()

# The ingestion loop pairs these lists with zip(), which silently stops at the
# shortest input. Fail loudly if `embeddings` is stale or was computed from a
# different frame, rather than storing mismatched or truncated records.
if not (len(ids) == len(docs) == len(embs)):
    raise ValueError(
        f"Mismatched record counts: {len(ids)} ids, {len(docs)} docs, {len(embs)} embeddings"
    )

# Maximum number of records sent per collection.add() call.
# NOTE(review): presumably chosen to stay under Chroma's per-call limit — confirm.
max_batch_size = 40000

In [37]:
# Push the records into the collection one batch at a time. The three chunk
# generators advance in lockstep, so every batch carries ids, documents and
# embeddings taken from the same positions.
record_batches = zip(
    chunk_data(ids, max_batch_size),
    chunk_data(docs, max_batch_size),
    chunk_data(embs, max_batch_size),
)
for batch_ids, batch_docs, batch_embs in record_batches:
    collection.add(ids=batch_ids, documents=batch_docs, embeddings=batch_embs)

NameError: name 'ids' is not defined

### Sample testing with random data, to verify logging and embedding storage paths

In [37]:
import logging
import os

# Set up logging configuration: DEBUG level on the root logger, with the
# timestamp and level name in front of every message.
# NOTE(review): basicConfig() does nothing when the root logger already has
# handlers (unless force=True is passed) — confirm it takes effect in this kernel.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


In [None]:
from chromadb import Client, PersistentClient, Settings
import os

# Directory for the sample collection. The original passed the two path parts to
# Settings as a tuple (the os.path.join call was missing), which is not a valid
# persist_directory, and then logged a different, cwd-based path.
persist_dir = os.path.join("/Users/HVMS/Desktop/CineAI/CineAI/src/embeddings", "sample_embeddings_new")

try:
    # Adjust the directory name slightly to avoid conflict.
    # PersistentClient is used so the data is actually written under persist_dir;
    # Client(Settings(persist_directory=...)) alone leaves persistence off in
    # Chroma 0.4+.
    client = PersistentClient(path=persist_dir)
    # Log the directory that was really used.
    logging.debug("Initialized new ChromaDB client with a new directory: %s", persist_dir)
except ValueError as e:
    logging.error("Failed to initialize ChromaDB client: %s", e)

