In [1]:
# !pip install huggingface_hub

In [2]:
# !pip install datasets

In [1]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import chromadb

In [2]:
# Checking MPS availability for Apple Silicon GPU acceleration
mps_available = torch.backends.mps.is_built() and torch.backends.mps.is_available()
device = "mps" if mps_available else "cpu"
print("Using device:", device)

Using device: mps


In [5]:
from huggingface_hub import HfApi

dataset_url = "SathvikVeerapaneni7/CineAI_Dataset"
api = HfApi()

In [None]:
# List files in the dataset repository
files = api.list_repo_files(repo_id=dataset_url, repo_type="dataset")
print(f"Files in the Hugging Face dataset '{dataset_url}':")
for f in files:
    print(f)

In [None]:
from datasets import load_dataset

# sample Loading a single Parquet file
dataset = load_dataset(
    "parquet",
    data_files={"train": "https://huggingface.co/datasets/SathvikVeerapaneni7/CineAI_Dataset/resolve/main/parquet_files/details_df_clean.parquet"},
    split="train"
)

# Convert to DataFrame for processing if needed
import pandas as pd
details_df = pd.DataFrame(dataset)
print("Loaded details_df from HF:", details_df.shape)
details_df.head()

# Semantic Retrieval Pipeline Overview

Our goal is to build a Retrieval-Augmented Generation (RAG) component that allows quick, semantic-level access to movie data.  
We expect the outcome to be a system that, given a natural language query, retrieves relevant movie entries with thematic and narrative similarity.  


Embed textual fields like `title`, `overview`, and optionally `genres` because they contain rich narrative and thematic information.  

Not embedding purely numeric fields like `runtime` or `popularity` since they do not add semantic context. 

The final outcome should be a searchable index enabling users to discover related movies by concept rather than keyword.  
This improves upon keyword search by returning results semantically aligned with the user’s intent.  


Limitations include potential model bias if overviews are sparse or genres are missing.  
High-level concepts are captured, but rare details may be overlooked.  


Future improvements can involve more diverse embeddings or integrating metadata filters.  


Overall, this pipeline lays the foundation for robust semantic retrieval within the CineAI ecosystem.

In [8]:
details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/details_df_clean.parquet"}
details_dataset = load_dataset("parquet", data_files=details_data_files, split='train')
details_df = pd.DataFrame(details_dataset)


In [9]:
details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/cast_df_clean.parquet"}
cast_df_dataset = load_dataset("parquet", data_files=details_data_files, split='train')
cast_df = pd.DataFrame(cast_df_dataset)


In [10]:


details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/crew_df_clean.parquet"}
crew_dataset= load_dataset("parquet", data_files=details_data_files, split='train')
crew_df = pd.DataFrame(crew_dataset)


In [11]:
details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/recommendations_df_clean.parquet"}
recommendations_dataset= load_dataset("parquet", data_files=details_data_files, split='train')
recommendations_df = pd.DataFrame(recommendations_dataset)


In [12]:
details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/watch_providers_df_clean.parquet"}
watch_providers_dataset= load_dataset("parquet", data_files=details_data_files, split='train')
watch_providers_df= pd.DataFrame(watch_providers_dataset)



In [13]:
dataframes=[details_df,cast_df,crew_df,recommendations_df,watch_providers_df]

In [None]:
for i in dataframes:
    print(i.shape)

In [None]:
for i, df in enumerate(dataframes):
    print(f"DataFrame {i+1} Columns:")
    print(df.columns)
    print("-" * 50)


### Embeddings Handling

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch


In [None]:
details_df.head()

In [18]:
# Cell 3: If 'text_for_embedding' doesn't exist, create it from 'title' and 'overview'.
# We will combine them as a fallback.
if 'text_for_embedding' not in details_df.columns:
    details_df['text_for_embedding'] = details_df['title'].fillna('') + ' ' + details_df['overview'].fillna('')

# Now concatenate genres_str if available
if 'genres_str' in details_df.columns:
    details_df['final_text'] = details_df['text_for_embedding'] + ' ' + details_df['genres_str'].fillna('')
else:
    details_df['final_text'] = details_df['text_for_embedding']


In [19]:
# Cell 4: Initialize the embedding model
import torch
print(torch.backends.mps.is_available())
device = "mps" if torch.backends.mps.is_available() else "cpu"

Model Selection

In [None]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

In [None]:
details_df['final_text'].shape

In [None]:
details_df

In [20]:
# Cell 5: Comput embeddings
embeddings = model.encode(details_df['final_text'].tolist(), convert_to_tensor=True)


In [None]:
print(type(embeddings))

### ChromaDB a Vector Database, to store the embeddgins

In [4]:
# We'll create a client with the new recommended configuration.
import chromadb
from chromadb.config import Settings

In [5]:
client = chromadb.Client(
    Settings(
        persist_directory="~/Desktop/CineAI/CineAI/chroma_db"
    )
)

In [6]:
# Get the current working directory
current_directory = os.getcwd()

# Construct the full path to the persist directory
full_path = os.path.join(current_directory, "chroma_db")

# Print the full path
print("Full path to the ChromaDB persist directory:", full_path)

Full path to the ChromaDB persist directory: /Users/HVMS/Desktop/CineAI/CineAI/src/embeddings/chroma_db


In [14]:
collections = client.list_collections()
print(collections)

[Collection(name=movies_new), Collection(name=movies)]


In [12]:
!ls -la ~/Desktop/CineAI/CineAI/chroma_db

ls: /Users/HVMS/Desktop/CineAI/CineAI/chroma_db: No such file or directory


In [13]:
# Cell 2: Create or get the collection
collection = client.get_or_create_collection(name="movies")
# Cell 2: Create or get the collection
collection = client.get_or_create_collection(name="movies_new")

In [None]:
# Cell 1: Remove duplicate movie_ids
details_df = details_df.drop_duplicates(subset='movie_id', keep='first')

In [27]:
# Cell 1: Define a function to chunk the data
def chunk_data(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i:i+chunk_size]

# We'll create batches for ids, documents, and embeddings
ids = details_df['movie_id'].astype(str).tolist()
docs = details_df['text_for_embedding'].tolist()
embs = embeddings.tolist()

# According to the error, max batch size is 41666
max_batch_size = 3000

In [None]:
# Cell 2: Add documents in chunks
for id_chunk, doc_chunk, emb_chunk in zip(
    chunk_data(ids, max_batch_size),
    chunk_data(docs, max_batch_size),
    chunk_data(embs, max_batch_size)
):
    collection.add(
        documents=doc_chunk,
        ids=id_chunk,
        embeddings=emb_chunk
    )

In [None]:
# Perform a sample semantic query
query_text = "What is the actor real name in iron man movie "
results = collection.query(
    query_texts=[query_text],
    n_results=2
)

results


### Sample Testing, with random data , to verify logs and embedding storgare in paths

In [37]:
import logging
import os

# Set up logging configuration
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


In [None]:
from chromadb import Client, Settings
import os

try:
    # Adjust the directory name slightly to avoid conflict
    client = Client(
        Settings(
            persist_directory=("/Users/HVMS/Desktop/CineAI/CineAI/src/embeddings", "sample_embeddings_new")
        )
    )
    logging.debug("Initialized new ChromaDB client with a new directory: %s", os.path.join(os.getcwd(), "sample_embeddings_new"))
except ValueError as e:
    logging.error("Failed to initialize ChromaDB client: %s", e)

