In [3]:
# !pip install huggingface_hub

In [4]:
# !pip install datasets

In [60]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch

In [6]:
# Use the Apple Silicon GPU (MPS backend) only when this PyTorch build includes
# MPS support AND the runtime reports it as usable; otherwise fall back to CPU.
mps_available = torch.backends.mps.is_built() and torch.backends.mps.is_available()
if mps_available:
    device = "mps"
else:
    device = "cpu"
print("Using device:", device)

Using device: mps


In [7]:
from huggingface_hub import HfApi

# NOTE: despite its name this is the Hub repository id ("<user>/<dataset>"),
# not a URL. It is passed as `repo_id` when listing files and interpolated
# into the Parquet download URLs in the loading cells.
dataset_url = "SathvikVeerapaneni7/CineAI_Dataset"
# Client object used to query the Hugging Face Hub.
api = HfApi()

In [8]:
# Ask the Hub for every file path stored in the dataset repository and print
# them one per line under a short header.
files = api.list_repo_files(repo_id=dataset_url, repo_type="dataset")
print(f"Files in the Hugging Face dataset '{dataset_url}':")
for repo_path in files:
    print(repo_path)

Files in the Hugging Face dataset 'SathvikVeerapaneni7/CineAI_Dataset':
.gitattributes
README.md
parquet_files/cast_df_clean.parquet
parquet_files/crew_df_clean.parquet
parquet_files/details_df_clean.parquet
parquet_files/recommendations_df_clean.parquet
parquet_files/watch_providers_df_clean.parquet


### Sample loading of `details_df` from the Hugging Face Parquet files

In [9]:
from datasets import load_dataset

# Sample: load a single Parquet file (the movie details table) from the Hub.
# The URL is built from `dataset_url` (the repo id) instead of repeating the
# repository name, so it stays in sync with the other loading cells.
dataset = load_dataset(
    "parquet",
    data_files={
        "train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/details_df_clean.parquet"
    },
    split="train",
)

In [10]:
# Convert the Hugging Face Dataset to pandas. `Dataset.to_pandas()` converts the
# underlying Arrow table in one step, whereas `pd.DataFrame(dataset)` iterates
# the dataset row by row, which is far slower on hundreds of thousands of rows.
details_df = dataset.to_pandas()
print("Loaded details_df from HF:", details_df.shape)
details_df.head()

Loaded details_df from HF: (230586, 8)


Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


In [11]:
# (rows, columns) of the sample details table.
details_df.shape

(230586, 8)

# Semantic Retrieval Pipeline Overview

Our goal is to build a Retrieval-Augmented Generation (RAG) component that allows quick, semantic-level access to movie data.  
We expect the outcome to be a system that, given a natural language query, retrieves relevant movie entries by narrative similarity.  


Embed textual fields like `title`, `overview`, and optionally `genres` because they contain rich narrative and thematic information.  

We do not embed purely numeric fields like `runtime` or `popularity`, since they do not add semantic context. 

The final outcome should be a searchable index enabling users to discover related movies by concept rather than keyword.  
This improves upon keyword search by returning results semantically aligned with the user’s intent.  


Limitations include potential model bias if overviews are sparse or genres are missing.  
High-level concepts are captured, but rare details may be overlooked.  


Future improvements can involve more diverse embeddings or integrating metadata filters.  


Overall, this pipeline lays the foundation for robust semantic retrieval within the CineAI ecosystem.

### Loading each Parquet file into its own DataFrame

In [12]:
# Movie details table: fetch the Parquet file from the Hub and convert to pandas.
details_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/details_df_clean.parquet"}
details_dataset = load_dataset("parquet", data_files=details_data_files, split='train')
# to_pandas() converts the Arrow table directly; pd.DataFrame(dataset) would
# iterate the dataset row by row.
details_df = details_dataset.to_pandas()


In [13]:
# Cast table: fetch the Parquet file from the Hub and convert to pandas.
# The mapping gets its own name rather than reusing `details_data_files`,
# which would misleadingly end up pointing at the cast file.
cast_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/cast_df_clean.parquet"}
cast_df_dataset = load_dataset("parquet", data_files=cast_data_files, split='train')
# to_pandas() converts the Arrow table directly; pd.DataFrame(dataset) would
# iterate the dataset row by row.
cast_df = cast_df_dataset.to_pandas()

In [14]:


# Crew table: fetch the Parquet file from the Hub and convert to pandas.
# The mapping gets its own name rather than reusing `details_data_files`,
# which would misleadingly end up pointing at the crew file.
crew_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/crew_df_clean.parquet"}
crew_dataset = load_dataset("parquet", data_files=crew_data_files, split='train')
# to_pandas() converts the Arrow table directly; pd.DataFrame(dataset) would
# iterate the dataset row by row.
crew_df = crew_dataset.to_pandas()

In [15]:
# Recommendations table: fetch the Parquet file from the Hub and convert to pandas.
# The mapping gets its own name rather than reusing `details_data_files`,
# which would misleadingly end up pointing at the recommendations file.
recommendations_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/recommendations_df_clean.parquet"}
recommendations_dataset = load_dataset("parquet", data_files=recommendations_data_files, split='train')
# to_pandas() converts the Arrow table directly; pd.DataFrame(dataset) would
# iterate the dataset row by row.
recommendations_df = recommendations_dataset.to_pandas()

In [16]:
# Watch-providers table: fetch the Parquet file from the Hub and convert to pandas.
# The mapping gets its own name rather than reusing `details_data_files`,
# which would misleadingly end up pointing at the watch-providers file.
watch_providers_data_files = {"train": f"https://huggingface.co/datasets/{dataset_url}/resolve/main/parquet_files/watch_providers_df_clean.parquet"}
watch_providers_dataset = load_dataset("parquet", data_files=watch_providers_data_files, split='train')
# to_pandas() converts the Arrow table directly; pd.DataFrame(dataset) would
# iterate the dataset row by row.
watch_providers_df = watch_providers_dataset.to_pandas()

In [17]:
# The five loaded tables, kept in load order so they can be inspected in a loop.
dataframes = [
    details_df,
    cast_df,
    crew_df,
    recommendations_df,
    watch_providers_df,
]

In [18]:
# Print the (rows, columns) shape of every table, in list order.
for frame in dataframes:
    print(frame.shape)

(230586, 8)
(2183780, 5)
(2477256, 6)
(2790336, 5)
(412381, 4)


In [19]:
# Print each table's column index under a 1-based header, followed by a
# 50-character dashed separator line.
for table_no, df in enumerate(dataframes, start=1):
    print(f"DataFrame {table_no} Columns:")
    print(df.columns)
    print("-" * 50)


DataFrame 1 Columns:
Index(['movie_id', 'title', 'overview', 'release_date', 'runtime',
       'original_language', 'popularity', 'genres_str'],
      dtype='object')
--------------------------------------------------
DataFrame 2 Columns:
Index(['movie_id', 'cast_id', 'name', 'character', 'credit_id'], dtype='object')
--------------------------------------------------
DataFrame 3 Columns:
Index(['movie_id', 'crew_id', 'name', 'department', 'job', 'credit_id'], dtype='object')
--------------------------------------------------
DataFrame 4 Columns:
Index(['movie_id', 'recommended_movie_id', 'recommended_title',
       'recommended_popularity', 'recommended_genre_ids'],
      dtype='object')
--------------------------------------------------
DataFrame 5 Columns:
Index(['movie_id', 'provider_type', 'provider_name', 'link'], dtype='object')
--------------------------------------------------


### Embeddings Handling

In [20]:
from sentence_transformers import SentenceTransformer
import torch


In [21]:
# Preview the first rows of the details table before building the embedding text.
details_df.head()

Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


In [22]:
# Build the text that gets embedded for each movie.
#
# Step 1: if 'text_for_embedding' is not already present, create it from
# 'title' and 'overview'. Missing values become empty strings, and the result is
# stripped so a missing title or overview does not leave a stray space.
if 'text_for_embedding' not in details_df.columns:
    details_df['text_for_embedding'] = (
        details_df['title'].fillna('') + ' ' + details_df['overview'].fillna('')
    ).str.strip()

# Step 2: append 'genres_str' when the column exists.
# 'text_for_embedding' is filled here as well because a pre-existing column may
# contain NaN, and NaN + str would make 'final_text' NaN for that row.
if 'genres_str' in details_df.columns:
    details_df['final_text'] = (
        details_df['text_for_embedding'].fillna('')
        + ' '
        + details_df['genres_str'].fillna('')
    ).str.strip()
else:
    details_df['final_text'] = details_df['text_for_embedding'].fillna('')


In [23]:
# Cell 4: choose the device the embedding model will run on.
# Uses the same two-part check as the device cell at the top of the notebook:
# MPS must be compiled into this PyTorch build and usable at runtime.
import torch
mps_available = torch.backends.mps.is_built() and torch.backends.mps.is_available()
print(mps_available)
device = "mps" if mps_available else "cpu"

True


### Model Selection

In [24]:
# Load the all-MiniLM-L6-v2 sentence-embedding model onto the selected device.
# NOTE(review): the later cells in this notebook do not reference `model`;
# EmbedActor constructs its own SentenceTransformer instance. Confirm whether
# this driver-side copy is still needed.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

In [25]:
# Number of texts to embed (one per row of details_df).
details_df['final_text'].shape

(230586,)

In [26]:
# Show the frame with the new 'text_for_embedding' and 'final_text' columns.
# NOTE(review): this displays the whole frame and relies on pandas truncating the
# output; details_df.head() would be a lighter preview.
details_df

Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str,text_for_embedding,final_text
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure'],"Gladiator In the year 180, the death of Empero...","Gladiator In the year 180, the death of Empero..."
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy'],How the Grinch Stole Christmas The Grinch deci...,How the Grinch Stole Christmas The Grinch deci...
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family'],Chicken Run The creators of Wallace & Gromit b...,Chicken Run The creators of Wallace & Gromit b...
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror'],Final Destination After a teenager has a terri...,Final Destination After a teenager has a terri...
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller'],Memento Leonard Shelby is tracking down the ma...,Memento Leonard Shelby is tracking down the ma...
...,...,...,...,...,...,...,...,...,...,...
230581,1183860.0,Precognito,A premonition inducing drug could cost a group...,2023-08-18,78.0,en,1.196,['Science Fiction'],Precognito A premonition inducing drug could c...,Precognito A premonition inducing drug could c...
230582,1141821.0,Giddh (The Scavenger),An old man is compelled to choose an unlikely ...,2023-12-01,25.0,hi,1.196,['Drama'],Giddh (The Scavenger) An old man is compelled ...,Giddh (The Scavenger) An old man is compelled ...
230583,1138889.0,Gulaam Chor,Twelve people who gather to gamble at a house ...,2023-06-11,113.0,gu,1.196,['Drama'],Gulaam Chor Twelve people who gather to gamble...,Gulaam Chor Twelve people who gather to gamble...
230584,1211829.0,Due battiti,,2023-11-29,0.0,it,1.196,['Animation'],Due battiti,Due battiti ['Animation']


In [27]:
# Single-column frame holding only the text to embed; the double brackets keep
# the result a DataFrame rather than a Series.
final_text_df = details_df[['final_text']]

In [28]:
# Expect (number of movies, 1).
final_text_df.shape

(230586, 1)

### Parallel Processing and Multithreading for Creating Embeddings

### Installing Ray Framework

In [29]:

# !pip install ray

In [30]:
import os
import ray
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer


### Initializing Ray with the dashboard for monitoring

In [31]:
# Start Ray with the dashboard enabled (the dashboard needs ray[default] installed).
# ray.shutdown() is called first, presumably so that re-running this cell starts
# from a fresh Ray instance instead of one left over from an earlier run.
ray.shutdown()
ray.init(include_dashboard=True)  # If you face issues with dashboard, use: ray.init()


2024-12-18 23:08:05,803	INFO worker.py:1812 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.10.16
Ray version:,2.40.0
Dashboard:,http://127.0.0.1:8265


In [32]:
# Requires `final_text_df` (single 'final_text' column) from the cell above.
#
#

# Convert the column to a plain Python list of strings for batching and embedding.
final_texts = final_text_df['final_text'].tolist()

print("Number of texts:", len(final_texts))


Number of texts: 230586


In [33]:
def chunk_data(data, chunk_size):
    """Yield consecutive slices of ``data`` holding at most ``chunk_size`` items.

    Args:
        data: A sliceable sequence with a length (e.g. a list of strings).
        chunk_size: Maximum number of items per chunk; must be a positive integer.

    Yields:
        ``data[i:i + chunk_size]`` for ``i = 0, chunk_size, 2 * chunk_size, ...``.
        The last chunk may be shorter. Nothing is yielded for empty ``data``.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop all the data,
    # so reject non-positive sizes explicitly.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(data), chunk_size):
        yield data[start:start + chunk_size]

# Number of texts sent to the actor per call; tune against memory and
# throughput measurements.
batch_size = 5000
# Materialise the generator so the batches can be counted and submitted later.
batches = [chunk for chunk in chunk_data(final_texts, batch_size)]
print("Number of batches:", len(batches))

Number of batches: 47


In [34]:
@ray.remote
class EmbedActor:
    """Ray actor that loads a SentenceTransformer once and embeds batches of text.

    Loading the model in ``__init__`` means it is created a single time per
    actor and reused for every ``embed_texts`` call.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """Load ``model_name`` onto MPS when available, otherwise onto the CPU.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        # Same two-part check as the notebook's device-selection cell: MPS must
        # be compiled into this PyTorch build and usable at runtime.
        mps_available = torch.backends.mps.is_built() and torch.backends.mps.is_available()
        device = "mps" if mps_available else "cpu"
        self.model = SentenceTransformer(model_name, device=device)

    def embed_texts(self, texts):
        """Embed a batch of texts.

        Args:
            texts: A list of strings.

        Returns:
            A numpy array of embeddings, one row per input text.
        """
        # Ask encode() for numpy output directly instead of building a tensor
        # on the device and converting it with .cpu().numpy() afterwards.
        return self.model.encode(texts, convert_to_numpy=True)

In [35]:
# Create a single actor instance. Every batch is queued on this one actor, so
# the batches are embedded one after another by the same loaded model.
embed_actor = EmbedActor.remote()

# Submit all batches for processing; each call returns a future immediately.
futures = [embed_actor.embed_texts.remote(batch) for batch in batches]

# Retrieve results. ray.get on a list returns results in the same order as the
# futures, so row order matches the order of `final_texts`.
results = ray.get(futures)

# Combine all per-batch embeddings into a single matrix.
embeddings = np.vstack(results)

# Later cells label embedding rows by position (row i <-> movie i), so fail
# loudly here if any rows were lost or duplicated.
assert embeddings.shape[0] == len(final_texts), (
    f"Expected {len(final_texts)} embeddings, got {embeddings.shape[0]}"
)
print("Embeddings shape:", embeddings.shape)


Embeddings shape: (230586, 384)


In [36]:
# !pip install tensorboard

In [50]:
import numpy as np
import os

# 1. `embeddings` is the matrix produced by the embedding cell
#    (one row per movie, in the same order as `details_df`).

# 2. One label per embedding row, taken from the movie titles.
#    The metadata file is line-oriented: a newline or tab inside a title would
#    shift every following label off its embedding row, so all whitespace runs
#    are collapsed to single spaces. Missing or blank titles become 'Unknown'.
titles = [
    " ".join(str(title).split()) or "Unknown"
    for title in details_df['title'].fillna('Unknown')
]
if len(titles) != len(embeddings):
    raise ValueError(
        f"Label/embedding mismatch: {len(titles)} titles vs {len(embeddings)} embeddings"
    )

# 3. Create a logs directory for TensorBoard if it doesn't exist
log_dir = "embeddings_logs"
os.makedirs(log_dir, exist_ok=True)

# 4. Save embeddings to a TSV file (tab-separated values), one row per movie
embeddings_file = os.path.join(log_dir, "embeddings.tsv")
np.savetxt(embeddings_file, embeddings, delimiter="\t")

# 5. Save metadata (labels) to a TSV file: one title per line, no header row
metadata_file = os.path.join(log_dir, "metadata.tsv")
with open(metadata_file, "w", encoding="utf-8") as f:
    f.writelines(f"{title}\n" for title in titles)

# 6. Create the projector_config.pbtxt file. The paths inside it are relative
#    to the log directory the files were written to.
config_file = os.path.join(log_dir, "projector_config.pbtxt")
with open(config_file, "w", encoding="utf-8") as f:
    f.write("""embeddings {
  tensor_name: "movie_embeddings"
  tensor_path: "embeddings.tsv"
  metadata_path: "metadata.tsv"
}""")

In [54]:
# Run the command displayed below in a terminal (not in the notebook) to start
# TensorBoard. The log directory is resolved from `log_dir` instead of a
# hard-coded absolute path, so the command is valid on any machine.
f"tensorboard --logdir={os.path.abspath(log_dir)} --port=6006"

' tensorboard --logdir=/Users/HVMS/Desktop/CineAI/CineAI/src/embeddings/embeddings_logs --port=6006 '

You won’t see embeddings on the main page. At the top of the page, look for a "Projector" tab. Click it. You should see a dropdown labeled "movie_embeddings" and be able to explore your embeddings interactively.