<a href="https://colab.research.google.com/github/TharinduMadhusanka/semantic-movie-search/blob/main/Create%20vector%20database/Embedding_TMDB_movie_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TMDB top 10,000 movies to embedded vectors


In [None]:
!pip install -q chromadb sentence_transformers

In [None]:
# setup kaggle.json to directly download data or directly upload to colab
# Create the ~/.kaggle directory that the credentials file is copied into below.
!mkdir -p ~/.kaggle
# !cp {your kaggle json} ~/.kaggle/
# in my case, I put my kaggle json in drive
# NOTE(review): this path assumes Google Drive is already mounted at /content/drive;
# no mount step is visible in this notebook — confirm before running from a fresh runtime.
!cp /content/drive/MyDrive/Machine_Learning/Kaggle/kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# download and unzip dataset
!kaggle datasets download -d ursmaheshj/top-10000-popular-movies-tmdb-05-2023
!unzip /content/top-10000-popular-movies-tmdb-05-2023.zip

Dataset URL: https://www.kaggle.com/datasets/ursmaheshj/top-10000-popular-movies-tmdb-05-2023
License(s): CC0-1.0
top-10000-popular-movies-tmdb-05-2023.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# import libraries
import ast

import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import torch

In [None]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the TMDB movie CSV into a DataFrame.
file_name = "top_1000_popular_movies_tmdb.csv"
# Only "\n" is treated as the record terminator
# (presumably so stray "\r" characters inside text fields do not split rows — confirm).
df = pd.read_csv(
    file_name,
    lineterminator="\n",
)

In [None]:
# Preview the first row to check that the columns were parsed as expected.
df.head(1)

Unnamed: 0.1,Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
0,0,385687,Fast X,2023-05-17,"['Action', 'Crime', 'Thriller']",English,7.4,1347,8363.473,Over many missions and against impossible odds...,340000000,"['Universal Pictures', 'Original Film', 'One R...",652000000,142,The end of the road begins.


In [None]:
# Show column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            10000 non-null  int64  
 1   id                    10000 non-null  int64  
 2   title                 10000 non-null  object 
 3   release_date          9977 non-null   object 
 4   genres                10000 non-null  object 
 5   original_language     10000 non-null  object 
 6   vote_average          10000 non-null  float64
 7   vote_count            10000 non-null  int64  
 8   popularity            10000 non-null  float64
 9   overview              9924 non-null   object 
 10  budget                10000 non-null  int64  
 11  production_companies  10000 non-null  object 
 12  revenue               10000 non-null  int64  
 13  runtime               10000 non-null  int64  
 14  tagline               7383 non-null   object 
dtypes: float64(2), int64

In [None]:
# List the column names before deciding which ones to drop.
df.columns

Index(['Unnamed: 0', 'id', 'title', 'release_date', 'genres',
       'original_language', 'vote_average', 'vote_count', 'popularity',
       'overview', 'budget', 'production_companies', 'revenue', 'runtime',
       'tagline'],
      dtype='object')

In [None]:
# Discard the columns that are not used as document text or metadata later on.
unused_columns = [
    'Unnamed: 0',
    'original_language',
    'vote_average',
    'vote_count',
    'budget',
    'production_companies',
    'revenue',
]
df = df.drop(columns=unused_columns)

In [None]:
# Derive an integer release year from release_date, then drop the raw date column.
# Dates that are missing or cannot be parsed are coerced to NaT and end up as year 0.
parsed_release_dates = pd.to_datetime(df['release_date'], errors='coerce')
df = (
    df.assign(release_year=parsed_release_dates.dt.year.fillna(0).astype(int))
      .drop(columns=['release_date'])
)

In [None]:
# Each `genres` cell holds the string form of a Python list, e.g. "['Action', 'Crime']".
# Parse it with ast.literal_eval (literals only) instead of eval, so that a malformed
# or malicious CSV cell cannot execute arbitrary code; then join into "Action, Crime".
df['genres'] = df['genres'].apply(ast.literal_eval).apply(', '.join)


In [None]:
# Build the text to embed: the overview followed by the tagline (space-separated).
# A missing overview contributes "" and a missing tagline contributes nothing.
df["overview"] = [
    (overview if pd.notna(overview) else "")
    + (" " + tagline if pd.notna(tagline) else "")
    for overview, tagline in zip(df["overview"], df["tagline"])
]
# The tagline now lives inside `overview`, so its own column is no longer needed.
df = df.drop(columns=['tagline'])

In [None]:
# Count missing values per column to confirm none remain after cleaning.
df.isnull().sum()

id              0
title           0
genres          0
popularity      0
overview        0
runtime         0
release_year    0
dtype: int64

In [None]:
# Show column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10000 non-null  int64  
 1   title         10000 non-null  object 
 2   genres        10000 non-null  object 
 3   popularity    10000 non-null  float64
 4   overview      10000 non-null  object 
 5   runtime       10000 non-null  int64  
 6   release_year  10000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 547.0+ KB


In [None]:
# Prepare the three parallel lists that are passed to Chroma:
#   ids       - the movie id of each row, as a string
#   documents - the text to embed (the `overview` column)
#   metadatas - one dict per row with the fields kept alongside each embedding
metadata_columns = ['title', 'genres', 'popularity', 'release_year', 'runtime']
ids = [str(movie_id) for movie_id in df["id"]]
documents = list(df["overview"])
metadatas = df[metadata_columns].to_dict(orient='records')

In [None]:
# Create a Chroma Collection
# The client persists its data under the local "tmdbtopmovies" directory.
chroma_client = chromadb.PersistentClient(path="tmdbtopmovies")

# choose any available embedding function. Here I select sentence transformer "all-MiniLM-L12-v2"
# Run the model on the device selected in the GPU setup cell instead of a hard-coded
# "cuda", so this cell also works on CPU-only runtimes.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L12-v2", device=str(device))

# Reuse the collection if it already exists in the persistent store, otherwise create it.
collection = chroma_client.get_or_create_collection(
    name="movies_collection", embedding_function=sentence_transformer_ef)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Add documents to the collection in fixed-size batches.
# Chroma rejects a single add() call that exceeds its maximum batch size, and
# batching also bounds how much text is embedded at once.
BATCH_SIZE = 1000
for batch_start in range(0, len(ids), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    collection.add(
        documents=documents[batch_start:batch_end],
        ids=ids[batch_start:batch_end],
        metadatas=metadatas[batch_start:batch_end],
    )

In [None]:
# Sanity-check the database with an example query
keytext = ["our mind is the scene of the crime. The dream is real."]
results = collection.query(query_texts=keytext, n_results=20)

# Only one query text was sent, so take the first metadata list and print each title.
top_matches = results["metadatas"][0]
for match in top_matches:
    print(match['title'])

Inception
Demonic
Before I Wake
Altered States
Malignant
Wildflower
The Cell
The City of Lost Children
Sex Plate 17
Gothika
Waking Life
The Howling
P.O. Box Tinto Brass
Bottle Rocket
Memory
The Hills Run Red
Brazil
Cobra
The Babysitter
Slumberland
