In [26]:
import pandas as pd
import numpy as np

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

import gradio as gr

In [None]:
# Forward slashes keep these paths portable. The previous "data\..." literals
# relied on "\m" / "\p" not being recognised escape sequences and only
# resolved on Windows.
movies = pd.read_csv("data/movies_filtered.csv")

# Append the sizing suffix to every poster URL. Rows without a Poster_Link
# become NaN after the concatenation and fall back to the placeholder image.
# NOTE(review): "&fife=w800" presumably requests a wider rendition - confirm
# that the poster host actually honours this parameter.
movies["thumbnail"] = (movies["Poster_Link"] + "&fife=w800").fillna(
    "data/poster-not-found.jpg"
)

In [None]:
# Load the overview file (forward slash so the path works on every OS; the old
# "data\o..." literal depended on "\o" not being an escape sequence).
# Presumably one "<code> <overview>" entry per line - verify against the file.
raw_documents = TextLoader("data/overview_with_code.txt").load()

# Split on every newline so each line becomes its own document. A chunk size
# smaller than any real line forces the split; 1 behaves the same as the former
# 0 here but stays a valid positive size.
# NOTE(review): newer langchain-text-splitters releases appear to reject
# non-positive chunk sizes - confirm against the installed version.
# The "Created a chunk of size N, which is longer than the specified ..."
# warnings are expected with this trick.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

# Embed every line and index the vectors in a Chroma store.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
movies_db = Chroma.from_documents(documents, embedding=embeddings)

Created a chunk of size 122, which is longer than the specified 0
Created a chunk of size 114, which is longer than the specified 0
Created a chunk of size 193, which is longer than the specified 0
Created a chunk of size 168, which is longer than the specified 0
Created a chunk of size 117, which is longer than the specified 0
Created a chunk of size 150, which is longer than the specified 0
Created a chunk of size 147, which is longer than the specified 0
Created a chunk of size 182, which is longer than the specified 0
Created a chunk of size 156, which is longer than the specified 0
Created a chunk of size 137, which is longer than the specified 0
Created a chunk of size 156, which is longer than the specified 0
Created a chunk of size 239, which is longer than the specified 0
Created a chunk of size 142, which is longer than the specified 0
Created a chunk of size 182, which is longer than the specified 0
Created a chunk of size 192, which is longer than the specified 0
Created a 

In [30]:
def clean_genre_string(genre_str):
    """Turn a stringified genre list into a list of clean genre names.

    Brackets and both kinds of quote characters are removed, the remainder is
    split on commas, and each piece is stripped of surrounding whitespace.
    Empty pieces are dropped.

    Args:
        genre_str: Text such as "['Crime', ' Drama']". ``None`` or a float NaN
            (how pandas represents a missing CSV cell) is also accepted.

    Returns:
        A list of genre names, e.g. ``['Crime', 'Drama']``; an empty list when
        the input is missing or contains no names.
    """
    # Missing cells would otherwise fail on the string methods below
    # (NaN is the only float that is not equal to itself).
    if genre_str is None or (isinstance(genre_str, float) and genre_str != genre_str):
        return []

    # Drop the list punctuation in a single pass.
    without_punctuation = genre_str.translate(str.maketrans("", "", "[]'\""))
    pieces = (piece.strip() for piece in without_punctuation.split(","))
    return [piece for piece in pieces if piece]

# Parse each row's stringified genre list into a real Python list.
movies["genres"] = movies["mapped_genres"].apply(clean_genre_string)

# Gather every distinct genre across all rows, then sort them for the UI.
genre_pool = set()
for genre_list in movies["genres"]:
    genre_pool.update(genre_list)
all_genres = sorted(genre_pool)
print("Cleaned Genres:", all_genres)

# Distinct, non-missing certificate values in sorted order.
certificate_values = movies["Certificate"].dropna().unique()
all_certificates = sorted(certificate_values.tolist())

Cleaned Genres: ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'History', 'Mystery', 'Other', 'Romance', 'Sci-Fi', 'Thriller', 'War']


In [32]:
# Preview the first two rows to eyeball the derived columns.
movies.head(n=2)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,IMDB_Rating,Overview,code,overview_with_code,mapped_genres,thumbnail,genres
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,9.3,Two imprisoned men bond over a number of years...,349,349 Two imprisoned men bond over a number of y...,['Drama'],https://m.media-amazon.com/images/M/MV5BMDFkYT...,[Drama]
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,9.2,An organized crime dynasty's aging patriarch t...,731,731 An organized crime dynasty's aging patriar...,"['Crime', ' Drama']",https://m.media-amazon.com/images/M/MV5BM2MyNj...,"[Crime, Drama]"


In [35]:
# Save the current frame without its index.
# NOTE(review): this writes to the working directory, whereas the frame was
# read from the data folder - confirm that this is the intended location.
movies.to_csv("movies_filtered.csv", index=False)

In [39]:
def semantic_search(query: str,
                    min_rating,
                    selected_genres,
                    selected_certificates
) -> str:
    """Find movies whose overview matches ``query`` and render them as HTML.

    The 50 nearest documents are fetched from ``movies_db``; the integer at the
    start of each document's text is read as a movie code and looked up in the
    ``movies`` frame. The matches are then narrowed by rating, genre and
    certificate.

    Args:
        query: Free-text description to search for.
        min_rating: Lowest ``IMDB_Rating`` (inclusive) a movie may have.
        selected_genres: Genre names; a movie is kept when it has at least one
            of them. An empty/None selection disables this filter.
        selected_certificates: Certificates to keep. An empty/None selection
            disables this filter.

    Returns:
        An HTML table (as a string) with the columns thumbnail, Series_Title,
        Runtime, IMDB_Rating, Released_Year and genres. The thumbnail cells
        hold ``<img>`` tags, so the table is rendered without escaping.
    """
    recs = movies_db.similarity_search(query, k=50)

    # Pull the leading integer code out of every hit. Blank or malformed
    # chunks are skipped so that one bad line cannot fail the whole search.
    movie_codes = []
    for doc in recs:
        tokens = doc.page_content.strip('"').split()
        try:
            movie_codes.append(int(tokens[0]))
        except (IndexError, ValueError):
            continue

    # NOTE(review): the boolean mask keeps the rows in `movies` order, so the
    # similarity ranking of the hits is not reflected in the output.
    filtered_movies = movies[movies["code"].isin(movie_codes)]
    filtered_movies = filtered_movies[filtered_movies["IMDB_Rating"] >= min_rating]

    if selected_genres:
        # astype(bool) matters when no rows are left: apply() on an empty
        # object column returns an object Series, which pandas would treat as
        # a (empty) column selection instead of a row mask and thereby drop
        # every column.
        has_wanted_genre = filtered_movies["genres"].apply(
            lambda g: any(genre.strip() in g for genre in selected_genres)
        ).astype(bool)
        filtered_movies = filtered_movies[has_wanted_genre]

    if selected_certificates:
        filtered_movies = filtered_movies[
            filtered_movies["Certificate"].isin(selected_certificates)
        ]

    # assign() builds a new frame, so neither the filtered slice nor the global
    # `movies` frame is modified (and no SettingWithCopyWarning is raised).
    filtered_movies = filtered_movies.assign(
        thumbnail=filtered_movies["thumbnail"].apply(
            lambda x: f"<img src='{x}' width='100'/>"
        )
    )

    result = filtered_movies[
        ["thumbnail", "Series_Title", "Runtime", "IMDB_Rating", "Released_Year", "genres"]
    ]

    # escape=False so the <img> tags are emitted as markup, not as text.
    return result.to_html(escape=False, index=False)

In [41]:
# Gradio (`gr`) is already imported in the notebook's import cell, so it is
# not imported again here.

# Form wiring: the four inputs are passed positionally to semantic_search
# (query, min_rating, selected_genres, selected_certificates) and the HTML
# string it returns is rendered in the output panel.
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(label="Enter an Overview Query", placeholder="Describe the movie you're looking for..."),
        gr.Slider(minimum=0, maximum=10, step=0.1, label="Minimum Rating", value=5.0),
        gr.CheckboxGroup(choices=all_genres, label="Select Genre(s)"),
        gr.CheckboxGroup(choices=all_certificates, label="Select Certificate(s)")
    ],
    outputs=gr.HTML(label="Filtered Movies"),
    title="Semantic Movie Search & Filter",
    description="Search for movies semantically, then filter by rating, genre, and certificate."
)

# debug=True keeps this cell running until the kernel is interrupted.
iface.launch(debug=True)


* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


