# Semantic Search Function

In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer
from sklearn.metrics import DistanceMetric
import numpy as np
import gradio as gr

In [2]:
# Lazily scan the parquet index; rows are only materialised where .collect() is called later on
%time df = pl.scan_parquet('video-index.parquet')

CPU times: user 3.1 ms, sys: 1.01 ms, total: 4.11 ms
Wall time: 31.2 ms


In [3]:
# Name of the pretrained sentence-transformers model used to embed search queries
model_name = 'all-MiniLM-L6-v2'
%time model = SentenceTransformer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

CPU times: user 818 ms, sys: 534 ms, total: 1.35 s
Wall time: 4.12 s


In [4]:
# Distance metric used to compare the query embedding with the indexed embeddings
dist_name = 'manhattan'
%time dist = DistanceMetric.get_metric(dist_name)

CPU times: user 113 µs, sys: 14 µs, total: 127 µs
Wall time: 243 µs


In [5]:
def returnSearchResults(
    query: str,
    index: pl.LazyFrame,
    threshold: float = 40,
    top_k: int = 5,
) -> np.ndarray:
    """
    Return the row indexes of the top search results for a query.

    The query is embedded with the notebook-level `model`, and its distance to
    every row of `index` is computed with the notebook-level `dist` metric as
    the sum of the distance to the title embedding and the distance to the
    transcript embedding.

    Args:
        query: Free-text search query.
        index: Lazy frame holding the video index. Columns 4..387 are used as
            the title embedding and columns 388.. as the transcript embedding
            (NOTE(review): layout inferred from the column slices; confirm
            against the code that builds the parquet file).
        threshold: Rows whose summed distance is not strictly below this value
            are discarded. The default of 40 was eyeballed for Manhattan
            distance.
        top_k: Maximum number of results to return.

    Returns:
        1-D integer array of row positions in `index`, ordered from closest to
        farthest, with at most `top_k` entries (empty when nothing is below
        the threshold).
    """

    # embed query as a single-row 2-D array so it can be passed to dist.pairwise
    query_embedding = model.encode(query).reshape(1, -1)

    # Get column names without triggering schema resolution warning
    column_names = index.collect_schema().names()
    title_cols = column_names[4:388]
    transcript_cols = column_names[388:]

    # materialise every embedding column with a single collect() so the
    # parquet file is only scanned once per query
    embeddings = index.select(title_cols + transcript_cols).collect()

    # compute distances between query and titles/transcripts
    dist_arr = (
        dist.pairwise(embeddings.select(title_cols), query_embedding) +
        dist.pairwise(embeddings.select(transcript_cols), query_embedding)
    ).flatten()

    # evaluate videos close to query based on threshold
    idx_below_threshold = np.flatnonzero(dist_arr < threshold)
    # order the surviving videos from closest to farthest
    idx_sorted = np.argsort(dist_arr[idx_below_threshold])

    # return indexes of search results, keeping the top k closest videos
    return idx_below_threshold[idx_sorted][:top_k]

In [13]:
# Smoke test: run a single query and show the matching rows of the index
query = "Generative AI"
idx_result = returnSearchResults(query, df)

# idx_result holds row positions, so it can index the collected frame directly
print(df.select(['video_id', 'title']).collect()[idx_result])

shape: (5, 2)
┌─────────────┬─────────────────────────────────┐
│ video_id    ┆ title                           │
│ ---         ┆ ---                             │
│ str         ┆ str                             │
╞═════════════╪═════════════════════════════════╡
│ 2Axas1OvafQ ┆ DON’T study Gen AI #generative… │
│ 0iFEtnHyzE0 ┆ Fine-tuning EXPLAINED in 40 se… │
│ 4RAvJt3fWoI ┆ 3 Ways to Make a Custom AI Ass… │
│ r5qk3uIdkks ┆ What is #ai? — Simply Explaine… │
│ 0cf7vzM_dZ0 ┆ Prompt Engineering: How to Tri… │
└─────────────┴─────────────────────────────────┘


In [14]:
# Same hits as a plain dict of Python lists (one list per column) instead of a polars frame
df.select(['title', 'video_id']).collect()[idx_result].to_dict(as_series=False)


{'title': ['DON’T study Gen AI #generativeai',
  'Fine-tuning EXPLAINED in 40 sec #generativeai',
  '3 Ways to Make a Custom AI Assistant | RAG, Tools, & Fine-tuning',
  'What is #ai? — Simply Explained',
  'Prompt Engineering: How to Trick AI into Solving Your Problems'],
 'video_id': ['2Axas1OvafQ',
  '0iFEtnHyzE0',
  '4RAvJt3fWoI',
  'r5qk3uIdkks',
  '0cf7vzM_dZ0']}

In [15]:
def pseudoSearchAPI(query: str):
    """
    Stand-in for a search API call.

    Looks up the best matches for `query` in the notebook-level index `df`
    and returns them as a dict with the keys 'title' and 'video_id', each
    mapping to a list with one entry per hit (top 5 at most).
    """
    # positions of the matching rows in the index
    hit_rows = returnSearchResults(query, df)

    # only the columns the front end needs
    catalog = df.select(['title', 'video_id']).collect()

    return catalog[hit_rows].to_dict(as_series=False)


In [16]:
def formatResultText(title: str, video_id: str) -> str:
    """
    Build the Markdown snippet shown next to a search result.

    Args:
        title: Video title, rendered as a level-1 heading.
        video_id: YouTube video id, used to build the youtu.be link.

    Returns:
        Markdown text with the title heading followed by a link to the video.
    """

    # the template lines must stay at column 0: any indentation would become
    # part of the rendered Markdown
    text = f"""<br> <br>
# {title}<br>

🔗 [Video Link](https://youtu.be/{video_id})"""

    return text

In [10]:
def formatVideoEmbed(video_id: str):
    """
    Return the HTML for a 576x324 embedded YouTube player for `video_id`.

    An alternative that was considered is a clickable thumbnail: an <a> tag
    pointing at the video that wraps an <img> loaded from
    https://img.youtube.com/vi/<video_id>/0.jpg.
    """
    player_url = f"https://www.youtube.com/embed/{video_id}"
    return f'<iframe width="576" height="324" src="{player_url}"></iframe>'


In [11]:
def searchResults(query):
    """
    Gradio callback: run a search and fill the five result slots.

    Returns a flat list of ten components, alternating gr.HTML (video embed)
    and gr.Markdown (title and link), one pair per slot. Slots without a
    result are hidden; when there are no results at all, the first slot's
    Markdown shows a "No results" message instead.
    """
    total_slots = 5

    # pseudo API call
    hits = pseudoSearchAPI(query)
    titles = hits['title']
    video_ids = hits['video_id']

    components = []

    # one visible (embed, text) pair per search result
    for title, video_id in zip(titles, video_ids):
        components.append(gr.HTML(value = formatVideoEmbed(video_id), visible=True))
        components.append(gr.Markdown(value = formatResultText(title, video_id), visible=True))

    # number of slots left without a search result
    unused_slots = total_slots - len(titles)

    # nothing found at all: the first slot carries the "No results." text
    if unused_slots == total_slots:
        components.append(gr.HTML(visible=False))
        components.append(gr.Markdown(value = "No results. Try rephrasing your query.", visible=True))
        unused_slots -= 1

    # every remaining slot is hidden
    for _ in range(unused_slots):
        components.append(gr.HTML(visible=False))
        components.append(gr.Markdown(visible=False))

    return components


## Demo

In [12]:
# demo
# Components that searchResults updates: one (HTML, Markdown) pair per result slot.
output_list = []

with gr.Blocks() as demo:
    gr.Markdown("# YouTube Search")

    with gr.Row():
        inp = gr.Textbox(placeholder="What are you looking for?", label="Query", scale=3)
        btn = gr.Button("Search")

    # five result slots, matching the five pairs returned by searchResults
    for i in range(5):
        with gr.Row():
            output_list.append(gr.HTML())
            output_list.append(gr.Markdown())

    # Wire the events only after every output component exists, so neither
    # handler is registered against a still-empty outputs list.
    btn.click(fn=searchResults, inputs=inp, outputs=output_list)
    inp.submit(fn=searchResults, inputs=inp, outputs=output_list)

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b2806f0f43bd32fa5a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


