## Compare Candidate Search Approaches

### Imports

In [1]:
import polars as pl
import torch
from sentence_transformers import SentenceTransformer, util

from sklearn.metrics import DistanceMetric
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


### Load Data

video-transcripts --> original data
eval-data --> self produced query data

In [2]:
df = pl.read_parquet('data/video-transcripts.parquet')
df_eval = pl.read_csv('data/eval_data.csv', encoding="utf8-lossy")
df.shape
df.head()

video_id,datetime,title,transcript
str,datetime[μs],str,str
"""qPN_XZcJf_s""",2025-05-05 04:01:03,"""Reinforcement Learning with Hu…","""If you tell me what you like a…"
"""DVGmsnxB2UQ""",2025-04-14 04:00:27,"""Reinforcement Learning with Ne…","""if you make a guess and you ma…"
"""9hbQieQh7-o""",2025-04-07 04:00:17,"""Reinforcement Learning with Ne…","""When you don't know, take a gu…"
"""Z-T0iJEXiwM""",2025-03-31 04:00:25,"""Reinforcement Learning: Essent…","""reinforcement learning it's ju…"
"""_kstkMF-lQQ""",2025-02-12 14:20:19,"""StatQuest on DeepLearning.AI!!…","""the encoder model was used as …"


In [3]:
df_eval.shape
df_eval.head()

video_id,query
str,str
"""qPN_XZcJf_s""","""How does reinforcement learnin…"
"""DVGmsnxB2UQ""","""What are the math foundations …"
"""9hbQieQh7-o""","""Core ideas for reinforcement l…"
"""Z-T0iJEXiwM""","""reinforcement learning in simp…"
"""_kstkMF-lQQ""","""Quick and clear explanation of…"


### Embed Titles and Transcripts

creating embeddings with column and model(df frame)

In [4]:
# define "parameters"
column_to_embed_list = ['title', 'transcript']
model_name_list = ["all-MiniLM-L6-v2", "multi-qa-distilbert-cos-v1", "multi-qa-mpnet-base-dot-v1"]

In [5]:
# generate embeddings for each combination of column and model

# initialize dict to keep track of all text embeddings
text_embedding_dict = {}

for model_name in model_name_list:

    #define embedding model
    model = SentenceTransformer(model_name) 

    for column_name in column_to_embed_list:

        # define text embedding identifier
        key_name = model_name + "_" + column_name
        print(key_name)

        # generate embeddings for text under column_name
        %time embedding_arr = model.encode(df[column_name].to_list())
        print('')

        # append embeddings to dict
        text_embedding_dict[key_name] = embedding_arr

all-MiniLM-L6-v2_title
CPU times: total: 3.61 s
Wall time: 704 ms

all-MiniLM-L6-v2_transcript
CPU times: total: 48.1 s
Wall time: 7.97 s

multi-qa-distilbert-cos-v1_title
CPU times: total: 11.8 s
Wall time: 2.2 s

multi-qa-distilbert-cos-v1_transcript
CPU times: total: 4min 49s
Wall time: 49.3 s

multi-qa-mpnet-base-dot-v1_title
CPU times: total: 25 s
Wall time: 4.67 s

multi-qa-mpnet-base-dot-v1_transcript
CPU times: total: 11min 28s
Wall time: 1min 56s



In [6]:
embedding_arr.shape

(269, 768)

In [7]:
text_embedding_dict

{'all-MiniLM-L6-v2_title': array([[-0.04577294, -0.03536161,  0.01674047, ...,  0.08382568,
          0.01537051,  0.04929987],
        [-0.08249424, -0.03172299,  0.05575989, ...,  0.00093511,
         -0.07605799, -0.01927041],
        [-0.07484752, -0.06010093,  0.03214505, ...,  0.01209698,
         -0.0929385 ,  0.00065229],
        ...,
        [ 0.03343456,  0.04637399, -0.07690614, ...,  0.10551995,
          0.02082146, -0.00478728],
        [-0.0994635 , -0.03978509, -0.03035961, ..., -0.03320049,
          0.04437448,  0.04836575],
        [-0.02012799, -0.02282261,  0.02265325, ..., -0.07128242,
          0.05690396,  0.0577021 ]], shape=(269, 384), dtype=float32),
 'all-MiniLM-L6-v2_transcript': array([[-0.06302463, -0.12518334,  0.04661809, ...,  0.09399724,
         -0.0851324 ,  0.04195585],
        [-0.09804534, -0.07821772, -0.00983041, ...,  0.06579232,
         -0.0460928 , -0.02563621],
        [-0.08799476, -0.15343395,  0.00779813, ...,  0.08505448,
         -0.0

In [9]:
text_embedding_dict['all-MiniLM-L6-v2_title'].shape # that changes for each of the model

(269, 384)

### Embed Queries

creating embeddings with query and model(df_eval frame)

In [10]:
query_embedding_dict = {}

for model_name in model_name_list:

    #define embedding model
    model = SentenceTransformer(model_name)
    print(model_name)

    # embed query text
    %time embedding_arr = model.encode(df_eval['query'].to_list())
    print('')

    # append embedding to dict
    query_embedding_dict[model_name] = embedding_arr

all-MiniLM-L6-v2
CPU times: total: 2.77 s
Wall time: 469 ms

multi-qa-distilbert-cos-v1
CPU times: total: 8.05 s
Wall time: 1.37 s

multi-qa-mpnet-base-dot-v1
CPU times: total: 16.8 s
Wall time: 2.84 s



### Evaluate Search Methods

In [11]:
def returnVideoID_index(df: pl.dataframe.frame.DataFrame, df_eval: pl.dataframe.frame.DataFrame, query_n: int) -> int:
    """
        Function to return the index of a dataframe corresponding to the nth row in evaluation dataframe
    """

    return [i for i in range(len(df)) if df['video_id'][i]==df_eval['video_id'][query_n]][0]

In [12]:
def evalTrueRankings(dist_arr_isorted: np.ndarray, df: pl.dataframe.frame.DataFrame, df_eval: pl.dataframe.frame.DataFrame) -> np.ndarray:
    """
        Function to return "true" video ID rankings for each evaluation query
    """
    
    # intialize array to store rankings of "correct" search result
    true_rank_arr = np.empty((1, dist_arr_isorted.shape[1]))
    
    # evaluate ranking of correct result for each query
    for query_n in range(dist_arr_isorted.shape[1]):
    
        # return "true" video ID's in df
        video_id_idx = returnVideoID_index(df, df_eval, query_n)
        
        # evaluate the ranking of the "true" video ID
        true_rank = np.argwhere(dist_arr_isorted[:,query_n]==video_id_idx)[0][0]
        
        # store the "true" video ID's ranking in array
        true_rank_arr[0,query_n] = true_rank

    return true_rank_arr

In [13]:
# initialize distance metrics to experiment
dist_name_list = ['euclidean', 'manhattan', 'chebyshev']
sim_name_list = ['cos_sim', 'dot_score']

In [None]:
# evaluate all possible combinations of model, columns to embed, and distance metrics
# model:3, column: 3, matrics: 5 = 45
# initialize list to store results
eval_results = []

# loop through all models
for model_name in model_name_list:

    # generate query embedding(df_eval)
    query_embedding = query_embedding_dict[model_name]
    
    # loop through text columns(df)
    for column_name in column_to_embed_list:

        # generate column embedding
        embedding_arr = text_embedding_dict[model_name+'_'+column_name]

        # loop through distance metrics
        for dist_name in dist_name_list:

            # compute distance between video text and query
            dist = DistanceMetric.get_metric(dist_name)
            dist_arr = dist.pairwise(embedding_arr, query_embedding)

            # sort indexes of distance array
            dist_arr_isorted = np.argsort(dist_arr, axis=0)

            # define label for search method
            method_name = "_".join([model_name, column_name, dist_name])

            # evaluate the ranking of the ground truth
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

            # store results
            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)

        # loop through sbert similarity scores
        for sim_name in sim_name_list:
            # apply similarity score from sbert
            cmd = "dist_arr = -util." + sim_name + "(embedding_arr, query_embedding)"
            exec(cmd)
    
            # sort indexes of distance array (notice minus sign in front of cosine similarity)
            dist_arr_isorted = np.argsort(dist_arr, axis=0)
    
            # define label for search method
            method_name = "_".join([model_name, column_name, sim_name.replace("_","-")])
    
            # evaluate the ranking of the ground truth
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)
    
            # store results
            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)

In [15]:
cmd

'dist_arr = -util.dot_score(embedding_arr, query_embedding)'

In [16]:
dist_arr.shape # 1. df row, 2. df_eval row
dist_arr[0,2]

tensor(-22.2477)

In [17]:
true_rank_arr

array([[  0.,   1.,   0.,   0.,   0.,   4.,   0.,   0.,   0.,   1.,   0.,
          2.,   0.,   8.,   0.,   0.,   0.,   0.,  35.,  36.,   0.,   0.,
          0.,   0.,   3.,   0.,   0.,   4.,   1.,   0.,   0.,   0.,   0.,
          0.,   0.,   1.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,   0.,
          0.,   0.,   1.,  46.,   5.,   0.,   1.,   0.,   4.,   1.,   0.,
          0.,   3.,   1.,   0.,   0.,   6.,   1.,   0.,   0.,  64.,   2.,
          2.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,
          0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,
          0.,   0.,   1.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   2.,   2.,   0., 253.,  28.,   0.,   0.,   0.,
          9.,   1.,   1.,   1.,   0.,   3.,   0.,   0.,   1., 249.,  13.,
          0.,   1., 177.,   2.,   1.,   0.,   0.,   0.,   1.,   0.,   0.,
          0.,   0.,   0.,   1.,   2.,   0.,   3.,   0.,   2.,   0.,   0.,
          0.,   0.,   0.,   0.,   0., 

In [18]:
# compute rankings for title + transcripts embedding
for model_name in model_name_list:
    
    # generate embeddings
    embedding_arr1 = text_embedding_dict[model_name+'_title']
    embedding_arr2 = text_embedding_dict[model_name+'_transcript']
    query_embedding = query_embedding_dict[model_name]

    for dist_name in dist_name_list:

        # compute distance between video text(title + transcript) and query
        dist = DistanceMetric.get_metric(dist_name)
        dist_arr = dist.pairwise(embedding_arr1, query_embedding) + dist.pairwise(embedding_arr2, query_embedding)

        # sort indexes of distance array
        dist_arr_isorted = np.argsort(dist_arr, axis=0)

         # define label for search method
        method_name = "_".join([model_name, "title-transcript", dist_name])

        # evaluate the ranking of the ground truth
        true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

        # store results
        eval_list = [method_name] + true_rank_arr.tolist()[0]
        eval_results.append(eval_list)

    # loop through sbert similarity scores
    for sim_name in sim_name_list:
        # apply similarity score from sbert
        cmd = "dist_arr = -util." + sim_name + "(embedding_arr1, query_embedding) - util."+ sim_name + "(embedding_arr2, query_embedding)"
        exec(cmd)

        # sort indexes of distance array (notice minus sign in front of cosine similarity)
        dist_arr_isorted = np.argsort(dist_arr, axis=0)

        # define label for search method
        method_name = "_".join([model_name, "title-transcript", sim_name.replace("_","-")])

        # evaluate the ranking of the ground truth
        true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

        # store results
        eval_list = [method_name] + true_rank_arr.tolist()[0]
        eval_results.append(eval_list)

In [19]:
len(eval_results)

45

In [20]:
# define schema for results dataframe
schema_dict = {'method_name':str}
for i in range(len(eval_results[0])-1):
    schema_dict['rank_query-'+str(i)] = float

# store results in dataframe
df_results = pl.DataFrame(eval_results, schema=schema_dict)
df_results.head()

  df_results = pl.DataFrame(eval_results, schema=schema_dict)


method_name,rank_query-0,rank_query-1,rank_query-2,rank_query-3,rank_query-4,rank_query-5,rank_query-6,rank_query-7,rank_query-8,rank_query-9,rank_query-10,rank_query-11,rank_query-12,rank_query-13,rank_query-14,rank_query-15,rank_query-16,rank_query-17,rank_query-18,rank_query-19,rank_query-20,rank_query-21,rank_query-22,rank_query-23,rank_query-24,rank_query-25,rank_query-26,rank_query-27,rank_query-28,rank_query-29,rank_query-30,rank_query-31,rank_query-32,rank_query-33,rank_query-34,rank_query-35,…,rank_query-175,rank_query-176,rank_query-177,rank_query-178,rank_query-179,rank_query-180,rank_query-181,rank_query-182,rank_query-183,rank_query-184,rank_query-185,rank_query-186,rank_query-187,rank_query-188,rank_query-189,rank_query-190,rank_query-191,rank_query-192,rank_query-193,rank_query-194,rank_query-195,rank_query-196,rank_query-197,rank_query-198,rank_query-199,rank_query-200,rank_query-201,rank_query-202,rank_query-203,rank_query-204,rank_query-205,rank_query-206,rank_query-207,rank_query-208,rank_query-209,rank_query-210,rank_query-211
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""all-MiniLM-L6-v2_title_euclide…",0.0,0.0,0.0,0.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,…,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
"""all-MiniLM-L6-v2_title_manhatt…",0.0,0.0,0.0,0.0,19.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,…,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
"""all-MiniLM-L6-v2_title_chebysh…",0.0,0.0,0.0,2.0,30.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,146.0,62.0,0.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,…,0.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,0.0
"""all-MiniLM-L6-v2_title_cos-sim""",0.0,0.0,0.0,0.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,…,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
"""all-MiniLM-L6-v2_title_dot-sco…",0.0,0.0,0.0,0.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,…,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [21]:
# compute mean rankings of ground truth search result
df_results = df_results.with_columns(new_col=pl.mean_horizontal(df_results.columns[1:])).rename({"new_col": "rank_query-mean"})

In [22]:
# compute number of ground truth results which appear in top 3
for i in [1,3]:
    df_results = df_results.with_columns(new_col=pl.sum_horizontal(df_results[:,1:-1]<i)).rename({"new_col": "num_in_top-"+str(i)})

### Top Results

In [23]:
df_summary = df_results[['method_name', "rank_query-mean", "num_in_top-1", "num_in_top-3"]]

In [24]:
print(df_summary.sort('rank_query-mean').head())

shape: (5, 4)
┌─────────────────────────────────┬─────────────────┬──────────────┬──────────────┐
│ method_name                     ┆ rank_query-mean ┆ num_in_top-1 ┆ num_in_top-3 │
│ ---                             ┆ ---             ┆ ---          ┆ ---          │
│ str                             ┆ f64             ┆ u32          ┆ u32          │
╞═════════════════════════════════╪═════════════════╪══════════════╪══════════════╡
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.264151        ┆ 183          ┆ 207          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.264151        ┆ 183          ┆ 207          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.264151        ┆ 183          ┆ 207          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.268868        ┆ 182          ┆ 208          │
│ multi-qa-mpnet-base-dot-v1_tit… ┆ 0.292453        ┆ 178          ┆ 207          │
└─────────────────────────────────┴─────────────────┴──────────────┴──────────────┘


In [25]:
df_summary.sort('rank_query-mean').head()[0,0]

'multi-qa-distilbert-cos-v1_title_euclidean'

In [26]:
print(df_summary.sort("num_in_top-1", descending=True).head())

shape: (5, 4)
┌─────────────────────────────────┬─────────────────┬──────────────┬──────────────┐
│ method_name                     ┆ rank_query-mean ┆ num_in_top-1 ┆ num_in_top-3 │
│ ---                             ┆ ---             ┆ ---          ┆ ---          │
│ str                             ┆ f64             ┆ u32          ┆ u32          │
╞═════════════════════════════════╪═════════════════╪══════════════╪══════════════╡
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.264151        ┆ 183          ┆ 207          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.264151        ┆ 183          ┆ 207          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.264151        ┆ 183          ┆ 207          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.268868        ┆ 182          ┆ 208          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.358491        ┆ 181          ┆ 206          │
└─────────────────────────────────┴─────────────────┴──────────────┴──────────────┘


In [27]:
df_summary.sort("num_in_top-1", descending=True).head()[0,0]

'multi-qa-distilbert-cos-v1_title_euclidean'

In [28]:
print(df_summary.sort("num_in_top-3", descending=True).head())

shape: (5, 4)
┌─────────────────────────────────┬─────────────────┬──────────────┬──────────────┐
│ method_name                     ┆ rank_query-mean ┆ num_in_top-1 ┆ num_in_top-3 │
│ ---                             ┆ ---             ┆ ---          ┆ ---          │
│ str                             ┆ f64             ┆ u32          ┆ u32          │
╞═════════════════════════════════╪═════════════════╪══════════════╪══════════════╡
│ all-MiniLM-L6-v2_title_manhatt… ┆ 0.334906        ┆ 179          ┆ 208          │
│ multi-qa-distilbert-cos-v1_tit… ┆ 0.268868        ┆ 182          ┆ 208          │
│ all-MiniLM-L6-v2_title_euclide… ┆ 0.334906        ┆ 177          ┆ 207          │
│ all-MiniLM-L6-v2_title_cos-sim  ┆ 0.334906        ┆ 177          ┆ 207          │
│ all-MiniLM-L6-v2_title_dot-sco… ┆ 0.334906        ┆ 177          ┆ 207          │
└─────────────────────────────────┴─────────────────┴──────────────┴──────────────┘


In [29]:
df_summary.sort("num_in_top-3", descending=True).head()[0,0]

'all-MiniLM-L6-v2_title_manhattan'

In [30]:
for i in range(4):
    print(df_summary.sort("num_in_top-3", descending=True)['method_name'][i])

all-MiniLM-L6-v2_title_manhattan
multi-qa-distilbert-cos-v1_title_manhattan
all-MiniLM-L6-v2_title_euclidean
all-MiniLM-L6-v2_title_cos-sim
