# Create video index

### Imports

In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


### Load Data

In [None]:
# Read the raw video transcripts table and preview the first few rows.
transcripts_path = 'data/video-transcripts.parquet'
df = pl.read_parquet(transcripts_path)
df.head()

video_id,datetime,title,transcript
str,datetime[μs],str,str
"""qPN_XZcJf_s""",2025-05-05 04:01:03,"""Reinforcement Learning with Hu…","""If you tell me what you like a…"
"""DVGmsnxB2UQ""",2025-04-14 04:00:27,"""Reinforcement Learning with Ne…","""if you make a guess and you ma…"
"""9hbQieQh7-o""",2025-04-07 04:00:17,"""Reinforcement Learning with Ne…","""When you don't know, take a gu…"
"""Z-T0iJEXiwM""",2025-03-31 04:00:25,"""Reinforcement Learning: Essent…","""reinforcement learning it's ju…"
"""_kstkMF-lQQ""",2025-02-12 14:20:19,"""StatQuest on DeepLearning.AI!!…","""the encoder model was used as …"


### Embed titles and transcripts (note: even the data in the eval set might not represent real-life scenarios)

In [3]:
# Name of the pretrained checkpoint passed to SentenceTransformer below.
model_name = 'all-MiniLM-L6-v2'
# Text columns of df to embed; each one gets its own set of embedding columns.
column_name_list = ['title', 'transcript']

In [4]:
# Load the sentence-embedding model named in the config cell.
model = SentenceTransformer(model_name)

# Build one embedding frame per text column, then join everything with a single
# horizontal concat (avoids re-copying the growing frame on every iteration).
embedding_frames = []
for column_name in column_name_list:
    # generate embeddings: a 2-D array with one row per entry of the column
    embedding_arr = model.encode(df[column_name].to_list())

    # store embeddings in a dataframe, one float (f64) column per embedding dimension,
    # named '<column>_embedding-<i>'
    schema_dict = {f'{column_name}_embedding-{i}': float for i in range(embedding_arr.shape[1])}
    # orient='row' states explicitly that each array row is one record, so the
    # layout never depends on polars' orientation inference (which compares the
    # schema length against the array shape).
    df_embedding = pl.DataFrame(embedding_arr, schema=schema_dict, orient='row')
    embedding_frames.append(df_embedding)

# Drop embedding columns left over from a previous run of this cell so that
# re-running it replaces them instead of failing on duplicate column names.
embedding_column_names = {name for frame in embedding_frames for name in frame.columns}
df_base = df.select([name for name in df.columns if name not in embedding_column_names])

# append embeddings to video index (original columns first, then one group of
# embedding columns per entry of column_name_list, in order)
df = pl.concat([df_base, *embedding_frames], how='horizontal')

In [5]:
# Sanity check: (rows, columns) of the index after the embedding columns were appended.
df.shape

(269, 772)

In [6]:
# Preview the first rows of the index, including the appended embedding columns.
df.head()

video_id,datetime,title,transcript,title_embedding-0,title_embedding-1,title_embedding-2,title_embedding-3,title_embedding-4,title_embedding-5,title_embedding-6,title_embedding-7,title_embedding-8,title_embedding-9,title_embedding-10,title_embedding-11,title_embedding-12,title_embedding-13,title_embedding-14,title_embedding-15,title_embedding-16,title_embedding-17,title_embedding-18,title_embedding-19,title_embedding-20,title_embedding-21,title_embedding-22,title_embedding-23,title_embedding-24,title_embedding-25,title_embedding-26,title_embedding-27,title_embedding-28,title_embedding-29,title_embedding-30,title_embedding-31,title_embedding-32,…,transcript_embedding-347,transcript_embedding-348,transcript_embedding-349,transcript_embedding-350,transcript_embedding-351,transcript_embedding-352,transcript_embedding-353,transcript_embedding-354,transcript_embedding-355,transcript_embedding-356,transcript_embedding-357,transcript_embedding-358,transcript_embedding-359,transcript_embedding-360,transcript_embedding-361,transcript_embedding-362,transcript_embedding-363,transcript_embedding-364,transcript_embedding-365,transcript_embedding-366,transcript_embedding-367,transcript_embedding-368,transcript_embedding-369,transcript_embedding-370,transcript_embedding-371,transcript_embedding-372,transcript_embedding-373,transcript_embedding-374,transcript_embedding-375,transcript_embedding-376,transcript_embedding-377,transcript_embedding-378,transcript_embedding-379,transcript_embedding-380,transcript_embedding-381,transcript_embedding-382,transcript_embedding-383
str,datetime[μs],str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""qPN_XZcJf_s""",2025-05-05 04:01:03,"""Reinforcement Learning with Hu…","""If you tell me what you like a…",-0.045773,-0.035362,0.01674,0.022222,0.030434,0.036626,0.019937,-0.023566,0.037335,0.069336,-0.020144,0.021346,0.070331,-0.021366,-0.079474,-0.02359,-0.018571,-0.009298,-0.108042,-0.06373,0.034273,-0.044438,0.019637,0.008817,-0.037196,0.049993,-0.075927,0.04082,0.040253,-0.067781,0.090154,0.051394,0.059853,…,0.049012,-0.024015,-0.047154,0.021765,-0.011637,-0.003636,-0.012746,0.079373,0.043273,0.089281,-0.101504,0.014149,-0.043612,0.014073,0.081181,0.035665,-0.062193,-0.018225,0.010093,-0.083076,-0.176627,0.011708,0.015269,0.005692,-0.035475,-0.04263,0.029167,0.071651,0.063172,-0.015163,-0.005227,-0.013139,0.074127,-0.038681,0.093997,-0.085132,0.041956
"""DVGmsnxB2UQ""",2025-04-14 04:00:27,"""Reinforcement Learning with Ne…","""if you make a guess and you ma…",-0.082494,-0.031723,0.05576,-0.005752,-0.060275,0.061898,0.00661,0.032123,-0.031807,0.010386,0.013829,0.047611,0.040107,0.006958,-0.10259,0.120285,-0.048499,0.004799,-0.085534,-0.026357,-0.013926,-0.046285,-0.010812,0.008536,0.048341,0.055112,0.005201,0.036438,0.007815,-0.011717,0.08255,-0.085784,0.058793,…,0.063242,0.023437,0.041904,0.073757,-0.018452,0.024691,0.017287,0.103366,0.043934,0.036895,-0.013792,0.007746,-0.064714,-0.079728,-0.033936,0.060437,-0.031876,-0.031068,0.013123,-0.088113,-0.068544,0.008913,-0.005257,0.045811,0.053224,-0.009161,0.071515,0.053964,-0.096705,-0.001729,-0.136912,-0.049363,0.07667,0.006123,0.065792,-0.046093,-0.025636
"""9hbQieQh7-o""",2025-04-07 04:00:17,"""Reinforcement Learning with Ne…","""When you don't know, take a gu…",-0.074848,-0.060101,0.032145,-0.033778,-0.034553,0.066329,0.087384,-0.010923,-0.030688,0.024141,-0.01432,0.04973,0.010701,-0.033269,-0.083447,0.016995,0.002283,-0.015884,-0.112428,-0.051657,0.013384,-0.061633,-0.016856,-0.018731,-0.024453,0.067897,0.001593,0.060971,0.041908,-0.02969,0.083843,-0.059907,0.051653,…,0.039565,0.050846,0.040488,0.082357,0.016336,0.005515,0.007982,0.086907,0.09247,0.040171,-0.047112,-0.054824,-0.031779,-0.038711,0.000213,0.058527,0.006905,-0.011886,-0.026403,-0.048213,-0.076524,0.024739,-0.059035,0.043971,-0.022064,-0.010944,0.065385,0.064695,-0.033358,-0.019673,-0.072128,-0.036346,0.11036,0.027279,0.085054,-0.081458,-0.05277
"""Z-T0iJEXiwM""",2025-03-31 04:00:25,"""Reinforcement Learning: Essent…","""reinforcement learning it's ju…",-0.015067,-0.004333,0.008825,-0.028864,-0.041984,0.028431,0.0733,-0.029926,-0.015934,0.063034,-0.002536,0.047227,0.014705,-0.018988,-0.001431,-0.024885,0.013991,0.020439,-0.110063,-0.104455,0.017395,-0.040076,-0.014649,-0.011022,-0.078105,0.06714,0.011409,0.068845,0.074359,-0.068755,0.067044,-0.003313,0.083593,…,-0.002136,0.032669,0.03905,0.029078,-0.061164,0.017348,-0.003463,0.140296,-0.054597,0.022996,-0.04742,0.016381,-0.029903,-0.109157,-0.033581,0.048115,0.031936,-0.077697,0.001789,-0.109485,0.008454,0.030693,0.02833,-0.025492,-0.058461,0.00634,0.010639,0.104263,-0.054356,-0.003619,-0.044489,0.045421,0.120995,0.043946,0.05697,-0.088908,0.011949
"""_kstkMF-lQQ""",2025-02-12 14:20:19,"""StatQuest on DeepLearning.AI!!…","""the encoder model was used as …",-0.018511,-0.118259,-0.017914,-0.004275,0.004172,0.007538,-0.051015,-0.014277,-0.068753,0.00051,-0.048604,-0.046289,-0.037247,-0.009776,-0.082343,0.001161,-0.005324,0.064697,-0.128199,-0.076982,0.020576,-0.004608,0.028616,-0.087685,0.053808,0.014983,0.06356,-0.066436,-0.015298,-0.04563,0.001485,0.060205,0.0494,…,0.110088,0.029156,0.014005,0.027127,0.073092,0.012903,-0.057203,0.022623,0.064149,-0.005043,-0.107489,-0.016272,0.043243,0.083965,0.045502,-0.050706,0.04154,0.094311,0.10044,0.015832,-0.048033,-0.017098,0.025265,0.017955,-0.052148,-0.055641,0.032853,-0.041743,0.033333,-0.024929,-0.029056,0.031726,-0.018474,-0.010247,0.084994,-0.059909,0.077756


### Export File

In [7]:
# Persist the video index (original columns plus embeddings) as a parquet file.
index_path = 'data/video-index.parquet'
df.write_parquet(index_path)