# Create video index

Code authored by: Shaw Talebi <br>

Video link: https://youtu.be/6qCrvlHRhcM

### imports

In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer



### load data

In [2]:
# read the video transcripts into a polars DataFrame
df = pl.read_parquet('data/video-transcripts.parquet')
# preview the first rows (columns: video_id, datetime, title, transcript)
df.head()

video_id,datetime,title,transcript
str,datetime[μs],str,str
"""03x2oYg9oME""",2024-04-25 15:16:00,"""Data Science P…","""this video is …"
"""O5i_mMUM94c""",2024-04-19 14:05:54,"""How I’d learne…","""here's how I'd…"
"""xm9devSQEqU""",2024-04-18 15:59:02,"""4 Skills You N…","""although it is…"
"""Z6CmuVEi7QY""",2024-04-11 10:00:27,"""How I'd Learn …","""when I was fir…"
"""INlCLmWlojY""",2024-04-04 18:45:00,"""I Was Wrong Ab…","""last year I qu…"


### embed titles and transcripts

In [3]:
# name of the model handed to SentenceTransformer in the next cell
model_name = 'all-MiniLM-L6-v2'
# text columns of df to embed; each one gets its own set of embedding columns
column_name_list = ['title', 'transcript']

In [4]:
# load the sentence-embedding model named in `model_name`
model = SentenceTransformer(model_name)

for column_name in column_name_list:
    # generate embeddings for every value in this text column
    # (2-D array: axis 1 is the embedding dimension, as used by shape[1] below)
    embedding_arr = model.encode(df[column_name].to_list())

    # store embeddings in a dataframe with columns '<column_name>_embedding-<i>';
    # `float` in the schema makes every embedding column Float64
    embedding_prefix = column_name + '_embedding-'
    schema_dict = {embedding_prefix + str(i): float for i in range(embedding_arr.shape[1])}
    # orient='row' pins one array row to one dataframe row, so the layout is
    # never inferred from the array's shape
    df_embedding = pl.DataFrame(embedding_arr, schema=schema_dict, orient='row')

    # drop embedding columns left over from a previous run of this cell, so the
    # cell can be re-run without the concat colliding on duplicate column names
    # (on a fresh kernel this keeps every column and changes nothing)
    df = df.select([name for name in df.columns if not name.startswith(embedding_prefix)])

    # append embeddings to video index
    df = pl.concat([df, df_embedding], how='horizontal')

In [8]:
# sanity check on the index size: rows = videos,
# columns = 4 original columns + one block of embedding columns per entry in column_name_list
df.shape

(83, 772)

In [5]:
# preview the first rows of the index, now including the title/transcript embedding columns
df.head()

video_id,datetime,title,transcript,title_embedding-0,title_embedding-1,title_embedding-2,title_embedding-3,title_embedding-4,title_embedding-5,title_embedding-6,title_embedding-7,title_embedding-8,title_embedding-9,title_embedding-10,title_embedding-11,title_embedding-12,title_embedding-13,title_embedding-14,title_embedding-15,title_embedding-16,title_embedding-17,title_embedding-18,title_embedding-19,title_embedding-20,title_embedding-21,title_embedding-22,title_embedding-23,title_embedding-24,title_embedding-25,title_embedding-26,title_embedding-27,title_embedding-28,title_embedding-29,title_embedding-30,title_embedding-31,title_embedding-32,…,transcript_embedding-347,transcript_embedding-348,transcript_embedding-349,transcript_embedding-350,transcript_embedding-351,transcript_embedding-352,transcript_embedding-353,transcript_embedding-354,transcript_embedding-355,transcript_embedding-356,transcript_embedding-357,transcript_embedding-358,transcript_embedding-359,transcript_embedding-360,transcript_embedding-361,transcript_embedding-362,transcript_embedding-363,transcript_embedding-364,transcript_embedding-365,transcript_embedding-366,transcript_embedding-367,transcript_embedding-368,transcript_embedding-369,transcript_embedding-370,transcript_embedding-371,transcript_embedding-372,transcript_embedding-373,transcript_embedding-374,transcript_embedding-375,transcript_embedding-376,transcript_embedding-377,transcript_embedding-378,transcript_embedding-379,transcript_embedding-380,transcript_embedding-381,transcript_embedding-382,transcript_embedding-383
str,datetime[μs],str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""03x2oYg9oME""",2024-04-25 15:16:00,"""Data Science P…","""this video is …",-0.015707,-0.000803,-0.000592,0.014483,0.054659,-0.077496,-0.002334,-0.037766,-0.076303,0.095494,-0.023669,-0.065215,0.079585,-0.008432,-0.049648,0.090498,-0.083202,-0.051613,0.008807,-0.082336,-0.02177,-0.031772,-0.010281,0.043854,-0.042952,0.085398,0.053058,0.056221,-0.006999,0.011987,-0.025156,0.063567,0.051586,…,0.081404,-0.00473,-0.019275,0.015354,0.01367,-0.048947,0.014555,0.058803,-0.07865,0.039052,0.018853,0.030402,0.010164,-0.01578,0.028731,0.039881,0.107271,-0.086652,0.068573,-0.128907,0.000532,-7.4e-05,-0.030465,-0.041026,0.044691,-0.009989,0.077964,0.073521,0.09619,0.040807,-0.009223,-0.074324,0.063215,0.017714,0.006442,-0.029179,-0.017104
"""O5i_mMUM94c""",2024-04-19 14:05:54,"""How I’d learne…","""here's how I'd…",-0.012019,-0.064174,0.070952,0.070263,0.058338,-0.06399,0.090048,-0.018103,-0.030347,0.045006,0.028399,-0.050822,0.03355,0.021837,-0.054941,0.031815,-0.046915,-0.026955,-0.030183,-0.031991,0.006332,0.03895,0.060244,0.002082,0.026674,0.129466,-0.003821,-0.035118,0.048048,-0.034193,0.00103,0.066846,0.035806,…,0.041583,-0.014911,-0.010122,-0.045581,-0.045081,-0.0292,-0.0444,0.030496,-0.059147,0.060487,-0.057393,-0.016889,0.000606,-0.056332,0.05195,0.050159,0.032203,-0.130664,0.076355,-0.093234,0.004911,0.024305,0.036537,-0.028809,-0.008713,-0.044622,-0.007744,0.113142,0.059153,0.001777,0.033651,-0.106226,0.103233,-0.017869,0.051937,-0.127236,0.036246
"""xm9devSQEqU""",2024-04-18 15:59:02,"""4 Skills You N…","""although it is…",0.018166,-0.090621,-0.012552,0.020742,-0.093658,-0.107941,0.005711,0.009971,-0.104353,0.022222,-0.064549,-0.097381,-0.042711,0.019615,-0.034009,0.100303,-0.061772,0.003508,-0.03378,-0.13128,-0.017863,-0.057626,0.015556,-0.072058,-0.010606,0.056719,0.013839,-0.041414,0.017813,-0.005246,0.029335,-0.02157,0.042233,…,0.031496,-0.040242,0.011707,-0.007683,0.008842,0.012777,-0.005904,0.104284,-0.009653,0.082479,-0.040231,-0.030715,0.048085,0.003187,0.087079,-0.013218,0.069204,-0.050548,0.015828,-0.04139,-0.011192,0.034093,0.049186,0.048458,0.004615,0.032301,0.075555,0.070256,0.066454,0.043701,-0.01968,-0.036145,0.09853,0.026589,0.036639,0.009958,-0.012691
"""Z6CmuVEi7QY""",2024-04-11 10:00:27,"""How I'd Learn …","""when I was fir…",-0.009383,-0.081226,0.009785,0.069336,-0.033242,-0.140843,0.033153,-0.012829,-0.079523,0.074586,-0.038719,-0.01216,0.02971,-0.052873,-0.052328,0.032524,-0.041563,-0.006607,-0.011682,-0.052671,-0.021066,-0.021317,-0.006413,-0.03587,0.017534,0.13395,0.069684,0.045327,0.011075,-0.0294,-0.060006,-0.041755,-0.034357,…,0.022374,0.015603,-0.003612,-0.006062,-0.040954,-0.024001,-0.065596,0.01797,-0.101836,0.087055,-0.059411,-0.025912,0.020501,-0.027434,0.060503,0.068892,0.029335,-0.089525,0.072315,-0.052258,0.031635,0.004499,0.046369,-0.005005,0.000506,-0.001204,0.008005,0.092965,0.054399,-0.005864,0.060123,-0.101077,0.124584,-0.010773,0.012077,-0.059352,0.003125
"""INlCLmWlojY""",2024-04-04 18:45:00,"""I Was Wrong Ab…","""last year I qu…",-0.009162,-0.053555,-0.036134,-0.016168,-0.046887,-0.025066,0.038359,0.031727,0.003625,0.009598,-0.017576,0.071134,0.022244,0.014256,-0.01466,0.047682,0.010775,-0.03238,-0.00384,-0.077088,-0.099166,0.046459,-0.020923,-0.053066,-0.020125,0.033315,0.027345,-0.084659,0.014071,0.007794,-0.005639,0.092778,0.040852,…,0.103895,-0.031368,-0.034569,0.004799,-0.05771,-0.001767,-0.055002,0.020011,-0.166268,0.101994,0.020444,0.003303,0.052046,0.02083,0.064734,0.061822,0.067726,-0.058953,0.059298,-0.060392,-0.033443,0.017735,0.035072,0.039556,-0.019598,-0.016372,0.031005,0.027666,-0.009094,-0.035771,0.069946,-0.188073,0.101185,0.035176,-0.070324,-0.096129,-0.019654


### save index to file

In [6]:
# persist the video index (original columns + embedding columns) as a parquet file
df.write_parquet('data/video-index.parquet')