# Build Embeddings

## Dataset

First we need to download the YT transcriptions dataset:

In [1]:
from datasets import load_dataset  # !pip install datasets

yt_transcriptions_ds = load_dataset(
    "jamescalam/youtube-transcriptions",
    split="train",
  	revision="8dca835"
)
yt_transcriptions_ds

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset json (/home/codespace/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-af3de6eb3e148fe9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


Dataset({
    features: ['title', 'visibility', 'published', 'url', 'id', 'text', 'start', 'end'],
    num_rows: 27214
})

In [2]:
yt_transcriptions_ds[0]

{'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'visibility': 'Public',
 'published': '2021-07-06 13:00:03 UTC',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'id': '35Pdoyi6ZoQ-t0.0',
 'text': 'Hi, welcome to the video.',
 'start': 0.0,
 'end': 9.36}

In [3]:
yt_transcriptions_ds[1]

{'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'visibility': 'Public',
 'published': '2021-07-06 13:00:03 UTC',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'id': '35Pdoyi6ZoQ-t3.0',
 'text': 'So this is the fourth video in a Transformers',
 'start': 3.0,
 'end': 11.56}

The sentences are all quite short at the moment, we need to merge them to create better chunks of text containing more meaning.

In [4]:
from tqdm.auto import tqdm
new_data = []

window = 6  # number of sentences to combine
stride = 3  # number of sentences to 'stride' over, used to create overlap
url_counter_part = 0

for i in tqdm(range(0, len(yt_transcriptions_ds), stride)):
    i_end = min(len(yt_transcriptions_ds)-1, i+window)

    if yt_transcriptions_ds[i]['title'] != yt_transcriptions_ds[i_end]['title']:
        url_counter_part = 0
        # in this case we skip this entry as we have start/end of two videos
        continue
    
    url_id = yt_transcriptions_ds[i]['url'].split('/')[-1]
    segment_id = f"{url_id}.{url_counter_part}"
    url_counter_part += 1

    text = ' '.join(yt_transcriptions_ds[i:i_end]['text'])
    new_data.append({
        'start': yt_transcriptions_ds[i]['start'],
        'end': yt_transcriptions_ds[i_end]['end'],
        'title': yt_transcriptions_ds[i]['title'],
        'text': text,
        'id': segment_id,
        'url': yt_transcriptions_ds[i]['url'],
        'published': yt_transcriptions_ds[i]['published']
    })

100%|██████████| 9072/9072 [00:08<00:00, 1083.87it/s]


In [5]:
new_data[0]

{'start': 0.0,
 'end': 25.76,
 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'text': "Hi, welcome to the video. So this is the fourth video in a Transformers from Scratch mini series. So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data.",
 'id': '35Pdoyi6ZoQ.0',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'published': '2021-07-06 13:00:03 UTC'}

In [6]:
new_data[1]

{'start': 11.56,
 'end': 35.96,
 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'text': "So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data. We built a tokenizer with it. And then we've set up our input pipeline ready to begin actually training our model, which",
 'id': '35Pdoyi6ZoQ.1',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'published': '2021-07-06 13:00:03 UTC'}

In [7]:
new_data[499]

{'start': 1090.0,
 'end': 1106.0,
 'title': 'Training BERT #4 - Train With Next Sentence Prediction (NSP)',
 'text': "So let's expand that out a little bit. So we'll go with token type IDs. Let's go with number 0.",
 'id': 'x1lAcT3xl5M.176',
 'url': 'https://youtu.be/x1lAcT3xl5M',
 'published': '2021-05-27 16:15:39 UTC'}

In [8]:
new_data[500]

{'start': 1096.0,
 'end': 1112.0,
 'title': 'Training BERT #4 - Train With Next Sentence Prediction (NSP)',
 'text': "token type IDs. Let's go with number 0. Okay. So now we see okay the reason is because they're in the middle here.",
 'id': 'x1lAcT3xl5M.177',
 'url': 'https://youtu.be/x1lAcT3xl5M',
 'published': '2021-05-27 16:15:39 UTC'}

## Initialize Embedding Model

In [9]:
import pinecone
import os

index_id = "youtube-search"

pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV")
)

# if index_id not in pinecone.list_indexes():
#     pinecone.create_index(
#         index_id,
#         dim,
#         metric="dotproduct"
#     )

index = pinecone.Index(index_id)

In [10]:
stats = index.describe_index_stats().to_dict()
print(stats)

if stats["total_vector_count"] > 0:
    index.delete(deleteAll='true')
    stats = index.describe_index_stats().to_dict()
    print(stats)

{'namespaces': {}, 'dimension': 1536, 'index_fullness': 0.0, 'total_vector_count': 0}


Now let's begin building the embeddings...

In [11]:
new_data[0:2]

[{'start': 0.0,
  'end': 25.76,
  'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
  'text': "Hi, welcome to the video. So this is the fourth video in a Transformers from Scratch mini series. So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data.",
  'id': '35Pdoyi6ZoQ.0',
  'url': 'https://youtu.be/35Pdoyi6ZoQ',
  'published': '2021-07-06 13:00:03 UTC'},
 {'start': 11.56,
  'end': 35.96,
  'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
  'text': "So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data. We built a tokenizer with it. And then we've set up our input pipeline ready to begin actually training our model, which",
  'id': '35Pdoyi6ZoQ.1',
  'url': 'https://youtu.be/35Pdoyi6ZoQ',
  'published': '2021-07-06 13:00:03 UTC'}]

In [12]:
from tqdm.auto import tqdm
import openai

# we encode and insert in batches of 64
batch_size = 64

# loop through in batches of 64
for i in tqdm(range(0, len(new_data), batch_size)):
    # find end position of batch (for when we hit end of data)
    i_end = min(len(new_data)-1, i+batch_size)
    # extract the metadata like text, start/end positions, etc
    batch_meta = [{
        "text": new_data[x]["text"],
        "start": new_data[x]["start"],
        "end": new_data[x]["end"],
        "url": new_data[x]["url"],
        "title": new_data[x]["title"]
    } for x in range(i, i_end)]
    # extract only text to be encoded by embedding model
    batch_text = [row['text'] for row in new_data[i:i_end]]
    # create the embedding vectors
    res = openai.Embedding.create(input=batch_text, engine="text-embedding-ada-002")
    batch_embeds = [record['embedding'] for record in res['data']]
    # extract IDs to be attached to each embedding and metadata
    batch_ids = [row['id'] for row in new_data[i:i_end]]
    # 'upsert' (eg insert) IDs, embeddings, and metadata to index
    to_upsert = list(zip(batch_ids, batch_embeds, batch_meta))
    index.upsert(to_upsert)

100%|██████████| 139/139 [01:34<00:00,  1.47it/s]


In [13]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 8857}},
 'total_vector_count': 8857}

In [14]:
query = "what is OpenAI's CLIP?"

from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
query_vectors = [embeddings.embed_query(query)]

# xq = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']
# index.query(vector=xq, top_k=5, include_metadata=True)

index.query(queries=query_vectors, top_k=5, include_metadata=True)

{'results': [{'matches': [{'id': 'fGwH2YoQkDM.0',
                           'metadata': {'end': 77.2,
                                        'start': 17.36,
                                        'text': 'is Clip which was built and '
                                                'trained by OpenAI. Now Clip '
                                                'is open source and it has '
                                                'been around for a little '
                                                "while. It's been around from "
                                                'since the start of 2021 but '
                                                "in the past few months we've "
                                                'seen the adoption of Clip '
                                                'grow at a pretty insane rate. '
                                                'It has found uses in a load '
                                                'of th