# Create YouTube Title-Thumbnail Training Pairs

Code authored by: Shaw Talebi

[Video link](https://youtu.be/W4s6b2ZM6kI) | [Blog link](https://medium.com/towards-data-science/fine-tuning-multimodal-embedding-models-bf007b1c5da5) <br>
[Dataset](https://huggingface.co/datasets/shawhin/yt-title-thumbnail-pairs) | [Fine-tuned Model](https://huggingface.co/shawhin/clip-title-thumbnail-embeddings)

### imports

In [1]:
from top_secret import my_key
import requests
from isodate import parse_duration

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import DatasetDict, Dataset

### Extract

#### extract video ids

In [2]:
channel_id = 'UCa9gErQ9AE5jT2DZLjXBIdA' # my YouTube channel ID
page_token = None # no token for the first request (requests drops None-valued params)
url = 'https://www.googleapis.com/youtube/v3/search' # YouTube search API endpoint

# collect video ids across all search result pages
video_id_list = []

while True:
    params = {
        "key": my_key,
        'channelId': channel_id,
        'part': ["snippet","id"],
        'order': "date",
        'maxResults': 50,
        'pageToken': page_token
    }
    response = requests.get(url, params=params, timeout=30)
    # fail loudly on HTTP errors (quota, bad key, ...) instead of hitting an
    # opaque KeyError on 'items' further down
    response.raise_for_status()
    page = response.json()  # parse the body once per page

    for raw_item in page['items']:

        # skip search results whose kind is not a video
        if raw_item['id']['kind'] != "youtube#video":
            continue

        # grab video ids
        video_id_list.append(raw_item['id']['videoId'])

    # advance to the next page; stop when the response carries no token
    page_token = page.get('nextPageToken')
    if not page_token:
        break

In [3]:
# number of video ids collected by the paginated search above
len(video_id_list)

121

#### extract titles and thumbnail urls

In [4]:
url = "https://www.googleapis.com/youtube/v3/videos"
min_duration_seconds = 180  # keep only videos that are at least 3 minutes long

# one dict per kept video: video_id, title, thumbnail_url
video_data_list = []

for video_id in video_id_list:

    params = {
        "part": ["snippet","contentDetails"],
        "id": video_id,
        "key": my_key,
    }
    response = requests.get(url, params=params, timeout=30)
    # surface HTTP errors here rather than as a KeyError/IndexError below
    response.raise_for_status()

    items = response.json().get('items', [])
    if not items:
        # the API returned no entry for this id; skip it instead of
        # crashing on items[0]
        continue
    raw_dict = items[0]

    # drop videos shorter than the minimum duration
    iso_duration = raw_dict['contentDetails']["duration"]
    if parse_duration(iso_duration).total_seconds() < min_duration_seconds:
        continue

    # extract video data
    video_data = {
        'video_id': video_id,
        'title': raw_dict['snippet']['title'],
        'thumbnail_url': raw_dict['snippet']['thumbnails']['high']['url'],
    }

    # append data to list
    video_data_list.append(video_data)

In [5]:
# number of videos kept after the duration filter in the cell above
len(video_data_list)

76

### Transform

#### create dataframe

In [6]:
# one row per kept video; columns come from the keys of each record dict
df = pd.DataFrame(data=video_data_list)
# preview the first five rows
df.head(n=5)

Unnamed: 0,video_id,title,thumbnail_url
0,hOLBrIjRAj4,Fine-Tuning Text Embeddings For Domain-specifi...,https://i.ytimg.com/vi/hOLBrIjRAj4/hqdefault.jpg
1,V1BR2tb_e8g,My AI Development Setup (From Scratch),https://i.ytimg.com/vi/V1BR2tb_e8g/hqdefault.jpg
2,R5WXaxmb6m4,How to Build a Resume Optimizer with AI (Code ...,https://i.ytimg.com/vi/R5WXaxmb6m4/hqdefault.jpg
3,e3p9-hYxwSQ,How I’d Learn AI in 2025 (if I could start over),https://i.ytimg.com/vi/e3p9-hYxwSQ/hqdefault.jpg
4,Y7pNmocrmi8,Multimodal RAG: A Beginner-friendly Guide (wit...,https://i.ytimg.com/vi/Y7pNmocrmi8/hqdefault.jpg


#### create negative pairs

In [7]:
# text embedding model used to compare video titles with each other
text_model_name = "all-mpnet-base-v2"
model = SentenceTransformer(text_model_name)

In [8]:
%%time
# embed every video title; the result keeps the name `job_embeddings`
# because the similarity cell below reads it under that name
title_texts = df['title'].tolist()
job_embeddings = model.encode(title_texts)
print(job_embeddings.shape)

(76, 768)
CPU times: user 162 ms, sys: 45.4 ms, total: 208 ms
Wall time: 265 ms


In [9]:
# compute the similarity of every title embedding against every title
# embedding (including itself), using the model's own similarity function;
# the printed shape is (number of titles, number of titles)
similarities = model.similarity(job_embeddings, job_embeddings)
print(similarities.shape)

torch.Size([76, 76])


In [10]:
# for each title, pick the least similar *other* title as its negative match,
# preferring titles that have not already been used as a negative
similarities_argsorted = np.argsort(similarities.numpy(), axis=1)  # per row: indices ordered least -> most similar
negative_pair_index_list = []
used_indices = set()  # O(1) membership checks for already-assigned negatives

for i, ranked in enumerate(similarities_argsorted):

    # candidates for row i, least similar first, never the row itself
    # (a title must not be its own negative)
    candidates = [int(c) for c in ranked if int(c) != i]

    # take the least similar candidate that is still unused
    index = next((c for c in candidates if c not in used_indices), None)

    if index is None:
        # every other title is already taken: reuse the least similar one
        # rather than pairing the row with itself; with a single row there
        # is no other title, so the row itself is the only option
        index = candidates[0] if candidates else i

    used_indices.add(index)
    negative_pair_index_list.append(index)

In [11]:
# look up the negative title for each row by position, then attach it as a
# plain array so it is assigned row-by-row rather than aligned on the index
negative_titles = df['title'].iloc[negative_pair_index_list].values
df['title_neg'] = negative_titles

In [12]:
# preview the frame, now including the title_neg column added above
df.head()

Unnamed: 0,video_id,title,thumbnail_url,title_neg
0,hOLBrIjRAj4,Fine-Tuning Text Embeddings For Domain-specifi...,https://i.ytimg.com/vi/hOLBrIjRAj4/hqdefault.jpg,Why Conflict Is Good & How You Can Use It
1,V1BR2tb_e8g,My AI Development Setup (From Scratch),https://i.ytimg.com/vi/V1BR2tb_e8g/hqdefault.jpg,"Pareto, Power Laws, and Fat Tails"
2,R5WXaxmb6m4,How to Build a Resume Optimizer with AI (Code ...,https://i.ytimg.com/vi/R5WXaxmb6m4/hqdefault.jpg,Topological Data Analysis (TDA) | An introduction
3,e3p9-hYxwSQ,How I’d Learn AI in 2025 (if I could start over),https://i.ytimg.com/vi/e3p9-hYxwSQ/hqdefault.jpg,4 Ways to Measure Fat Tails with Python (+ Exa...
4,Y7pNmocrmi8,Multimodal RAG: A Beginner-friendly Guide (wit...,https://i.ytimg.com/vi/Y7pNmocrmi8/hqdefault.jpg,What Nature Can Teach Us About Business...


#### train-test split

In [13]:
# Shuffle the dataset (fixed seed so the order is reproducible).
# NOTE: df is overwritten here, so re-running only this cell shuffles the
# already-shuffled frame again and yields a different split; re-run the
# cells above first.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split fractions: 70% train, 15% validation, 15% test
train_frac = 0.7
valid_frac = 0.15
test_frac = 0.15
assert abs(train_frac + valid_frac + test_frac - 1.0) < 1e-9, "split fractions must sum to 1"

# define train and validation size (test takes whatever remains, which
# absorbs the rounding from int())
train_size = int(train_frac * len(df))
valid_size = int(valid_frac * len(df))

# create train, validation, and test datasets as consecutive positional slices
df_train = df.iloc[:train_size]
df_valid = df.iloc[train_size:train_size + valid_size]
df_test = df.iloc[train_size + valid_size:]

# every row lands in exactly one split
assert len(df_train) + len(df_valid) + len(df_test) == len(df)

### Load

In [14]:
# turn each pandas split into a Hugging Face Dataset
train_ds, valid_ds, test_ds = map(Dataset.from_pandas, (df_train, df_valid, df_test))

# bundle the three splits under their split names
dataset_dict = DatasetDict({'train': train_ds, 'valid': valid_ds, 'test': test_ds})

In [15]:
# display the splits with their column names and row counts
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['video_id', 'title', 'thumbnail_url', 'title_neg'],
        num_rows: 53
    })
    valid: Dataset({
        features: ['video_id', 'title', 'thumbnail_url', 'title_neg'],
        num_rows: 11
    })
    test: Dataset({
        features: ['video_id', 'title', 'thumbnail_url', 'title_neg'],
        num_rows: 12
    })
})

In [16]:
# upload all three splits to the Hugging Face Hub dataset repo
hub_repo_id = "shawhin/yt-title-thumbnail-pairs"
dataset_dict.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/shawhin/yt-title-thumbnail-pairs/commit/f97327c11b7ddca68b7cfa4c9a225ac8fd987866', commit_message='Upload dataset', commit_description='', oid='f97327c11b7ddca68b7cfa4c9a225ac8fd987866', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/shawhin/yt-title-thumbnail-pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='shawhin/yt-title-thumbnail-pairs'), pr_revision=None, pr_num=None)