In [7]:
from utils import load_data
import datasets
import torch

In [2]:
# get device proper device to run on 
if torch.cuda.is_available():
    device = "cuda"
else: 
    # calculating the embeddings on a cpu can take hours, not recommended
    device = "cpu"

In [25]:
all_records_df = load_data("../data/")

# find and delete null values
print("counting null values")
print(all_records_df.isnull().sum())

all_records_df.dropna(axis=0, inplace=True)

# also we want a df of individual videos that are identified by title and description
videos_df = all_records_df.drop_duplicates(["title", "description"], keep="last")
# note: if you want the first record in videos_df, enter vidoes_df.iloc[0] not videos_df[0]
# as the later will try to return the record that has index or id = 0  wich is the index in the all_records_df

counting null values
title                             1
views                             0
time-stamp-upload-milliseconds    0
time-stamp                        0
date-time-hr                      0
upload-time-hr                    0
upload-time-stamp                 0
description                       0
video-length                      0
video-length-milliseconds         0
channel                           0
dtype: int64


In [29]:
def combine_record_to_string(record):
    """
    record should be Dict-like
    record should contain keys "title" and "description"
    this will return a single string containing title and description information
    """
    title = record["title"].strip()
    description = record["description"].strip()

    out = "Title: '{}' Description: '{}'".format(title, description)
    return out

In [30]:
# turn the dataframe into a datasets.Dataset with cols "text", "id"
combined = videos_df[["title", "description"]].apply(combine_record_to_string, axis=1)
data = datasets.Dataset.from_pandas(combined.to_frame("text"))
data = data.rename_column("__index_level_0__", "id")
data

Dataset({
    features: ['text', 'id'],
    num_rows: 7908
})

In [31]:
from transformers import AutoTokenizer, AutoModel

model_name = "t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [32]:
def tokenize_text(text):
    encoded_input = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
    )
    return encoded_input

def tokenize_record(record):
    return tokenize_text(record["text"])

In [33]:
data_tokenized = data.map(tokenize_record)
data_tokenized.set_format("torch")

Map:   0%|          | 0/7908 [00:00<?, ? examples/s]

In [34]:
def get_embeddings(input):
    with torch.no_grad():
        model_output = model.encoder(
            input_ids=input["input_ids"].to(device),
            attention_mask=input["attention_mask"].to(device),
        )
        last_hidden_state = model_output.last_hidden_state

    return dict(embeddings=last_hidden_state)

In [35]:
data_tokenized = data_tokenized.map(get_embeddings)

Map:   0%|          | 0/7908 [00:00<?, ? examples/s]

In [38]:
# save to disk
data_tokenized.select_columns(["id", "embeddings"]).save_to_disk("../data/embeddings")
# can be read with
# datasets.Dataset.load_from_disk("../data/embeddings/")

Saving the dataset (0/25 shards):   0%|          | 0/7908 [00:00<?, ? examples/s]