In [1]:
import datasets
import torch

In [2]:
# get device proper device to run on 
if torch.cuda.is_available():
    device = "cuda"
else: 
    # calculating the embeddings on a cpu can take hours, not recommended
    device = "cpu"
print(device)

cuda


In [3]:
video_data = datasets.Dataset.load_from_disk("../data/videos/")
all_records_data = datasets.Dataset.load_from_disk("../data/all_records/")
print(video_data)
print(all_records_data)

Dataset({
    features: ['id', 'title', 'description', 'text', 'channel'],
    num_rows: 7908
})
Dataset({
    features: ['title', 'views', 'time-stamp-upload-milliseconds', 'time-stamp', 'date-time-hr', 'upload-time-hr', 'upload-time-stamp', 'description', 'video-length', 'video-length-milliseconds', 'channel', 'id'],
    num_rows: 1369223
})


In [4]:
from transformers import AutoTokenizer, AutoModel

model_name = "t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
def tokenize_text(text):
    encoded_input = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
    )
    return encoded_input

def tokenize_record(record):
    return tokenize_text(record["text"])

In [8]:
data_tokenized = video_data.map(tokenize_record)
data_tokenized.set_format("torch")

Map:   0%|          | 0/7908 [00:00<?, ? examples/s]

In [21]:
def get_embeddings(input):
    with torch.no_grad():
        model_output = model.encoder(
            input_ids=input["input_ids"].to(device),
            attention_mask=input["attention_mask"].to(device),
        )
        last_hidden_state = model_output.last_hidden_state

    return dict(embeddings=last_hidden_state)

In [27]:
data_tokenized = data_tokenized.map(get_embeddings)

In [28]:
# save to disk
data_tokenized.select_columns(["id", "embeddings"]).save_to_disk("../data/embeddings")
# can be read with
# datasets.Dataset.load_from_disk("../data/embeddings/")