In [69]:
from utils import load_data, combine_record_to_string
import datasets
import torch

In [127]:
# get device proper device to run on 
if torch.cuda.is_available():
    device = "cuda"
else: 
    # calculating the embeddings on a cpu can take hours, not recommended
    device = "cpu"

In [2]:
all_records_df = load_data("../data/")

# also we want a df of individual videos that are identified by title and description
videos_df = all_records_df.drop_duplicates(["title", "description"], keep="last")
# note: if you want the first record in videos_df, enter vidoes_df.iloc[0] not videos_df[0]
# as the later will try to return the record that has index or id = 0  wich is the index in the all_records_df

In [50]:
# turn the dataframe into a datasets.Dataset with cols "text", "id"
combined = videos_df[["title", "description"]].apply(combine_record_to_string, axis=1)
data = datasets.Dataset.from_pandas(combined.to_frame("text"))
data = data.rename_column("__index_level_0__", "id")
data

Dataset({
    features: ['text', 'id'],
    num_rows: 7909
})

In [130]:
from transformers import AutoTokenizer, AutoModel

model_name = "t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [62]:
def tokenize_text(text):
    encoded_input = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
    )
    return encoded_input

def tokenize_record(record):
    return tokenize_text(record["text"])

In [66]:
data_tokenized = data.map(tokenize_record)
data_tokenized.set_format("torch")

Map:   0%|          | 0/7909 [00:00<?, ? examples/s]

In [132]:
def get_embeddings(input):
    with torch.no_grad():
        model_output = model.encoder(
            input_ids=input["input_ids"].to(device),
            attention_mask=input["attention_mask"].to(device),
        )
        last_hidden_state = model_output.last_hidden_state

    return dict(embeddings=last_hidden_state)

In [137]:
data_tokenized = data_tokenized.map(get_embeddings)

Map:   0%|          | 0/7909 [00:00<?, ? examples/s]

ArrowMemoryError: realloc of size 805306368 failed