In [1]:
import torch
from torch.utils.data import DataLoader
import sentencepiece as spm
import transforms
from dataset_extracted import ExtractedFeatureDataset
import os

In [2]:
# --- Input locations (all relative to the notebook's working directory) ---
# Feature root; its 'train' and 'test' sub-directories hold the two splits.
path = '../MVAD/I3D_rgb/'
train_path, test_path = (os.path.join(path, split) for split in ('train', 'test'))

# Caption corpora, one text file per split.
train_corpus_path = '../MVAD/corpus_M-VAD_train.txt'
test_corpus_path = '../MVAD/corpus_M-VAD_test.txt'

# Trained SentencePiece model loaded by the tokenizer cell.
tokenizer_file = 'tokenizer.model'

# Sequence-length limits: the first is handed to FeaturePadding and the second
# to CaptionPadding further down in this notebook.
inp_max_sequence_size, tar_max_sequence_size = 256, 360

In [3]:
# Create an empty SentencePiece processor; the model is attached in the next call.
sp = spm.SentencePieceProcessor()
# Load the trained tokenizer model. This call is the cell's last expression, so
# its return value is what the notebook displays below the cell.
sp.Load(tokenizer_file)

True

In [5]:
# Wrap an encoded sentence in the '<s>' / '</s>' ids by looking the two symbols
# up explicitly and placing them around the ids of the text itself.
[sp.PieceToId('<s>'), *sp.EncodeAsIds('I love you'), sp.PieceToId('</s>')]

[1, 680, 1934, 771, 2]

In [6]:
# Contrast with the explicit lookup above: here '<s>' and '</s>' are typed into
# the raw text. The recorded output shows them split into ordinary pieces
# ('<', 's', '>') rather than mapped to single symbol ids.
for encode in (sp.EncodeAsPieces, sp.EncodeAsIds):
    print(encode(' <s> I love you </s> '))

['▁', '<', 's', '>', '▁I', '▁love', '▁you', '▁', '</', 's', '>']
[22, 3, 5, 3, 680, 1934, 771, 22, 3, 5, 3]


In [7]:
# Caption pipeline: pad to tar_max_sequence_size using the id the tokenizer
# reports for the '<PAD>' piece.
# NOTE(review): assumes '<PAD>' exists in the tokenizer vocabulary - confirm,
# otherwise the lookup may resolve to some other id.
caption_transform = transforms.Compose(
    [transforms.CaptionPadding(tar_max_sequence_size, sp.PieceToId('<PAD>'))]
)

# Feature pipeline: pad to inp_max_sequence_size.
feature_transform = transforms.Compose(
    [transforms.FeaturePadding(inp_max_sequence_size)]
)

In [8]:
# Build the train and test datasets from the feature directories and caption
# corpora configured at the top of the notebook.
#
# The two sequence limits are taken from inp_max_sequence_size and
# tar_max_sequence_size rather than repeated as literals (256, 360), so the
# datasets cannot drift out of sync with the padding transforms, which use the
# same two settings.
# NOTE(review): presumably the 3rd/4th positional arguments are the input and
# target maximum lengths - confirm against ExtractedFeatureDataset.
train_dataset = ExtractedFeatureDataset(
    train_path,
    train_corpus_path,
    inp_max_sequence_size,
    tar_max_sequence_size,
    sp,
    feature_transform,
    caption_transform,
)
test_dataset = ExtractedFeatureDataset(
    test_path,
    test_corpus_path,
    inp_max_sequence_size,
    tar_max_sequence_size,
    sp,
    feature_transform,
    caption_transform,
)

In [9]:
# Batch both splits identically (4 items per batch, 2 worker processes).
# Only the training loader shuffles; the test loader keeps dataset order.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=4,
    num_workers=2,
)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=4,
    num_workers=2,
)

In [11]:
# Pull the first batch from the test loader for inspection. The test loader is
# created with shuffle=False, so this takes the leading items in dataset order.
sample = next(iter(test_dataloader))

In [42]:
# Display the first item of the batch's first element.
# NOTE(review): presumably sample[0] is the batch of padded features - confirm
# against ExtractedFeatureDataset.__getitem__. The all-zero trailing rows in the
# recorded output look like padding.
sample[0][0]

tensor([[0.0319, 0.0138, 0.0786,  ..., 0.0933, 0.4781, 0.0654],
        [0.0616, 0.0155, 0.0997,  ..., 0.1578, 0.5022, 0.0849],
        [0.0740, 0.0111, 0.1320,  ..., 0.2436, 0.4955, 0.1102],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])