In [7]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

In [19]:
# here,
# token = character
# seq = word (one sequence per word)

class CharDataset(Dataset):
    """Character-level dataset: each word is one sequence, each character one token.

    Characters are numbered from 1 in the order given by ``chars``; index 0 is
    kept as a special token that precedes the word in the inputs and follows
    it in the targets.
    """

    def __init__(self, words, chars, max_word_length):
        """Store the words and build the character <-> index lookup tables.

        Args:
            words: list of words, e.g. ['perols', 'palavas', ...].
            chars: list of vocabulary characters, e.g. ['a', 'b', 'c', ...].
            max_word_length: length of the longest word; it fixes the size of
                the tensors returned by ``__getitem__``.
        """
        self.words = words
        self.chars = chars
        self.max_word_length = max_word_length
        self.stoi = {ch:i+1 for i,ch in enumerate(chars)} # char -> int (0 is reserved)
        self.itos = {i:s for s,i in self.stoi.items()} # int -> char (inverse mapping)

    def __len__(self):
        """Return the number of sequences (i.e. words)."""
        return len(self.words)

    def contains(self, word):
        """Return True if ``word`` is one of the sequences of this dataset."""
        return word in self.words

    def get_vocab_size(self):
        """Return the number of token ids: every character plus the special 0 token."""
        return len(self.chars) + 1 # all the possible characters and special 0 token

    def get_output_length(self):
        """Return the length of the tensors produced by ``__getitem__``."""
        return self.max_word_length + 1 # <START> token followed by words

    def encode(self, word):
        """Convert a string into a 1-D long tensor of character indices.

        Raises:
            KeyError: if ``word`` contains a character that is not in ``chars``.
        """
        ix = torch.tensor([self.stoi[w] for w in word], dtype=torch.long)
        return ix

    def decode(self, ix):
        """Convert an iterable of character indices back into a string.

        ``ix`` may hold plain ints or be a 1-D tensor such as the one returned
        by ``encode``.

        Raises:
            KeyError: if an index has no associated character (0 included).
        """
        # Iterating over a tensor yields 0-dim tensors, which never match the
        # int keys of ``itos``; unwrap them to Python ints before the lookup.
        word = ''.join(
            self.itos[i.item() if isinstance(i, torch.Tensor) else i] for i in ix
        )
        return word

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair of long tensors for the word at ``idx``.

        Both tensors have length ``max_word_length + 1``. ``x`` is the encoded
        word shifted right by one position behind a leading 0; ``y`` is the
        encoded word followed by a single 0 and then -1 at every remaining
        position.
        """
        word = self.words[idx]
        ix = self.encode(word)
        x = torch.zeros(self.max_word_length + 1, dtype=torch.long)
        y = torch.zeros(self.max_word_length + 1, dtype=torch.long)
        x[1:1+len(ix)] = ix
        y[:len(ix)] = ix
        y[len(ix)+1:] = -1 # index -1 will mask the loss at the inactive locations
        return x, y

In [9]:
# Read the whole word list into memory as one string.
# The vocabulary contains accented characters, so decode explicitly instead of
# relying on the platform's default encoding (assumes villes.txt is UTF-8).
with open("villes.txt", 'r', encoding="utf-8") as f:
    data = f.read()

In [10]:
# One candidate word per line: trim surrounding whitespace, drop blank entries.
words = [w for w in (line.strip() for line in data.splitlines()) if w]
# Vocabulary = every distinct character seen in the words, in sorted order.
chars = sorted(set(''.join(words)))
# Longest word, used to size the padded tensors built by the dataset.
max_word_length = max(map(len, words))

# Summary of the corpus, one item per line.
print(
    f"number of examples in the dataset: {len(words)}",
    f"max word length: {max_word_length}",
    f"number of unique characters in the vocabulary: {len(chars)}",
    "vocabulary:",
    ''.join(chars),
    sep="\n",
)

number of examples in the dataset: 36583
max word length: 45
number of unique characters in the vocabulary: 43
vocabulary:
 '-abcdefghijklmnopqrstuvwxyzàâçèéêëîïôûüÿœ


In [11]:
# partition the input data into a training and the test set
test_set_size = min(1000, int(len(words) * 0.1)) # 10% of the training set, or up to 1000 examples
rp = torch.randperm(len(words)).tolist()
train_words = [words[i] for i in rp[:-test_set_size]]
test_words = [words[i] for i in rp[-test_set_size:]]
print(f"split up the dataset into {len(train_words)} training examples and {len(test_words)} test examples")

split up the dataset into 35583 training examples and 1000 test examples


In [12]:
# Both splits share the same vocabulary and maximum length, so token ids and
# tensor sizes are identical between the training and the test dataset.
train_dataset = CharDataset(
    words=train_words,
    chars=chars,
    max_word_length=max_word_length,
)
test_dataset = CharDataset(
    words=test_words,
    chars=chars,
    max_word_length=max_word_length,
)

In [22]:
# Display the training dataset object; the default object repr is shown.
train_dataset

<__main__.CharDataset at 0x7f4a72357ad0>

In [23]:
class InfiniteDataLoader:
    """
    Endless source of batches built on top of a regular DataLoader.

    The dataset is sampled at random with replacement through a RandomSampler
    sized at 1e10 draws, which is a workaround for the absence of a built-in
    "infinite" loader. Extra keyword arguments go straight to DataLoader.
    """

    def __init__(self, dataset, **kwargs):
        endless_sampler = torch.utils.data.RandomSampler(
            dataset,
            replacement=True,
            num_samples=int(1e10),
        )
        self.train_loader = DataLoader(dataset, sampler=endless_sampler, **kwargs)
        self.data_iter = iter(self.train_loader)

    def next(self):
        """Return the next batch, restarting the iterator if it ever runs out."""
        try:
            return next(self.data_iter)
        except StopIteration:
            # Only reachable once the 1e10 samples are used up (basically never):
            # start a fresh pass over the loader and serve its first batch.
            self.data_iter = iter(self.train_loader)
            return next(self.data_iter)

In [24]:
# Wrap the training set in the endless loader; no extra DataLoader options are passed.
mon_loader = InfiniteDataLoader(train_dataset)

In [34]:
# Draw one randomly sampled batch of (input, target) tensors.
x, y = mon_loader.next()

In [35]:
# Inspect the batch: x is the encoded word behind a leading 0 (then zero padding),
# y is the encoded word followed by a single 0 and then -1 at the masked positions.
x, y

(tensor([[ 0, 15,  4,  1, 17,  8, 24, 25, 12, 15, 15,  8,  3, 22, 12, 21,  8,  3,
           5,  8, 21, 17,  4, 21,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0]]),
 tensor([[15,  4,  1, 17,  8, 24, 25, 12, 15, 15,  8,  3, 22, 12, 21,  8,  3,  5,
           8, 21, 17,  4, 21,  7,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]]))