Simple NLP model for the TinyStories dataset

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

# Loading dataset

In [5]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Fetch both TinyStories splits with a single expression; the generator
# yields the "train" split first and the "validation" split second.
train_dataset, valid_dataset = (
    load_dataset("roneneldan/TinyStories", split=split_name)
    for split_name in ("train", "validation")
)

In [10]:
# Sanity check: print the raw 'text' field of the first training example.
print(train_dataset[0]['text'])

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.


In [16]:
# Start with a character-level vocabulary:
# a word-level vocabulary could become really huge.

# Gather every story and glue them into one long string, newline-separated.
train_total_texts = [story['text'] for story in train_dataset]
train_full_text = '\n'.join(train_total_texts)

# The vocabulary is the sorted set of distinct characters in the training text.
vocab = sorted(set(train_full_text))
print(f"total character in text are {vocab}")

# Lookup tables: character -> id and id -> character.
train_stoi = dict(zip(vocab, range(len(vocab))))
train_itos = dict(enumerate(vocab))

total character in text are ['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'Â', 'Ã', 'Ä', 'Å', 'É', 'Ê', 'Ñ', 'Ò', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'î', 'ï', 'ð', 'Œ', 'œ', 'Š', 'š', 'Ÿ', 'Ž', 'ž', 'ƒ', 'ˆ', '˜', '–', '‘', '’', '‚', '“', '”', '„', '†', '‡', '•', '…', '‰', '‹', '›', '€', '™']


In [17]:
# Gather every validation story and glue them into one long string.
valid_total_texts = [example['text'] for example in valid_dataset]
valid_full_text = '\n'.join(valid_total_texts)

# Keep the validation character set under its own name so the training
# `vocab` (which defines the model's vocabulary size) is not overwritten.
valid_vocab = sorted(set(valid_full_text))
print(f"total character in text are {valid_vocab}")

# Validation text must be encoded with the SAME character -> id mapping as the
# training text; a separately enumerated mapping would give the same character
# a different id and make validation metrics meaningless.
unseen_chars = [ch for ch in valid_vocab if ch not in train_stoi]
assert not unseen_chars, (
    f"validation text contains characters missing from the training vocabulary: {unseen_chars}"
)
valid_stoi = train_stoi
valid_itos = train_itos

total character in text are ['\n', ' ', '!', '"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¦', '©', '\xad', '±', '´', 'Â', 'Ã', 'â', 'ð', 'œ', 'Š', 'Ÿ', 'Ž', '˜', '“', '”', '‹', '€', '™']


In [15]:
class CharDataset(Dataset):
    """Character-level next-token dataset over one long string.

    Item ``i`` is a pair ``(x, y)``: ``x`` holds ``block_size`` consecutive
    token ids starting at position ``i`` and ``y`` is the same window shifted
    one position to the right (the prediction targets for ``x``).
    """

    def __init__(self, text, stoi, block_size):
        """
        text: Total text given in dataset
        stoi: Mapping dictionary (character -> integer id); a character of
            ``text`` that is not a key raises ``KeyError``
        block_size: input sequence length
        """
        self.block_size = block_size
        # Encode once into a single long tensor so __getitem__ only slices
        # instead of rebuilding tensors from Python list slices on every access.
        self.tokens = torch.tensor([stoi[c] for c in text], dtype=torch.long)

    def __len__(self):
        # Every sample needs block_size + 1 tokens (inputs plus shifted targets).
        # Clamp at 0 because len() raises ValueError on a negative result,
        # which would otherwise happen for texts shorter than block_size.
        return max(0, len(self.tokens) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` long tensors for sample ``idx``.

        Negative indices count from the end. Raises ``IndexError`` when
        ``idx`` is out of range, which also lets plain iteration over the
        dataset terminate instead of yielding truncated/empty tensors forever.
        """
        n_samples = len(self)
        position = idx + n_samples if idx < 0 else idx
        if not 0 <= position < n_samples:
            raise IndexError(
                f"index {idx} is out of range for CharDataset of length {n_samples}"
            )
        # clone() so callers get independent tensors rather than views that
        # share storage with the dataset's token buffer.
        x = self.tokens[position: position + self.block_size].clone()
        y = self.tokens[position + 1: position + self.block_size + 1].clone()
        return x, y

In [None]:
block_size = 128  # characters per input window
batch_size = 32   # windows per batch

# NOTE: this rebinds `train_dataset` / `valid_dataset` from the Hugging Face
# datasets to CharDataset objects, so cells that read example['text'] from
# them must be run before this one (or after reloading the splits).
train_dataset = CharDataset(train_full_text, train_stoi, block_size)
# Encode the validation text with the TRAINING mapping so a character has the
# same id in both splits; this raises KeyError if the validation text contains
# a character that never occurs in the training text.
valid_dataset = CharDataset(valid_full_text, train_stoi, block_size)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Shuffling is kept for validation: consecutive samples are windows that
# overlap by all but one character, so an unshuffled partial pass would only
# cover a short prefix of the text.
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
