In [52]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import sentencepiece as spm

class LanguageDataset(Dataset):
    """Map-style dataset that exposes a tokenized file one token at a time.

    The whole file is tokenized once, at construction time, by calling the
    supplied ``tokenizer`` on ``file``; the result is kept in ``self.tokens``.

    Args:
        tokenizer: Callable taking ``file`` and returning an indexable,
            sized collection of tokens.
        file: Value handed straight to ``tokenizer`` (a path in this notebook).
        max_length: Stored on the instance as ``self.max_length``; it is not
            used by any method of this class yet.
    """

    def __init__(self, tokenizer, file, max_length):
        self.max_length = max_length
        # Tokenize eagerly so indexing and len() are plain lookups afterwards.
        self.tokens = tokenizer(file)

    # Original note listed the intended outputs:
    # input_ids, attention_mask, encoder_mask, decoder_mask.
    # For now an item is simply the token stored at position ``idx``.
    def __getitem__(self, idx):
        """Return the token stored at position ``idx``."""
        item = self.tokens[idx]
        return item

    def __len__(self):
        """Return how many tokens the tokenizer produced."""
        token_count = len(self.tokens)
        return token_count

In [53]:
# file -> token
def tokenizer_example(file, model_prefix='nepali_spm', vocab_size=504, character_coverage=1.0, model_type='bpe'):
    """Train a SentencePiece model on ``file`` and tokenize that same file.

    A new model is trained on every call, then loaded back from
    ``{model_prefix}.model`` and used to split the full text of ``file``
    into subword pieces.

    Args:
        file: Path to the plain-text training corpus (expected to be UTF-8).
        model_prefix: Prefix for the model file written by the trainer;
            ``{model_prefix}.model`` is loaded afterwards.
        vocab_size: Target vocabulary size passed to the trainer.
        character_coverage: Character coverage passed to the trainer.
        model_type: SentencePiece model type passed to the trainer.

    Returns:
        The list of pieces returned by ``encode_as_pieces`` for the whole file.
    """
    # Keyword arguments instead of a single flag string: the flag string is
    # split on whitespace, so a path containing spaces would be mis-parsed,
    # and the old line continuation embedded indentation inside the string.
    spm.SentencePieceTrainer.train(
        input=file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=character_coverage,
        model_type=model_type,
    )
    sp = spm.SentencePieceProcessor()
    sp.load(f'{model_prefix}.model')
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (the corpus is non-ASCII text).
    with open(file, encoding='utf-8') as f:
        text = f.read()
    return sp.encode_as_pieces(text)

In [48]:
# Smoke test: train the tokenizer on the small sample file and display the
# pieces it produces for that same file.
# NOTE(review): absolute, user-specific path — this cell only runs on a
# machine where this file exists.
tokenizer_example('/Users/ryanmarr/Downloads/train_ne_small.txt')

['▁प्याच',
 '▁गरिने',
 '▁फाइल',
 '/',
 'डाइरेक्टरी',
 '▁image',
 '-',
 'action',
 '▁प्रशारण',
 '▁ठेगाना',
 ':',
 'Subnet',
 '▁Mask',
 '▁पाँच',
 '▁वा',
 '▁बढी',
 '▁प्रकार',
 '▁प्राप्त',
 '▁गर्नका',
 '▁लागि',
 '▁कुञ्जी',
 '▁वा',
 '▁कुञ्जीहरू',
 '▁निर्दिष्ट',
 '▁गर्नु',
 '▁पर्दछ',
 '▁यसमा',
 '▁सञ्चालन',
 '▁गर्न',
 '▁पीडीए',
 '▁निर्दिष्ट',
 '▁गर्नुहोस्',
 '▁(',
 'MyPDA',
 '▁मा',
 '▁पूर्वनिर्धारित',
 '▁हुन्छ',
 ')',
 '▁जगेडा',
 '▁पहिचायक',
 '▁Description',
 '▁Query',
 '▁फोल्डर',
 '▁सिर्जना',
 '▁गर्न',
 '▁सकेन',
 '▁।',
 '▁लुयन्डा',
 'africa',
 '.',
 '▁kgm',
 '▁अघिल्लो',
 '▁चेक',
 '▁बाकसमा',
 '▁जान्छ',
 '▁।']

In [54]:
# Build the dataset; constructing it calls tokenizer_example on the file,
# which trains the SentencePiece model and tokenizes the file.
# max_length is only stored on the dataset — LanguageDataset does not apply it yet.
langdset = LanguageDataset(tokenizer=tokenizer_example,
                           file='/Users/ryanmarr/Downloads/train_ne_small.txt', 
                           max_length=10
                           )

In [55]:
# Spot-check one item: indexing returns the single token at that position.
langdset[5]

'▁image'

In [15]:
# Wrap the dataset in a shuffling DataLoader with 10 items per batch.
# Original note: "attn mask input target for batch size" — presumably a TODO
# for batches to carry attention mask, input and target; confirm intent.
# NOTE(review): dataset items are raw token strings, so each batch is
# presumably a list of strings rather than a tensor — verify before training.
batch_size = 10
shuffle = True
data_loader = DataLoader(dataset=langdset, batch_size=batch_size, shuffle=shuffle)