In [53]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
from exp.nb_XLibrary import *

In [3]:
from fastai import datasets as FAdatasets
import random

# Road map for a simple supervised learning problem
### Preprocessing step
1. Setup paths and download raw data
2. Read the files and put data in a container (TextList object)
    - TextList --> hold **all x data**
3. Split data into train/valid (possibly test) set (SplitData object)
    - SplitData --> contain **x data** split into train/valid sets
4. Text preprocessing: cleaning up and tokenization (TokenizeProcessor)
5. Numericalization (NumericalizeProcessor)
6. (Supervised only) Labeling data --> (x,y) pairs (LabeledData object)
    - Train/valid set each containing all **(x, y)** data, respectively
    - Label should be done *after* splitting
    - yields **SplitData holding LabeledList objects** for each train/valid sets
    - For *LM tasks*, just use a dummy label for y now; (x,y) pair will be made in the batching step
7. Make minibatches
   (1) Prepare batches --> bs and bptt (LMPreLoader object)
   (2) Batching for classification
8. Dataloader and databunch

# 1. Path and download data

In [4]:
home_dir = Path('.').resolve()
data_dir = home_dir/'data'
data_dir

PosixPath('/Users/xianli/Desktop/fast/Part2/data')

In [5]:
file_path = FAdatasets.untar_data(FAdatasets.URLs.IMDB, dest=data_dir)
file_path

PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb')

In [6]:
file_path.ls()

[PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test'),
 PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/tmp_clas'),
 PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/imdb.vocab'),
 PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/unsup'),
 PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/README'),
 PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/tmp_lm'),
 PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/train')]

# 2. Read data into a TextList container

In [7]:
#export
def read_file(fn):
    with open(fn,'r', encoding='utf8') as f:
        return f.read() # this is reading all the contents
    
class TextList(ItemList):
    @classmethod
    def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
        return cls(get_files(path, extensions,recurse=recurse,include=include), path, **kwargs)
    # get_files is a standalone function that return all file paths in all folders
    # the second 'path' is for the parent ItemList initialization
    # Note the entire get_files(path, extensions,recurse=recurse,include=include)
    # is items input in the ItemList class initialization
    
    def get(self,i):
        # overload parent get method --> show how to access individual data
        if isinstance(i, Path):
            return read_file(i)
        return i

In [8]:
item_list = TextList.from_files(file_path, include=['train','test','unsup']) # concat together

In [9]:
len(item_list.items)

100000

In [10]:
# __getitem__ work flow:
# 1. root class list_container returns the item (a file path in here) corresponding to the index
# 2. the item(s) then go through ItemList private _get method and take any transformation provided
# 3. public get method is called and use the TextList get method to read the item (i.e. file path)
txt = item_list[0:3]
txt

["Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook.",
 'I have seen this movie and I did not care for this movie anyhow. I would not think about going to Paris because I do not like this country and its national capital. I do not like to learn french anyhow because I do not understand their language. Why would I go to France when I rather go to Germany or the United Kingdom? Germany and the United 

# 3. Split data for language modeling (no label, so can use all data)
- use all texts and leave 10% behind

In [11]:
#export
# must return booleans
def random_splitter(fn, p_valid): return random.random() < p_valid

In [12]:
# this split_by_func works by assigning True/False to each item according to the provided function
sd = SplitData.split_by_func(item_list, partial(random_splitter, p_valid=0.1))
sd

SplitData
Train: TextList (89945 items)
[PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/1821_4.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/9487_1.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/4604_4.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/2828_2.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/10890_1.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/3351_4.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/8070_2.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/1027_4.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/8248_3.txt'), PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/4290_4.txt')...]
Path: /Users/xianli/Desktop/fast/Part2/data/imdb
Valid: TextList (10055 items)
[PosixPath('/Users/xianli/Desktop/fast/Part2/data/imdb/test/neg/10096_1.txt'), PosixPath('/Users/xianli/De

# Tokenizing

In [14]:
#export
# html here is just to clean up HTML stuff
import spacy, html

### Pre-tokenizing rules

In [15]:
#export

# special tokens to replace original texts
from typing import Collection

UNK = 'xxunk' # unknown
PAD = 'xxpad'
BOS = 'xxbos' # beginning of sentence
EOS = 'xxeos' # end of sentence
TK_REP = 'xxrep' # replace characters that repeated at least 4 times (e.g. aaaa) with the token
# e.g. cccc --> xxrep 4 c
TK_WREP = 'xxwrep' # replace repeated words with token
# e.g. ha ha ha ha --> xxwrep 4 ha
TK_UP = 'xxup' # ALL CAPS --> xxup all xxup caps
TK_MAJ = 'xxmaj' # Capitialized Words --> xxmaj captialized xxmaj words

def sub_br(t):
    "Replace <br /> by /n"
    re_br = re.compile(r'<\s*br\s*/?>') # ? matches 0 or 1 time
    return re_br.sub("\n", t)
def spec_add_spaces(t):
    "Add spaces around / and #"
    return re.sub(r'([/#])',r' \1 ', t) # \1 is group 1
def rm_useless_spaces(t):
    'Remove multiple spaces'
    return re.sub(' {2,}',' ',t) # {2,} means at least 2 repetition, no upper limit

def replace_rep(t):
    "Replace repetitions at the character level: cccc -> TK_REP 4 c"
    def _replace_rep(m:Collection[str]) -> str:
        c,cc = m.groups() # e.g. if aaaa is in text, c=a, cc=aaa (one less than it should be)
        return f' {TK_REP} {len(cc)+1} {c} ' # re counts 1 less
    re_rep = re.compile(r'(\S)(\1{3,})') # \S is any non-whitespace character
    return re_rep.sub(_replace_rep, t) # regex can pass function
    
def replace_wrep(t):
    "Replace word repetitions: word word word -> TK_WREP 3 word"
    def _replace_wrep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Various messy things we've seen in documents --> particularly in html"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

# these rules are functions to be passed to compose
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]


In [16]:
re_wrep = re.compile(r'(\S)(\1{3,})')
a = 'b aaaa ddd ccccc'
out = re_wrep.findall(a)
out

[('a', 'aaa'), ('c', 'cccc')]

### Post-tokenizing rules

In [17]:
#export
def replace_all_caps(x):
    "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
        else: res.append(t)
    return res

def deal_caps(x):
    "Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == '': continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
        res.append(t.lower())
    return res

def add_eos_bos(x): return [BOS] + x + [EOS]

default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]

### Parallel processing

In [18]:
#export
from concurrent.futures import ProcessPoolExecutor
from fastprogress import progress_bar

def parallel(func, arr, max_workers=4):
    'Version that using fast.ai progress_bar class'
    if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results

def parallel_nobar(func, arr, max_workers=4):
    'Version that does NOT use fastai progress_bar class'
    if max_workers < 2:
        results = list(map(func, enumerate(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(ex.map(func, enumerate(arr)))
    if any([o is not None for o in results]):
        return results
    

In [19]:
#export
from spacy.symbols import ORTH # Orth: The hash value of the lexeme (i.e. word)

class TokenizeProcessor(Processor):
    '''apply pre_rules, special_tokens, tokenizing and post_rules to
    a list of texts'''
    def __init__(self, lang='en', chunksize=2000, pre_rules=None,
                post_rules=None, max_workers=4):
        self.chunksize = chunksize
        self.max_workers = max_workers
        self.tokenizer = spacy.blank(lang).tokenizer # here used Spacy tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}]) # what does ORTH do?
        self.pre_rules = default_pre_rules if pre_rules is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules
    
    def __call__(self, items): # items must be a list or tuple of texts
        toks = []
        if isinstance(items[0], Path): 
            items = [read_file(i) for i in items] # list of articles
        # chunks are divided by number of articles, not content length
        chunks = [items[i:i+self.chunksize] for i in range(0,len(items), self.chunksize)]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
#         toks = parallel_nobar(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, []) # combine token lists, sum(arr,[]) is a good trick to concat lists of lists
        
    def proc_chunk(self, args): # args is a list
        i, chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk] # apply pre_rules
        # tokenizing happens here
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)] # docs=list of token lists (each doc is a token list)
        docs = [compose(t, self.post_rules) for t in docs] # apply post_rules
        return docs
    
    def proc1(self,item):
        'Process 1 item'
        return self.proc_chunk([item])[0] # return list content b/c only 1 doc
    def deprocess(self, toks):
        'convert tokens back to a string'
        return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):
        'convert a token to a string'
        return ' '.join(tok)
        

In [20]:
a = [[1, 2], [3, 4], [5, 6]]
sum(a, [])

[1, 2, 3, 4, 5, 6]

In [21]:
test_text = item_list[0:2]
test_text, len(test_text)

(["Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook.",
  'I have seen this movie and I did not care for this movie anyhow. I would not think about going to Paris because I do not like this country and its national capital. I do not like to learn french anyhow because I do not understand their language. Why would I go to France when I rather go to Germany or the United Kingdom? Germany and the Unite

In [24]:
tp = TokenizeProcessor()
'|'.join(tp(test_text)[1])

'xxbos|i|have|seen|this|movie|and|i|did|not|care|for|this|movie|anyhow|.|i|would|not|think|about|going|to|xxmaj|paris|because|i|do|not|like|this|country|and|its|national|capital|.|i|do|not|like|to|learn|french|anyhow|because|i|do|not|understand|their|language|.|xxmaj|why|would|i|go|to|xxmaj|france|when|i|rather|go|to|xxmaj|germany|or|the|xxmaj|united|xxmaj|kingdom|?|xxmaj|germany|and|the|xxmaj|united|xxmaj|kingdom|are|the|nations|i|tolerate|.|xxmaj|apparently|the|xxmaj|olsen|xxmaj|twins|do|not|understand|the|xxmaj|french|language|just|like|me|.|xxmaj|therefore|i|will|not|bother|the|xxmaj|france|trip|no|matter|what|.|i|might|as|well|stick|to|the|xxmaj|united|xxmaj|kingdom|and|meet|single|women|and|play|video|games|if|there|is|a|video|arcade|.|xxmaj|that|is|all|.|xxeos'

In [25]:
len(tp(test_text)[0])

140

In [None]:
# test nobar version parallel (change just one line to parallel_nobar)
tp = TokenizeProcessor()
'|'.join(tp(test_text)[1]) # works!

# Numericalization

In [22]:
#export
from collections import Counter, defaultdict

class NumericalizeProcessor(Processor):
    '''Turn tokens into numbers, set max_vocab and min_freq'''
    def __init__(self, vocab = None, max_vocab = 60000, min_freq=2):
        self.vocab = vocab
        self.max_vocab = max_vocab
        self.min_freq = min_freq
    
    def __call__(self, items): # items are token lists
        # Vocab defined at first use
        if self.vocab is None:
            freq = Counter(p for o in items for p in o) # loop through all docs and all tokens in each doc
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c>= self.min_freq] # build vocab from high to low frequency words
            for o in reversed(default_spec_tok):
                if o in self.vocab: # remove special tokens from the vocab
                    self.vocab.remove(o)
                self.vocab.insert(0,o) # insert special tokens back to the beginning following the original order
        
        if getattr(self,'otoi', None) is None:
            # build reverse dict, 
            self.otoi = defaultdict(int,{v:k for k,v in enumerate(self.vocab)})
        # build vocab done above
        return [self.proc1(o) for o in items] # process one doc at a time
    
    def proc1(self, item):
        # label by index (higher to lower frequency except special tokens)
        return [self.otoi[o] for o in item] # process one document/sentence
    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(i) for i in idxs]
    def deproc1(self,idx):
        return [self.vocab[i] for i in idx]

# For LM task only
- When we do language modeling, we will infer the labels from the text during training, so there's no need to label. The training loop expects labels however, so we need to add dummy ones.

In [23]:
# prepare processors
proc_tok= TokenizeProcessor(max_workers=8)
proc_num = NumericalizeProcessor()

In [24]:
# lambda x: 0 is just a dummy label because we don't need it
'''Note in the process, we keep the good practice that we label
using the train data and use the same labels for validation set

We have ~90,000 docs/texts for training set ==> 45-46 chunks
~ 10,000 docs for validation set ==> 5-6 chunks
'''
ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])

In [29]:
ll # LabeledList item

SplitData
Train: LabeledData
x: TextList (90084 items)
[[2, 7, 1837, 7, 9790, 205, 7, 3680, 7, 4334, 220, 67, 374, 27, 2455, 124, 7, 183, 7, 5533, 2613, 17, 19, 1216, 508, 9, 7, 16, 22, 295, 334, 28, 80, 36164, 11, 7, 9790, 22, 150, 27, 7, 4241, 7, 17590, 9, 7, 151, 297, 171, 0, 14, 2825, 199, 74, 520, 64, 8, 29, 10, 16, 209, 9, 7, 8, 130, 28, 8, 29, 296, 1045, 10, 30, 43, 240, 8, 1468, 17, 168, 8, 31, 20, 16, 25, 37, 2001, 141, 26, 88, 26, 16, 95, 41, 99, 9, 7, 8, 211, 20, 16, 15, 16906, 47, 12, 271, 2728, 3383, 11, 680, 2728, 12154, 83, 44736, 200, 9, 7, 8, 29, 15, 295, 12, 0, 63, 28, 178, 68, 93, 439, 374, 47, 7, 9790, 10, 7, 4334, 10, 11, 7, 17590, 9, 3], [2, 18, 41, 131, 19, 29, 11, 18, 87, 37, 476, 28, 19, 29, 6730, 9, 18, 73, 37, 122, 59, 182, 14, 7, 1492, 107, 18, 58, 37, 53, 19, 664, 11, 112, 2028, 5847, 9, 18, 58, 37, 53, 14, 868, 704, 6730, 107, 18, 58, 37, 409, 80, 1076, 9, 7, 153, 73, 18, 159, 14, 7, 2113, 69, 18, 271, 159, 14, 7, 2530, 55, 8, 7, 2237, 7, 4695, 66, 7, 2530

In [25]:
ll.train.x_obj(0) # access the underlying texts

"xxbos xxmaj alan xxmaj rickman & xxmaj emma xxmaj thompson give good performances with southern / xxmaj new xxmaj orleans accents in this detective flick . xxmaj it 's worth seeing for their scenes- and xxmaj rickman 's scene with xxmaj hal xxmaj holbrook . xxmaj these three actors xxunk to entertain us no matter what the movie , it seems . xxmaj the plot for the movie shows potential , but one gets the impression in watching the film that it was not pulled off as well as it could have been . xxmaj the fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things . xxmaj the movie is worth a xxunk if for nothing more than entertaining performances by xxmaj rickman , xxmaj thompson , and xxmaj holbrook . xxeos"

# Save the intermediate data

In [35]:
import pickle
pickle.dump(ll, open(data_dir/'ld.pkl', 'wb'))

In [None]:
ll = pickle.load(open(data_dir/'ld.pkl', 'rb'))

# Making Batches for LM tasks (predict next word) and wrapping everything into databunch
### Making (x,y) pairs where y is the next word of x 
1. x.shape = y.shape = (bs, bptt) 
2. `bs` vs. `seq_len` vs `bptt` 
    - `bs` is the number of docs/sentences we are working on at the same time
    - `seq_len` defines length of a sentence == how many consecutive tokens are we considered as a single sentence == how many words are in between this batch and the next batch
    - `bptt` is how many tokens our RNN will backprop through before it's forgotten == length of RNN loop
3. For the same row in a batch, texts are streamed from batch to batch (i.e. the texts are continuing on the same row from this to the next batch)
4. Option to shuffle texts

In [26]:
#export
class LM_PreLoader():
    'prepare next word prediction (x, y) batches'
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data = data
        self.bs = bs # number of sentences to work on at the same time
        self.bptt = bptt # backprop through time = # of tokens our RNN will backprop through before it's forgotten 
        self.shuffle = shuffle
        # --> batch shape = (bs, bptt)
        total_len = sum([len(t) for t in data.x]) # total number of tokens in all data
        self.n_batch = total_len//bs # number of tokens in a "sentence (batch)"
        self.batchify() #
    
    def batchify(self):
        texts = self.data.x # texts are already token indices
        if self.shuffle:
            texts = texts[torch.randperm(len(texts))] # shuffle
        stream = torch.cat([torch.tensor(t) for t in texts]) # turn token indices into tensors
        self.batched_data = stream[:self.n_batch*self.bs].view(self.bs, self.n_batch) # total_len ~= n_batch * bs, this is to be divided again when getitem
        
    def __len__(self): # total number of batches
        return ((self.n_batch-1) // self.bptt) * self.bs
    
    def __getitem__(self, idx):
        '''returns (x,y) pair where y is the next word of x
        '''
        source = self.batched_data[idx % self.bs] # row number in a batch
        seq_idx = (idx //self.bs) * self.bptt # ???? to be understood
        return source[seq_idx:seq_idx+self.bptt], source[seq_idx+1:seq_idx+self.bptt+1]
    
    def __repr__(self):
        return f'(x,y) batch maker for LM task (predict next word)\n Sentence length = {self.n_batch}' 

In [27]:
dl = DataLoader(LM_PreLoader(ll.train, shuffle=False), batch_size=64)

In [28]:
iter_dl = iter(dl)
x1,y1 = next(iter_dl)
vocab = proc_num.vocab # numericalization processor
x1_text = " ".join(vocab[o] for o in x1[0])
y1_text = " ".join(vocab[o] for o in y1[0])
x1_text, y1_text

("xxbos xxmaj alan xxmaj rickman & xxmaj emma xxmaj thompson give good performances with southern / xxmaj new xxmaj orleans accents in this detective flick . xxmaj it 's worth seeing for their scenes- and xxmaj rickman 's scene with xxmaj hal xxmaj holbrook . xxmaj these three actors xxunk to entertain us no matter what the movie , it seems . xxmaj the plot for the movie shows potential",
 "xxmaj alan xxmaj rickman & xxmaj emma xxmaj thompson give good performances with southern / xxmaj new xxmaj orleans accents in this detective flick . xxmaj it 's worth seeing for their scenes- and xxmaj rickman 's scene with xxmaj hal xxmaj holbrook . xxmaj these three actors xxunk to entertain us no matter what the movie , it seems . xxmaj the plot for the movie shows potential ,")

In [29]:
x1.size(), y1.size()

(torch.Size([64, 70]), torch.Size([64, 70]))

In [30]:
x1_text = " ".join(vocab[o] for o in x1[0])
x1_2ndrow_text = " ".join(vocab[o] for o in x1[1])
x1_text, x1_2ndrow_text

("xxbos xxmaj alan xxmaj rickman & xxmaj emma xxmaj thompson give good performances with southern / xxmaj new xxmaj orleans accents in this detective flick . xxmaj it 's worth seeing for their scenes- and xxmaj rickman 's scene with xxmaj hal xxmaj holbrook . xxmaj these three actors xxunk to entertain us no matter what the movie , it seems . xxmaj the plot for the movie shows potential",
 ') were obviously inspired by xxmaj sam xxmaj raimi . xxmaj but the camera work is a bad copy of what can be seen in " xxmaj the xxmaj evil xxmaj dead " and elsewhere . xxmaj some other users have written that they enjoyed the humor of this film but i did n\'t . \n\n xxmaj the film rather disturbed than entertained me . xxmaj it tries to combine')

In [31]:
print(dl.dataset)

(x,y) batch maker for LM task (predict next word)
 Sentence length = 421513


In [32]:
#export
# convenience functions for language model tasks
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    return (DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs),
            DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs))
def lm_databunchify(sd, bs, bptt, **kwargs):
    return DataBunch(*get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs))

In [33]:
# wrap everything together

bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt) # all raw data

In [34]:
data.train_ds[0][0].shape, data.train_ds[0][1].shape

(torch.Size([70]), torch.Size([70]))

# Batching for classification tasks (e.g. sentiment analysis)
- Label data by the folder they come from
- Padding to make batches have the same sizes
- To avoid mixing very long texts with very short ones, we will also use `Sampler` to sort (with a bit of randomness for the training set) our samples by length.

In [55]:
proc_cat = CategoryProcessor()

In [56]:
# start from beginning
il = TextList.from_files(file_path, include=['train','test'])
train_valid_splitter = partial(grandparent_splitter, valid_name='test')
sd = SplitData.split_by_func(il, train_valid_splitter)
ll = label_by_func(sd, parent_labeler, 
                   proc_x = [proc_tok, proc_num],
                   proc_y = proc_cat)

In [57]:
set(ll.y) # only two labels because only two classes: pos and neg

{0, 1}

In [58]:
ll.train

LabeledData
x: TextList (25000 items)
[[2, 7, 761, 27, 43, 13, 8, 138, 7, 2266, 6263, 10, 19, 31, 1002, 14, 42, 18107, 14, 16, 22, 2253, 10, 1875, 152, 2559, 14, 12, 6406, 329, 9, 24, 7, 5934, 2303, 8, 31, 51, 476, 7, 11479, 22, 3366, 10, 11, 54, 22, 12, 1048, 196, 34, 67, 833, 9, 3], [2, 7, 89, 92, 14794, 18, 10, 8, 231, 676, 142, 17, 4464, 11, 18, 276, 8, 29, 205, 759, 14, 126, 9, 215, 10, 46, 12469, 14, 114, 14794, 1396, 11, 3580, 9, 7, 1719, 88, 10, 165, 118, 676, 181, 4301, 228, 119, 46, 1829, 8, 107, 43, 10, 18, 406, 10, 1795, 29840, 66, 66, 66, 7, 166, 10, 81, 7, 560, 332, 15, 2079, 13, 10966, 8, 902, 21, 154, 17, 7, 5871, 411, 73, 46, 1019, 175, 43, 13, 151, 18101, 13526, 13, 12, 29, 66, 21, 7, 14794, 5965, 78, 37, 42, 1256, 12, 97, 29, 10, 17, 212, 16, 78, 37, 42, 76, 1256, 48, 9179, 13, 12, 97, 29, 10, 28, 16, 475, 72, 14, 68, 93, 20, 9, 7, 26, 18, 3180, 14, 866, 172, 16, 10, 18, 1966, 20, 77, 543, 676, 14, 8906, 10, 11, 18, 3598, 27060, 20, 8, 139, 261, 51, 8, 1588, 73, 7476

In [60]:
import pickle
pickle.dump(ll, open(data_dir/'ll_clas.pkl', 'wb'))

In [None]:
ll = pickle.load(open(path/'ll_clas.pkl', 'rb'))

In [61]:
[(ll.train.x_obj(i), ll.train.y_obj(i)) for i in [1,12552]]

[('xxbos xxmaj well ... tremors i , the original started off in 1990 and i found the movie quite enjoyable to watch . however , they proceeded to make tremors ii and iii . xxmaj trust me , those movies started going downhill right after they finished the first one , i mean , ass blasters ? ? ? xxmaj now , only xxmaj god himself is capable of answering the question " why in xxmaj gods name would they create another one of these dumpster dives of a movie ? " xxmaj tremors iv can not be considered a bad movie , in fact it can not be even considered an epitome of a bad movie , for it lives up to more than that . xxmaj as i attempted to sit though it , i noticed that my eyes started to bleed , and i hoped profusely that the little girl from the ring would crawl through the tv and kill me . did they really think that dressing the people who had stared in the other movies up as though they we \'re from the wild west would make the movie ( with the exact same occurrences ) any better ? honestl

### samplers
- ???

In [62]:
#export
from torch.utils.data import Sampler

class SortSampler(Sampler): # for validation set
    ''' Get indices of docs that is reverse-sorted by key (e.g. get 
    indices of the documents from longest to shortest)'''
    def __init__(self, data_source, key):
        self.data_source = data_source
        self.key = key
    def __len__(self):
        return len(self.data_source)
    def __iter__(self): # feed longest text first, return indices of the correspond texts
        return iter(sorted(list(range(len(self.data_source))), key = self.key, reverse=True))


For the training set, we want some kind of randomness on top of this. So first, we shuffle the texts and build megabatches of size `50 * bs`. We sort those megabatches by length before splitting them in 50 minibatches. That way we will have randomized batches of roughly the same length.

Then we make sure to have the biggest batch first and shuffle the order of the other batches. We also make sure the last batch stays at the end because its size is probably lower than batch size.

In [63]:
#export
# ??? needs to read more carefully
class SortishSampler(Sampler):
    '''
    Note: key is a callable function
    '''
    def __init__(self, data_source,key, bs):
        self.data_source = data_source
        self.key = key
        self.bs = bs
    def __len__(self) -> int:
        return len(self.data_source)
    def __iter__(self):
        idxs = torch.randperm(len(self.data_source))
        megabatches = [idxs[i:i+self.bs*50] for i in range(0, len(idxs), self.bs*50)] # 50 times bigger megabatch
        sorted_idx = torch.cat([torch.tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches]) # reverse-sort the megabatches by key
        batches = [sorted_idx[i:i+self.bs] for i in range(0, len(sorted_idx), self.bs)] # extract batch indices from megabatch indices
        max_idx = torch.argmax(tensor([self.key(ck[0]) for ck in batches]))  # find the chunk with the largest key,
        batches[0], batches[max_idx] = batches[max_idx],batches[0] # then make sure it goes first.
        batch_idxs = torch.randperm(len(batches)-2) #excluding begin and end
        sorted_idx = torch.cat([batches[i+1] for i in batch_idxs]) if len(batches) > 1 else torch.LongTensor([])
        sorted_idx = torch.cat([batches[0], sorted_idx, batches[-1]])
        return iter(sorted_idx)
        

### Padding
- Padding token has id = 1 by default (see default_spec_tok), remember to change the pad_idx if this is changed!
- Pad each sequence **in the end** so that they all have same size when batching them together
- Padding in the end in order to use Pytorch convenience functions in future that allows us to ignore the padding
- Pad all docs to the length of the longest doc
- Longest sequences are in the first batch, all other batches are shuffled but they have similar lengths inside each batch

In [64]:
#export
def pad_collate(samples, pad_idx=1, pad_first=False):
    ''' '''
    max_len = max([len(s[0]) for s in samples]) # s[0] is x, s[1] is y
    res = torch.zeros(len(samples), max_len).long() + pad_idx # a giant matrix with all elements = pad_idx
    for i, s in enumerate(samples):
        if pad_first:
            res[i, -len(s[0]):] = torch.LongTensor(s[0])
        else:
            res[i, :len(s[0])] = torch.LongTensor(s[0])
    return res, torch.tensor([s[1] for s in samples]) # (x, y)


In [65]:
bs = 64
# sampler is a generator
train_sampler = SortishSampler(ll.train.x, key=lambda t: len(ll.train[int(t)][0]), bs = bs) # key = doc length, ll.train[int(t)] == doc #t
train_dl = DataLoader(ll.train, batch_size=bs, sampler = train_sampler, collate_fn=pad_collate)


In [67]:
iter_dl = iter(train_dl)
x,y = next(iter_dl)
x.shape, y.shape

(torch.Size([64, 3311]), torch.Size([64]))

In [68]:
# length of the texts after removing the padding
lengths = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lengths[:5], lengths[-1]

([3311, 1699, 1577, 1554, 1417], 1057)

In [74]:
# the other batches are random, but inside each batch the doc sizes are similar 
# (will be pad to max doc size)
x,y = next(iter_dl)
lengths = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lengths[:5], lengths[-1]


([175, 175, 175, 175, 175], 171)

In [75]:
x

tensor([[   2,    7, 3938,  ..., 1014,   92,    3],
        [   2,    7,    8,  ..., 7250,    9,    3],
        [   2,    7,   19,  ...,   29,   92,    3],
        ...,
        [   2,    7,   16,  ...,    1,    1,    1],
        [   2,   18,   87,  ...,    1,    1,    1],
        [   2,   18,  160,  ...,    1,    1,    1]])

In [76]:
#export
def get_clas_dls(train_ds, valid_ds, bs, **kwargs):
    train_sampler = SortishSampler(train_ds.x, key=lambda t: len(train_ds.x[t]), bs=bs)
    valid_sampler = SortSampler(valid_ds.x, key=lambda t: len(valid_ds.x[t]))
    return (DataLoader(train_ds, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate, **kwargs),
            DataLoader(valid_ds, batch_size=bs*2, sampler=valid_sampler, collate_fn=pad_collate, **kwargs))

def clas_databunchify(sd, bs, **kwargs):
    return DataBunch(*get_clas_dls(sd.train, sd.valid, bs, **kwargs))


In [77]:
bs,bptt = 64,70
data = clas_databunchify(ll, bs)

In [78]:
data

<exp.nb_XLibrary.DataBunch at 0x1a1fc94160>

# Export

In [None]:
!python notebook2script.py test_preprocessing_Lesson6