In [1]:
#export
from exp.nb_11a import *

In [2]:
torch.cuda.is_available()

True

# Data

In [3]:
path = untar_data(URLs.IMDB)

In [4]:
path.ls()

[Path('/home/sandmann/.fastai/data/imdb/ll.clas.pkl'),
 Path('/home/sandmann/.fastai/data/imdb/imdb.vocab'),
 Path('/home/sandmann/.fastai/data/imdb/ll_clas.pkl'),
 Path('/home/sandmann/.fastai/data/imdb/tmp_clas'),
 Path('/home/sandmann/.fastai/data/imdb/tmp_lm'),
 Path('/home/sandmann/.fastai/data/imdb/test'),
 Path('/home/sandmann/.fastai/data/imdb/ld.pkl'),
 Path('/home/sandmann/.fastai/data/imdb/unsup'),
 Path('/home/sandmann/.fastai/data/imdb/train'),
 Path('/home/sandmann/.fastai/data/imdb/README')]

In [5]:
#export
def read_file(fn):
    with open(fn, 'r', encoding='utf8') as f: return f.read()

In [6]:
#export
class TextList(ItemList):
    @classmethod
    def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
        return cls(get_files(path, extensions, recurse, include), path, **kwargs)
    
    def get(self, i):
        if isinstance(i, Path): return read_file(i)
        return i

In [7]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [8]:
len(il.items)

100000

In [9]:
txt = il[0]; txt

"What could have been an excellent hostage movie was totally ruined by what apparently looks like a bored director ... there were so many directions that the movie could have taken ... a vampire slash-fest was not one of these!!! The last 45 mins. or so results in the movie being an absolutely ridiculous waste of time. ...and sex machine?? ... you gotta be kidding me! The acting talents of the likes of Juliette Lewis and Harvey Keitel (not to mention George Clooney) are completely wasted in this nonsensical movie. <br /><br />The director... Robert Rodriguez, known for his other gory flicks including el mariachi, desperado, once upon a time in Mexico, and the very recent sin city ... really holds your attention with the well executed first half ... which leads you to believe that you are in for an entertaining time ... but then apparently for no reason, and without any provocation, the madness starts ... there's even feeble attempts at parody and comedy ... truly exasperating!!"

In [10]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

# Tokenizing

In [11]:
#export
import spacy
import html

In [12]:
#export
#special tokens
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    "Replaces the <br /> by \n"
    re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
    return re_br.sub("\n", t)

def spec_add_spaces(t):
    "Add spaces around / and #"
    return re.sub(r'([/#])', r' \1 ', t)

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return re.sub(' {2,}', ' ', t)

def replace_rep(t):
    "Replace repetitions at the character level: cccc -> TK_REP 4 c"
    def _replace_rep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
    
def replace_wrep(t):
    "Replace word repetitions: word word word -> TK_WREP 3 word"
    def _replace_wrep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
    
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [13]:
replace_rep('dddd')

' xxrep 4 d '

In [14]:
replace_wrep('word word word word word ')

' xxwrep 5 word  '

In [15]:
#export
def replace_all_caps(x):
    "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
        else: res.append(t)
    return res

def deal_caps(x):
    "Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == '': continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
        res.append(t.lower())
    return res

def add_eos_bos(x): return [BOS] + x + [EOS]

default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]

In [16]:
replace_all_caps(['I', 'AM', 'SHOUTING'])

['I', 'xxup', 'am', 'xxup', 'shouting']

In [17]:
deal_caps(['My', 'name', 'is', 'Jeremy'])

['xxmaj', 'my', 'name', 'is', 'xxmaj', 'jeremy']

In [18]:
#export
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor

In [19]:
#export
def parallel(func, arr, max_workers=4):
    if max_workers < 2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers) as ex:
            return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results

In [33]:
#export
class TokenizeProcessor(Processor):
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        i,chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        toks = []
        if isinstance(items[0], Path): items = [read_file(i) for i in items]
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, [])
    
    def proc1(self, item): return self.proc_chunk([item])[0]
    
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

In [34]:
tp = TokenizeProcessor()

In [35]:
txt[:250]

'What could have been an excellent hostage movie was totally ruined by what apparently looks like a bored director ... there were so many directions that the movie could have taken ... a vampire slash-fest was not one of these!!! The last 45 mins. or '

In [36]:
' • '.join(tp(il[:100])[0])[:400]

█

'xxbos • xxmaj • what • could • have • been • an • excellent • hostage • movie • was • totally • ruined • by • what • apparently • looks • like • a • bored • director • ... • there • were • so • many • directions • that • the • movie • could • have • taken • ... • a • vampire • slash • - • fest • was • not • one • of • these • ! • ! • ! • xxmaj • the • last • 45 • mins • . • or • so • results • in '

# Numericalizing

In [38]:
#export 
import collections

class NumericalizeProcessor(Processor):
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2):
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
        
    def __call__(self, items):
        if self.vocab is None:
            freq = Counter(p for o in items for p in o)
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)

        if getattr(self, 'otoi', None) is None:
            self.otoi = collections.defaultdict(int, {v:k for k,v in enumerate(self.vocab)})
        return [self.proc1(o) for o in items]
    
    def proc1(self, item): return [self.otoi[o] for o in item]

    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    
    def deproc1(self, idx): return [self.vocab[i] for i in idx]

In [39]:
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()

In [40]:
%time ll = label_by_func(sd, lambda x: 0, proc_x=[proc_tok,proc_num])

CPU times: user 22.2 s, sys: 3.34 s, total: 25.6 s
Wall time: 1min 48s


In [41]:
ll.train.x_obj(0)

'xxbos xxmaj one of the problems with popular culture , especially when discussing the popular culture of the 1970s , is that mass media - especially television - is usually about four years behind \' underground \' media , primarily music . xxmaj many people think the \' xxmaj woodstock xxmaj generation " remained important throughout the 1970s ; actually , it was all over at xxmaj altamont in 1970 . xxmaj by 1972 , \' underground \' rock or the \' counterculture \' had moved east to xxmaj england and xxmaj led xxmaj zepplin , xxmaj black sabbath , and xxmaj david xxmaj bowie , early metal - heads and the so - called \' glam - rockers , \' who were all \' peace and love \' - not . xxmaj neither , in a darkly different vein , was xxmaj charles xxmaj manson \'s \' family . \' \n\n xxmaj this obvious pilot for a television show ( that , thankfully , was never picked up by the networks ) is attempting to come to terms with a culture that was already as withered as yesterday \'s flowers . 

In [42]:
pickle.dump(ll, open(path/'ld.pkl', 'wb'))

In [43]:
ll = pickle.load(open(path/'ld.pkl', 'rb'))

# Batching

In [44]:
from IPython.display import display,HTML
import pandas as pd

In [45]:
stream = """
In this notebook, we will go back over the example of classifying movie reviews we studied in part 1 and dig deeper under the surface. 
First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the Processor used in the data block API.
Then we will study how we build a language model and train it.\n
"""

In [46]:
tokens = np.array(tp([stream]))[0]

█

In [47]:
bs,seq_len = 6,15
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
xxbos,\n,xxmaj,in,this,notebook,",",we,will,go,back,over,the,example,of
classifying,movie,reviews,we,studied,in,part,1,and,dig,deeper,under,the,surface,.
\n,xxmaj,first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into
numbers,and,how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have
another,example,of,the,xxmaj,processor,used,in,the,data,block,api,.,\n,xxmaj
then,we,will,study,how,we,build,a,language,model,and,train,it,.,\n\n


In [48]:
bs,bptt = 6,5
for k in range(3):
    d_tokens = np.array([tokens[i*seq_len + k*bptt: i*seq_len + (k+1)*bptt] for i in range(bs)])
    df = pd.DataFrame(d_tokens)
    display(HTML(df.to_html(index=False, header=None)))

0,1,2,3,4
xxbos,\n,xxmaj,in,this
classifying,movie,reviews,we,studied
\n,xxmaj,first,we,will
numbers,and,how,to,customize
another,example,of,the,xxmaj
then,we,will,study,how


0,1,2,3,4
notebook,",",we,will,go
in,part,1,and,dig
look,at,the,processing,steps
it,.,xxmaj,by,doing
processor,used,in,the,data
we,build,a,language,model


0,1,2,3,4
back,over,the,example,of
deeper,under,the,surface,.
necessary,to,convert,text,into
this,",",we,'ll,have
block,api,.,\n,xxmaj
and,train,it,.,\n\n


In [49]:
#export
class LM_PreLoader():
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data,self.bs,self.bptt,self.shuffle = data,bs,bptt,shuffle
        total_len = sum([len(t) for t in data.x])
        self.n_batch = total_len // bs
        self.batchify()
        
    def __len__(self): return ((self.n_batch-1) //self.bptt) * self.bs
    
    def __getitem__(self,idx):
        source = self.batched_data[idx % self.bs]
        seq_idx = (idx // self.bs) * self.bptt
        return source[seq_idx: seq_idx+self.bptt], source[seq_idx+1: seq_idx+self.bptt+1]
    
    def batchify(self):
        texts = self.data.x
        if self.shuffle: texts = texts[torch.randperm(len(texts))]
        stream = torch.cat([tensor(t) for t in texts])
        self.batched_data = stream[:self.n_batch * self.bs].view(self.bs, self.n_batch)
    

In [50]:
dl = DataLoader(LM_PreLoader(ll.valid, shuffle=True), batch_size=64)

In [51]:
iter_dl = iter(dl)
x1, y1 = next(iter_dl)
x2, y2 = next(iter_dl)

In [52]:
x1.size(), y1.size()

(torch.Size([64, 70]), torch.Size([64, 70]))

In [53]:
vocab = proc_num.vocab

In [54]:
len(vocab)

60003

In [55]:
x1[0]

tensor([   2,    7,   19,   29,   25,   12,  477,   13,   75,    9,    7,   16,
         308,  357,   10,  206, 3027,   10, 1777,  893,   10,  757, 7178,   11,
         208,   14,   42, 1003, 1288,   52,   32,  796, 7446, 7621,   12, 3468,
        7274,    9,    7,   16,   91,   35,  600,    9,   24,    7,   16,  787,
          88,   20,   52,   94,  795,   96,   42,  300,  103,   12,   29,   10,
          16,   25,  702,   70, 9801,  244,   10,   11,   41,   48])

In [56]:
" ".join(vocab[o] for o in x1[0])

"xxbos xxmaj this movie was a waste of time . xxmaj it looks nice , pretty settings , nicely acted , appears earnest and seems to be leading somewhere so you stay tuned awaiting a meaningful payoff . xxmaj it does n't happen . \n\n xxmaj it surprised me that so much effort could be put into a movie , it was clearly very professionally done , and have an"

In [57]:
" ".join(vocab[o] for o in y1[0])

"xxmaj this movie was a waste of time . xxmaj it looks nice , pretty settings , nicely acted , appears earnest and seems to be leading somewhere so you stay tuned awaiting a meaningful payoff . xxmaj it does n't happen . \n\n xxmaj it surprised me that so much effort could be put into a movie , it was clearly very professionally done , and have an outcome"

In [58]:
" ".join(vocab[o] for o in x2[0])

"outcome that seems nothing short of a b - movie . \n\n xxmaj save your precious time and see a good french film like xxmaj les xxmaj visiteurs ( funny ) , xxmaj jean de xxmaj xxunk or xxmaj manon of the xxmaj spring . i ca n't recall the language in xxmaj europa xxmaj europa , but that 's another xxmaj great film -- heavy but very worth viewing"

In [59]:
#export
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    return (DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs),
            DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=bs*2, **kwargs))

def lm_databunchify(sd, bs, bptt, **kwargs):
    return DataBunch(*get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs))

In [60]:
bs,bptt = 64, 70
data = lm_databunchify(ll, bs, bptt)

# Batching for classification

In [61]:
proc_cat = CategoryProcessor()

In [62]:
il = TextList.from_files(path, include=['train', 'test'])
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test'))
ll = label_by_func(sd, parent_labeler, proc_x=[proc_tok, proc_num], proc_y=proc_cat)

█

In [63]:
pickle.dump(ll, open(path/'ll.clas.pkl', 'wb'))

In [64]:
ll = pickle.load(open(path/'ll.clas.pkl', 'rb'))

In [65]:
(path/'train').ls()

[Path('/home/sandmann/.fastai/data/imdb/train/labeledBow.feat'),
 Path('/home/sandmann/.fastai/data/imdb/train/neg'),
 Path('/home/sandmann/.fastai/data/imdb/train/unsupBow.feat'),
 Path('/home/sandmann/.fastai/data/imdb/train/pos')]

In [66]:
[(ll.train.x_obj(i), ll.train.y_obj(i)) for i in [1,1000]]

[('xxbos xxmaj note to xxmaj horror fans : xxmaj the only horror here is when you realized you just wasted 95 minutes of your life on a movie that \'s so worthless it \'s insulting . \n\n i watched this because : \n\n xxmaj the premise sounded slightly promising : xxmaj it \'s not . xxmaj it \'s just an excuse to use the same lame set pieces from other low - budget slasher films that were n\'t good either . \n\n xxmaj the promise of naked forest nymphs sounded nice even if the movie turned out to be awful : xxmaj it \'s not . xxmaj it \'s so not . xxmaj the amateur cinematography makes sure the " fallen angels " are about as sexy as the average homeless person . \n\n xxmaj the name xxmaj tom xxmaj savini has a long history in the horror genre : xxmaj he \'s the king of low - budget special effects and lower - budget acting . xxmaj come to think of it , xxmaj savini should have been a reason not to watch this movie . xxmaj it \'s not that he \'s bad , but he \'s almost always in bad mov

In [67]:
#export
from torch.utils.data import Sampler

class SortSampler(Sampler):
    def __init__(self, data_source, key): self.data_source,self.key = data_source,key
    def __len__(self): return len(self.data_source)
    def __repr__(self):
        return iter(sorted(list(range(len(self))), key=self.key, reverse=True))

In [68]:
#export
class SortishSampler(Sampler):
    def __init__(self, data_source, key, bs): self.data_source,self.key,self.bs = data_source,key,bs
    def __len__(self) -> int: return len(self.data_source)
    def __iter__(self):
        idxs = torch.randperm(len(self))
        megabatches = [idxs[i:i+self.bs*50] for i in range(0, len(idxs), self.bs*50)]
        sorted_idx = torch.cat([tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches])
        batches = [sorted_idx[i: i+self.bs] for i in range(0, len(sorted_idx), self.bs)]
        max_idx = torch.argmax(tensor([self.key(ck[0]) for ck in batches]))
        batches[0],batches[max_idx] = batches[max_idx],batches[0]
        batch_idxs = torch.randperm(len(batches) - 2)
        sorted_idx = torch.cat([batches[i+1] for i in batch_idxs]) if len(batches) > 1 else LongTensor([])
        sorted_idx = torch.cat([batches[0], sorted_idx, batches[-1]])
        return iter(sorted_idx)

In [69]:
#export
def pad_collate(samples, pad_idx=1, pad_first=False):
    max_len = max([len(s[0]) for s in samples])
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    for i,s in enumerate(samples):
        if pad_first: res[i,  -len(s[0]):] = LongTensor(s[0])
        else:         res[i, :len(s[0])  ] = LongTensor(s[0])
        
    return res, tensor([s[1] for s in samples])

In [70]:
bs = 64
train_sampler = SortishSampler(ll.train.x, key=lambda t: len(ll.train[int(t)][0]), bs=bs)
train_dl = DataLoader(ll.train, bs, sampler=train_sampler, collate_fn=pad_collate)

In [71]:
iter_dl = iter(train_dl)
x,y = next(iter_dl)

IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number

In [None]:
lengths = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lengths[:5], lengths[-1]

In [None]:
x,y = next(iter_dl)
lenghts = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lenghts[:5], lengths[-1]

In [None]:
x

In [1]:
#export
def get_clas_dls(train_ds, valid_ds, bs, **kwargs):
    train_sampler = SortishSampler(train_ds.x, key=lambda t: len(train_ds.x[t]), bs=bs)
    valid_sampler = SortSampler(valid_ds.x, key=lambda t: len(valid_ds.x[t]))
    return (DataLoader(train_ds, bs, sampler=train_sampler, collate_fn=pad_collate, **kwargs),
            DataLoader(valid_ds, bs*2, sampler=valid_sampler, collate_fn=pad_collate, **kwargs))

def clas_databunchify(sd, bs, **kwargs):
    return DataBunch(*get_clas_dls(sd.train, sd.valid, bs, **kwargs))

In [None]:
bs,bptt = 64,70
data = clas_databunchify(ll, bs)

In [2]:
!python notebook2script.py 12_text.ipynb





Converted 12_text.ipynb to exp/nb_12.py
