# Knock Knock Joke Generator
## Team Compose

### Import Section

In [17]:
%reload_ext autoreload
%autoreload
%matplotlib inline

In [18]:
from fastai.learner import *
import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

### Data setup

In [19]:
# Relative directories that hold the training and the test text files
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

In [20]:
# Number of test examples
! ls -l {TEST_PATH} | wc -l

106


In [21]:
# Number of training examples
!ls -l {TRAIN_PATH} | wc -l

460


In [22]:
# Creating lists of training and test examples
train_files = !ls {TRAIN_PATH}
test_files = !ls {TEST_PATH}
train_files[:5]

['train_100.txt',
 'train_101.txt',
 'train_102.txt',
 'train_104.txt',
 'train_105.txt']

In [23]:
# Show the contents of the first training file
# (IPython's `!` capture returns the command output as a list of lines)
train_file = !cat {TRAIN_PATH}{train_files[0]}
train_file

["knock, knock. who's there! arfur! arfur who? arfur got!"]

### Creating the model

In [24]:
# Setting up the torchtext field: text is lower-cased and tokenized with spaCy.

# Load spaCy's English model up front; the return value is discarded.
# NOTE(review): the original author states this is required so that the tokenizer works — confirm.
spacy.load('en') 
TEXT = data.Field(lower=True, tokenize='spacy')

In [66]:
# bs: batch size handed to LanguageModelData.from_text_files
#     (presumably the number of parallel text streams the corpus is split into — confirm
#     against fastai's language-model loader).
# bptt: back-propagation-through-time length, also handed to from_text_files
#     (presumably roughly how many tokens each sequence in a mini-batch spans — confirm).
bs = 16
bptt = 20

In [67]:
# Map each split name to the directory that holds its text files.
# NOTE(review): 'validation' and 'test' both point at TEST_PATH, so the
# validation loss reported during training is measured on the test files.
FILES = {'train': TRAIN_PATH, 'validation': TEST_PATH, 'test': TEST_PATH}
FILES

{'train': 'train/', 'validation': 'test/', 'test': 'test/'}

In [68]:
# Build the language-model data object from the directories listed in FILES,
# using the TEXT field, with the batch size and bptt chosen in the cell that sets them.
# min_freq=1 is forwarded as-is (presumably the minimum token frequency for the vocabulary — confirm).
model_data = LanguageModelData.from_text_files(
    "",
    TEXT,
    **FILES,
    bs=bs,
    bptt=bptt,
    min_freq=1
)

In [69]:
# Number of mini-batches in the training data loader
len(model_data.trn_dl)

28

In [70]:
# Number of vocabulary tokens reported by the data object
model_data.nt

1181

In [71]:
# First ten entries of the index-to-string vocabulary table
TEXT.vocab.itos[:10]

['<unk>', '<pad>', '!', 'knock', 'who', '?', '.', ',', 'there', '<eos>']

In [72]:
# Look up the integer index assigned to the token 'the' in the string-to-index table
TEXT.vocab.stoi['the']

12

In [221]:
# Embedding size: passed to get_model as its second positional argument
embedding_matrix_vector_size = 500

In [222]:
# Hidden size per recurrent layer: passed to get_model as its third positional argument
hidden_activations_per_layer = 500

In [223]:
# Number of recurrent layers: passed to get_model as its fourth positional argument
number_of_layers = 2

In [224]:
# Optimizer factory: Adam with betas=(0.7, 0.99) instead of PyTorch's defaults (0.9, 0.999).
# The learner calls this partial to build the optimizer.
optimization_function = partial(optim.Adam, betas=(0.7, 0.99))

### Creating the learner

In [225]:
# Build the learner from the optimizer factory and the three size settings.
# The five keyword arguments are dropout rates; presumably (per fastai's
# language-model builder — confirm) they apply to: the input (dropouti),
# the embedding (dropoute), the recurrent weights (wdrop), the activations
# between layers (dropouth) and the output (dropout).
learner = model_data.get_model(
    optimization_function,
    embedding_matrix_vector_size,
    hidden_activations_per_layer,
    number_of_layers,
    dropouti=0.05,
    dropoute=0.02,
    wdrop=0.1,
    dropouth=0.05,
    dropout=0.05)

In [226]:
# Regularisation hook for the learner: seq2seq_reg with alpha=2 and beta=1.
# NOTE(review): presumably alpha/beta weight fastai's activation regularisation (AR) and
# temporal activation regularisation (TAR) penalties — confirm against fastai.lm_rnn.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [227]:
# Gradient clipping threshold: this limits the size of the gradients during
# training; it does not clip the learning rate.
# NOTE(review): fastai 0.7 appears to apply this via clip_grad_norm — confirm.
learner.clip=0.2

In [228]:
# First training run: learning rate 3e-4, weight decay 1e-6, 4 cycles.
# With cycle_len=1 and cycle_mult=2 the cycles last 1, 2, 4 and 8 epochs,
# which matches the 15 epochs shown in the recorded progress output.
learner.fit(3e-4, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      5.71224    5.144524  
    1      4.932596   4.472719                            
    2      4.393734   4.211653                            
    3      4.004725   3.970533                            
    4      3.707027   3.732093                            
    5      3.470103   3.746384                            
    6      3.32136    3.645633                            
    7      3.255934   3.543715                            
    8      3.122264   3.359014                            
    9      2.980833   3.295523                            
    10     2.889807   3.284526                            
    11     2.80832    3.246338                            
    12     2.751825   3.307751                            
    13     2.70912    3.243693                            
    14     2.675303   3.255986                            



[3.2559863062929515]

In [229]:
# Second training run: one 10-epoch cycle at a 10x higher learning rate (3e-3).
# NOTE(review): in the recorded output the validation loss rises (3.14 -> 3.54) while the
# training loss keeps falling (2.86 -> 1.84), which suggests over-fitting — consider
# fewer epochs or stronger regularisation.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      2.856145   3.139025  
    1      2.713763   3.167377                            
    2      2.569897   3.38607                             
    3      2.422053   3.432718                            
    4      2.264729   3.513473                            
    5      2.11268    3.656592                            
    6      2.007501   3.596226                            
    7      1.915398   3.566329                            
    8      1.886388   3.566361                            
    9      1.840334   3.536302                            



[3.536302279253475]

In [230]:
# Keep a direct handle on the learner's underlying model for text generation
model = learner.model

In [243]:
# Seed text that the model is asked to continue
ss = "who"

In [244]:
# Tokenize the seed text (wrapped in a list, i.e. a batch of one example)
s = [TEXT.tokenize(ss)]
# Convert the tokens into vocabulary indices for the model
t = TEXT.numericalize(s)
# Display the tokens joined back together to check the tokenization
' '.join(s[0])

'who'

In [245]:
# Feed the seed through the model with a batch size of 1.
# The first sub-module's batch size is switched to 1 for this single example.
model[0].bs = 1
try:
    # Put the model in evaluation mode and reset it before the forward pass
    # (reset presumably clears the recurrent hidden state — confirm).
    model.eval()
    model.reset()
    # Keep only the first returned value (the predictions); ignore the rest
    res, *_ = model(t)
finally:
    # Always restore the training batch size, even if the forward pass fails,
    # so that later training does not run with bs left at 1.
    model[0].bs = bs

In [246]:
# Indices of the ten highest-scoring vocabulary entries for the last position
next_words = res[-1].topk(10)[1]
# Translate those indices back into their tokens
[TEXT.vocab.itos[idx] for idx in to_np(next_words)]

['?', '!', 'you', 'there', 'is', "'s", '’s', 'the', '"', 'your']

In [247]:
# Greedy generation: print the seed, then emit 100 tokens one at a time,
# feeding each chosen token back into the model.
print(ss, "\n")
for i in range(100):
    # Indices of the two highest-scoring candidates for the last position
    n = res[-1].topk(2)[1]
    # Vocabulary index 0 is '<unk>': when it ranks first, take the runner-up instead
    n = n[0] if n.data[0] != 0 else n[1]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back in to get the predictions for the next step
    res, *_ = model(n[0].unsqueeze(0))
print('...')

who 

? <eos> knock , knock . who 's there ! ben ! ben who ? carrie - n of the door ! <eos> knock , knock . who 's there ! ben ! ben who ? carrie - n of the door ! <eos> knock , knock . who 's there ! ben ! ben who ? carrie - n of the door ! <eos> knock , knock . who 's there ! ben ! ben who ? carrie - n of the door ! <eos> knock , knock . who 's there ! ben ! ben who ? carrie ...
