In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [1]:
import os

if 'drive' in os.listdir():
    COLAB=True
    os.chdir('drive/dl-projects/transfer-learning-quora/notebooks/')
else:
    COLAB=False

import sys
sys.path.append('..')

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import torch
from fastai.text import *
from pathlib import Path
import dill as pickle
from src.dataloader import PairedDataset, PairedEmbedDataset
from sklearn.model_selection import StratifiedShuffleSplit

DATA = Path('../data/')
LM = DATA/'language-model/'
CLS = DATA/'classifier/'
TMP = DATA/'tmp'

if COLAB:  
    WT = Path('/content/models/wt103/')
else:
    WT = DATA/'wt/models/wt103/'

### 1. Load vocabulary and classifier data.

In [4]:
# Tokenized text
qntok = np.load(CLS/'cls_trn.npy')
qnlabels = np.load(CLS/'cls_trn_lbl.npy').astype(np.float32)

# Split into training and validation sets.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.1, random_state=0)
trn_idx, val_idx = next(iter(sss.split(qntok, qnlabels)))

trn, trn_lbl = qntok[trn_idx], qnlabels[trn_idx]
val, val_lbl = qntok[val_idx], qnlabels[val_idx]

# Load vocab
itos = pickle.load((DATA/'itos.p').open('rb'))
stoi = pickle.load((DATA/'stoi.p').open('rb'))
vs = len(itos)

### 2. Make DataSets and DataLoaders

In [5]:
q1_trn, q2_trn = trn[:, 0], trn[:, 1]
q1_val, q2_val = val[:, 0], val[:, 1]
lbl_trn = trn_lbl
lbl_val = val_lbl

In [6]:
em_sz=400
vs=len(itos)

In [7]:
class PairDataset(Dataset):
    def __init__(self, X, y): self.x1,self.x2,self.y = X[0],X[1],y
    def __getitem__(self, idx): return A(self.x1[idx], self.x2[idx], (T(self.y[idx]).float()))
    def __len__(self): return len(self.x1)

In [8]:
trn_ds = PairDataset(X=[q1_trn,q2_trn],y=(lbl_trn).T)
val_ds = PairDataset(X=[q1_val,q2_val],y=(lbl_val).T)

In [9]:
trn_ds.__getitem__(15)

[array([   4,  421,   12,  154,  512,   10,  340,   50, 1845,  217,  699,  528,    2]),
 array([   4,  421,   12,    0,   10,  340,   50, 1845,  217,  699,  528,    2]),
 array([1.], dtype=float32)]

In [10]:
bs=170

In [11]:
trn_dl = DataLoader(trn_ds, bs, transpose=True, transpose_y=True, num_workers=1, 
                    pad_idx=1, pre_pad=False) #, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, transpose_y=True, num_workers=1, 
                    pad_idx=1, pre_pad=False) #, sampler=val_samp)
md = ModelData(TMP, trn_dl, val_dl)

In [12]:
it = iter(trn_dl)
its = [next(it) for i in range(5)]
[(len(x1),len(x2),len(y)) for x1,x2,y in its]
[((x1.shape),(x2.shape),(y.shape)) for x1,x2,y in its]

[(torch.Size([37, 170]), torch.Size([51, 170]), torch.Size([170, 1])),
 (torch.Size([63, 170]), torch.Size([37, 170]), torch.Size([170, 1])),
 (torch.Size([35, 170]), torch.Size([47, 170]), torch.Size([170, 1])),
 (torch.Size([36, 170]), torch.Size([39, 170]), torch.Size([170, 1])),
 (torch.Size([35, 170]), torch.Size([35, 170]), torch.Size([170, 1]))]

### Classifier

In [13]:
bptt,em_sz,nh,nl = 20,400,1150,3
vs = len(itos)
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [14]:
dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.3

In [15]:
class PairBatchRNN(RNN_Encoder):
    def __init__(self, bptt, max_seq, *args, **kwargs):
        self.max_seq,self.bptt = max_seq,bptt
        super().__init__(*args, **kwargs)

    def concat(self, arrs):
        return [torch.cat([l[si] for l in arrs]) for si in range(len(arrs[0]))]

    def forward(self, input):
        #print (input[0].shape)
        sl,bs = input[0].size()
        for l in self.hidden:
            for h in l: h.data.zero_()
        raw_outputs0, raw_outputs1, outputs0, outputs1 = [],[],[],[]
        r0, o0 = super().forward(input[0])
        r1, o1 = super().forward(input[1])
        raw_outputs0.append(r0)
        raw_outputs1.append(r1)
        outputs0.append(o0)
        outputs1.append(o1)
        return self.concat(raw_outputs0), self.concat(raw_outputs1), self.concat(outputs0), self.concat(outputs1)

In [40]:
class LinearComparator(nn.Module):
    initrange=0.1
    def __init__(self, n_out, nhid, dropout, tie_encoder=None):
        super().__init__()
        self.decoder = nn.Linear(nhid, 200, bias=False)
        self.decoder2 = nn.Linear(200, 50, bias=False)
        self.decoder3 = nn.Linear(100, 1, bias=False)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.decoder2.weight.data.uniform_(-self.initrange, self.initrange)
        self.decoder3.weight.data.uniform_(-self.initrange, self.initrange)
        self.dropout = LockedDropout(dropout)
        self.bn = nn.BatchNorm1d(200)
        if tie_encoder: self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        raw_outputs0, raw_outputs1, outputs0, outputs1 = input
        output0 = self.dropout(outputs0[-1])
        output1 = self.dropout(outputs1[-1])
        
        decoded0 = self.decoder(output0[-1].view(output0.size(1), output0.size(2)))
        decoded1 = self.decoder(output1[-1].view(output1.size(1), output1.size(2)))
        result0 = F.relu(decoded0.view(decoded0.size(1),-1))
        result1 = F.relu(decoded1.view(decoded1.size(1),-1))
        
        result0 = F.relu(self.decoder2(self.bn(result0.view(result0.size(1),-1))))
        result1 = F.relu(self.decoder2(self.bn(result1.view(result1.size(1),-1))))
        
        result0 = self.decoder3(torch.cat((result0,result1),1))
        return result0, raw_outputs1, outputs1
    
class MultiSequentialRNN(SequentialRNN):
    def forward(self, *input):
        return super().forward(input)

In [41]:
#need to clean-up unwanted parameters
def get_rnn_comparator(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
                      dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5):
    rnn_enc = PairBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,
                      dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    return MultiSequentialRNN(rnn_enc, LinearComparator(n_tok, emb_sz, dropout=0.2, tie_encoder=None))

In [42]:
#need to clean-up unwanted parameters
m = get_rnn_comparator(bptt, 20*70, 0, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
          layers=[em_sz*3, 50, 0], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

In [43]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [44]:
learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
#learn.crit=binary_loss_crit
learn.crit=nn.BCEWithLogitsLoss()
learn.clip=.25
learn.metrics = [accuracy_thresh(0.5)]

In [45]:
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

In [46]:
wd = 1e-7
wd = 0
learn.load_encoder('lm1_enc')

In [47]:
learn.freeze_to(-1)

In [48]:
# learn.lr_find(start_lr=lrs*5, end_lr=lrs*50, linear=False)
# learn.sched.plot()

In [53]:
learn.fit(lrs/10, 1, wds=wd, cycle_len=1, use_clr=(32,3))

HBox(children=(IntProgress(value=0, description='Epoch: ', max=1), HTML(value='')))

epoch      trn_loss   val_loss   <lambda>                      
    0      0.640731   0.639699   0.632123  



[array([0.6397]), 0.6321229759531021]

In [55]:
learn.unfreeze()

In [56]:
learn.fit(lrs, 1, wds=wd, cycle_len=14, use_clr=(32,10))

HBox(children=(IntProgress(value=0, description='Epoch: ', max=14), HTML(value='')))

  0%|          | 1/2141 [00:01<46:45,  1.31s/it, loss=0.687]


RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1518244507981/work/torch/lib/THC/generic/THCStorage.cu:58

In [9]:
bs=48

In [10]:
class PairedDataset(Dataset):
    def __init__(self, x, y):
        self.x, self.y = x, y
        
    def __getitem__(self, i):
        return A(self.x[i, 0], (self.x[i, 1]), self.y[i])
    
    def __len__(self):
        return self.y.shape[0]

In [38]:
# Make datasets and dataloaders
trn_ds = PairedDataset(trn, trn_lbl)
val_ds = PairedDataset(val, val_lbl)

# transpose_y is currently a hack to transpose the second question as well. Submit PR to fastai!
trn_samp = SortishSampler(trn_ds, key=lambda x: trn_ds[x][0].shape[0], bs=bs)
val_samp = SortSampler(val_ds, key=lambda x: trn_ds[x][0].shape[0])

trn_dl = DataLoader(trn_ds, bs, num_workers=1, pad_idx=1, transpose=True, 
                    transpose_y=True, pre_pad=True, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, num_workers=1, pad_idx=1, transpose=True, 
                    transpose_y=True, pre_pad=True, sampler=val_samp)

In [39]:
x1,x2,y = next(iter(trn_dl))

In [40]:
j=3
' '.join([itos[i] for i in x1[:, j]]), ' '.join([itos[i] for i in x2[:, j]])

('_pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ lately , most of the inspiring success stories carry a certain trend - the neglect for proper / formal education . is this something worth sharing simply because most millionaires and billionaires nowadays are uneducated ? does this mean formal education is not necessary ?',
 '_pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ are there nowadays scientists without formal education ( autodidacts ) ? is it possible to do scientific research without following a formal education ?')

### 2. Load End-to-End Similarity Net

In [44]:
class MultiTwoBatchRNN(MultiBatchRNN):
    def __init__(self, bptt, max_seq, *args, **kwargs):
        super().__init__(bptt, max_seq, *args, **kwargs)
        super().reset()
        
    def forward(self, input):
        raw_outputs1, outputs1 = super().forward(input[0])
        raw_outputs2, outputs2 = super().forward(input[1])

        return raw_outputs1, outputs1, raw_outputs2, outputs2 


class PoolingDiffLinearClassifier(PoolingLinearClassifier):
#     def get_final_embedding(self, outputs):
#         output = outputs[-1]
#         sl, bs, _ = output.size()
#         avgpool = self.pool(output, bs, False)
#         mxpool = self.pool(output, bs, True)

#         return torch.cat([output[-1], mxpool, avgpool], 1)
#         return output[-1]
    
    def forward(self, inp):
        raw_outputs1, outputs1, raw_outputs2, outputs2 = inp
#         x1 = self.get_final_embedding(outputs1)
#         x2 = self.get_final_embedding(outputs2)
        x1, x2 = outputs1[-1][-1], outputs2[-1][-1]
        x = (x1 - x2).abs()

        for l in self.layers:
            l_x = l(x)
            x = F.relu(l_x)
            
        return l_x.view(-1)


class MultiSequentialRNN(SequentialRNN):
    def forward(self, *input):
        super().forward(input)

    
def get_question_siml_model(bptt, max_seq, n_tok, emb_sz, n_hid, nlayers, pad_token, layers, 
                            drops, bidir=False, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5):
                           
    birnn_enc = MultiTwoBatchRNN(bptt, max_seq, n_tok, emb_sz=emb_sz, nhid=n_hid, nlayers=nlayers, 
                                 pad_token=pad_token, bidir=bidir,
                                 dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    
    sim_net = PoolingDiffLinearClassifier(layers, drops)
    
    return MultiSequentialRNN(birnn_enc, sim_net)

In [45]:
# Model config
bptt = 70
pad_tok = 1
em_sz, nh, nl = 400, 1150, 3
dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.5


# Make model data
md = ModelData(TMP, trn_dl, val_dl)

# Similarity net
simnet = get_question_siml_model(bptt, 20*70, vs, em_sz, nh, nlayers=nl, 
                                 pad_token=1, layers=[em_sz*1, 200, 50, 1], 
                                 drops=[dps[4], 0.15, 0.1], dropouti=dps[0],
                                 wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
learn = RNN_Learner(md, TextModel(to_gpu(simnet)), crit=F.binary_cross_entropy_with_logits, 
                    opt_fn=optim.Adam, metrics=[accuracy_thresh(0.5)])

learn.load_encoder('lm1_enc')
learn.freeze_to(-1)

In [54]:
learn.freeze_to??

### 3. Start Training

#### Only head

In [None]:
wd=1e-7
learn.lr_find(1e-5, 5, wds=wd)

HBox(children=(IntProgress(value=0, description='Epoch: ', max=1), HTML(value='')))

 61%|██████▏   | 4660/7581 [23:32<08:32,  5.70it/s, loss=0.647]

In [None]:
learn.sched.plot()

In [27]:
learn.fit(1e-5, 1, cycle_len=1, wds=wd, use_clr=(10, 3))

NameError: name 'wd' is not defined

In [None]:
learn.save('clas_lst')

In [None]:
learn.sched.plot_loss()

#### End-to-end

In [10]:
# learn.load('clas_lst')

In [10]:
learn.unfreeze()
learn.clip=0.25

In [13]:
wd=1e-9
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

# learn.lr_find(lrs/1000, 5, wds=wd)
# learn.sched.plot()

In [None]:
learn.fit(5e-3, 1, cycle_len=8, wds=wd, use_clr=(30, 10))

HBox(children=(IntProgress(value=0, description='Epoch: ', max=8), HTML(value='')))

    1      0.59468    0.588937   0.63305                         
 86%|████████▌ | 6533/7581 [1:19:35<12:43,  1.37it/s, loss=0.584]

In [None]:
nn_evaluate(md.val_dl, learn.model)

In [None]:
learn.lr_find(1e-9, 2, wds=wd)

In [None]:
learn.sched.plot()