In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os

if 'drive' in os.listdir():
    COLAB=True
    os.chdir('drive/dl-projects/transfer-learning-quora/notebooks/')
else:
    COLAB=False

import sys
sys.path.append('..')

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import torch
from fastai.text import *
from pathlib import Path
import dill as pickle
from src.dataloader import PairedDataset, PairedEmbedDataset
from sklearn.model_selection import StratifiedShuffleSplit

DATA = Path('../data/')
LM = DATA/'language-model/'
CLS = DATA/'classifier/'
TMP = DATA/'tmp'

if COLAB:  
    WT = Path('/content/models/wt103/')
else:
    WT = DATA/'wt/models/wt103/'

### 1. Load vocabulary and classifier data.

In [4]:
# Tokenized text
qntok = np.load(CLS/'cls_trn.npy')
qnlabels = np.load(CLS/'cls_trn_lbl.npy').astype(np.float32)

# Split into training and validation sets.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.1, random_state=0)
trn_idx, val_idx = next(iter(sss.split(qntok, qnlabels)))

trn, trn_lbl = qntok[trn_idx], qnlabels[trn_idx]
val, val_lbl = qntok[val_idx], qnlabels[val_idx]

# Load vocab
itos = pickle.load((DATA/'itos.p').open('rb'))
stoi = pickle.load((DATA/'stoi.p').open('rb'))
vs = len(itos)

### 2. Make DataSets and DataLoaders

In [5]:
bs=48

In [6]:
class PairedDataset(Dataset):
    def __init__(self, x, y):
        self.x, self.y = x, y
        
    def __getitem__(self, i):
        return A(self.x[i, 0], (self.x[i, 1]), self.y[i])
    
    def __len__(self):
        return self.y.shape[0]

In [7]:
# Make datasets and dataloaders
trn_ds = PairedDataset(trn, trn_lbl)
val_ds = PairedDataset(val, val_lbl)

# transpose_y is currently a hack to transpose the second question as well. Submit PR to fastai!
trn_dl = DataLoader(trn_ds, bs, num_workers=1, pad_idx=1, shuffle=True, transpose=True, 
                    transpose_y=True, pre_pad=False)
val_dl = DataLoader(val_ds, bs, num_workers=1, pad_idx=1, shuffle=False, transpose=True, 
                    transpose_y=True, pre_pad=False)

### 2. Load End-to-End Similarity Net

In [8]:
class MultiTwoBatchRNN(MultiBatchRNN):
    def __init__(self, bptt, max_seq, *args, **kwargs):
        super().__init__(bptt, max_seq, *args, **kwargs)
        super().reset()
        
    def forward(self, input):
        raw_outputs1, outputs1 = super().forward(input[0])
        raw_outputs2, outputs2 = super().forward(input[1])

        return raw_outputs1, outputs1, raw_outputs2, outputs2 


class PoolingDiffLinearClassifier(PoolingLinearClassifier):
    def get_final_embedding(self, outputs):
        output = outputs[-1]
#         sl, bs, _ = output.size()
#         avgpool = self.pool(output, bs, False)
#         mxpool = self.pool(output, bs, True)

#         return torch.cat([output[-1], mxpool, avgpool], 1)
        return output[-1]
    
    def forward(self, x):
        raw_outputs1, outputs1, raw_outputs2, outputs2 = x
        x1 = self.get_final_embedding(outputs1)
        x2 = self.get_final_embedding(outputs2)
        x = (x1 - x2).abs()

        for l in self.layers:
            l_x = l(x)
            x = F.relu(l_x)
            
        return l_x.view(-1)


class MultiSequentialRNN(SequentialRNN):
    def forward(self, *input):
        for module in self._modules.values():
            input = module(input)
        return input

    
def get_question_siml_model(bptt, max_seq, n_tok, emb_sz, n_hid, nlayers, pad_token, layers, 
                            drops, bidir=False, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5):
                           
    birnn_enc = MultiTwoBatchRNN(bptt, max_seq, n_tok, emb_sz=emb_sz, nhid=n_hid, nlayers=nlayers, 
                                 pad_token=pad_token, bidir=bidir,
                                 dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    
    sim_net = PoolingDiffLinearClassifier(layers, drops)
    
    return MultiSequentialRNN(birnn_enc, sim_net)

In [9]:
# Model config
bptt = 70
pad_tok = 1
em_sz, nh, nl = 400, 1150, 3
dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.5


# Make model data
md = ModelData(TMP, trn_dl, val_dl)

# Similarity net
simnet = get_question_siml_model(bptt, 20*70, vs, em_sz, nh, nlayers=nl, 
                                 pad_token=1, layers=[em_sz*1, 200, 50, 1], 
                                 drops=[dps[4], 0.15, 0.1], dropouti=dps[0],
                                 wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
learn = RNN_Learner(md, TextModel(to_gpu(simnet)), crit=F.binary_cross_entropy_with_logits, 
                    opt_fn=optim.Adam, metrics=[accuracy_thresh(0.5)])

learn.load_encoder('lm1_enc')
learn.freeze_to(-1)

### 3. Start Training

#### Only head

In [None]:
wd=1e-7
learn.lr_find(1e-5, 5, wds=wd)

HBox(children=(IntProgress(value=0, description='Epoch: ', max=1), HTML(value='')))

 61%|██████▏   | 4660/7581 [23:32<08:32,  5.70it/s, loss=0.647]

In [None]:
learn.sched.plot()

In [27]:
learn.fit(1e-5, 1, cycle_len=1, wds=wd, use_clr=(10, 3))

NameError: name 'wd' is not defined

In [None]:
learn.save('clas_lst')

In [None]:
learn.sched.plot_loss()

#### End-to-end

In [10]:
# learn.load('clas_lst')

In [10]:
learn.unfreeze()
learn.clip=0.25

In [13]:
wd=1e-9
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

# learn.lr_find(lrs/1000, 5, wds=wd)
# learn.sched.plot()

In [14]:
learn.fit(5e-3, 1, cycle_len=8, wds=wd, use_clr=(30, 10))

HBox(children=(IntProgress(value=0, description='Epoch: ', max=8), HTML(value='')))

    1      0.59468    0.588937   0.63305                         
    2      0.604269   0.784184   0.688492                        
  3%|▎         | 263/7581 [03:11<1:23:44,  1.46it/s, loss=0.59] 

KeyboardInterrupt: 

In [None]:
nn_evaluate(md.val_dl, learn.model)

In [None]:
learn.lr_find(1e-9, 2, wds=wd)

In [None]:
learn.sched.plot()