In [3]:
# Make the repository root importable so the `src.*` modules used below
# (src.dataloader, src.models) resolve when the notebook runs from its own
# sub-directory.
import sys
sys.path.append('..')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and set it to mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [168]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from fastai.text import *
from src.dataloader import DatasetStream
from src.models import ArCosModel

### 1. Datasets and Dataloaders

Link to download USE embeddings: 
[Train](https://drive.google.com/file/d/1g2izNAlAnszU_PNALYQJWUqHWoBCmfM9/view?usp=sharing)

In [9]:
# Directory holding the pre-computed Universal Sentence Encoder (USE)
# embeddings and labels (see the download link above).
DATA = Path('../data/universal-sentence-encoder/')

# Config
# Mini-batch size shared by the training and validation dataloaders.
bs = 48

In [10]:
# Because of the large memory requirements of the embedding array, it is opened
# as a read-only memory-mapped numpy array (mmap_mode='r').
#
# NOTE: `copy=False` is essential here. A plain `.astype(np.float32)` always
# allocates a fresh in-memory array, which pulls the whole file into RAM and
# silently discards the memory map. With `copy=False` the memmap itself is
# returned when the data is already float32 on disk; a converted copy is made
# only if the stored dtype differs.
qnemb = np.load(str(DATA/'qnemb.npy'), mmap_mode='r').astype(np.float32, copy=False)

# The label vector holds a single value per row, so materialising it as an
# ordinary in-memory float32 array is cheap and kept as-is.
qnlabels = np.load(str(DATA/'qnlabels.npy'), mmap_mode='r').astype(np.float32)

# Get CV idxs: hold out 10% of the rows for validation. The fixed random_state
# keeps the split reproducible across runs.
total = qnlabels.shape[0]
idxs = np.arange(total)
trn_idxs, val_idxs = train_test_split(idxs, test_size=0.1, random_state=0)

# Make Datasets: both share the same underlying arrays and differ only in the
# row indices they are given.
trn_ds = DatasetStream(qnemb, qnlabels, trn_idxs)
val_ds = DatasetStream(qnemb, qnlabels, val_idxs)

# Make Dataloaders: shuffle only the training stream; validation order is fixed.
trn_dl = DataLoader(trn_ds, batch_size=bs, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=bs, shuffle=False)

# Create fast.ai Model Data Object ('.' is the path argument handed to ModelData)
md = ModelData('.', trn_dl, val_dl)

In [28]:
# Sanity check: pull a single mini-batch from the training loader and show the
# shapes of the inputs (x) and targets (y).
batch_iter = iter(md.trn_dl)
x, y = next(batch_iter)
print(x.shape, y.shape)

torch.Size([48, 2, 512]) torch.Size([48])


The middle dimension (size 2) indexes the two questions in each pair; the last dimension (size 512) holds the embedding of each question.

### 2. Baseline Model - ArCos

As a baseline, we follow the distance formulation used in the original "Universal Sentence Encoder" paper for judging the similarity of sentence pairs: we first compute the cosine similarity between the embedding vectors of the two questions in a pair, then apply arccos to convert it into an angular distance. Finally, we map these distances to probabilities using a Logistic Regression classifier.

In [169]:
# Baseline model built from the training and validation dataloaders.
# NOTE(review): class_weight="balanced" is presumably forwarded to the
# underlying logistic regression to offset the class imbalance — confirm in
# src.models.ArCosModel.
marcos = ArCosModel(
    trn_dl,
    val_dl,
    class_weight="balanced",
)

# Fit the baseline, then report its metrics.
marcos.fit()
marcos.evaluate()

Completed 1 batches
Completed 2001 batches
Completed 4001 batches
Completed 6001 batches
Completed all batches!


[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  1.0min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.1min finished


Completed 1 batches
Completed all batches!
Accuracy:  0.7180489252764105
Negative Log loss:  0.537030617229838
             precision    recall  f1-score   support

        0.0       0.84      0.68      0.75     25520
        1.0       0.59      0.78      0.67     14909

avg / total       0.75      0.72      0.72     40429



### 3. Feedforward Neural Net