# Assignment 8

Develop a model for 20 news groups dataset. Select 20% of data for test set.  

Use metric learning with siamese networks and triplet loss.   
Use KNN and LSH (`annoy` library) for final prediction after the network was trained.

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors. 

Your quality = accuracy score

In [1]:
import annoy
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
import pandas as pd
import gensim
from tqdm import tqdm_notebook

from sklearn import metrics
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader, Dataset, TensorDataset

from torchtext import data

SEED = 42
np.random.seed(SEED)



In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# retrieve dataset
data = fetch_20newsgroups(subset='train')
data_test = fetch_20newsgroups(subset='test')

X = np.array(data['data'])
y = data['target']

X_test = np.array(data_test['data'])
ytest = data_test['target']

X.shape, y.shape

((11314,), (11314,))

In [4]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

X = [' '.join(stemmer.stem(word) for word in tokenizer.tokenize(post)) for post in X]
X_test = [' '.join(stemmer.stem(word) for word in tokenizer.tokenize(post)) for post in X_test]

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=5, stop_words='english', ngram_range=(1,3), max_features=50000)

X = tfidf.fit_transform(X)
X_test = tfidf.fit_transform(X_test)

y = np.array(y)
ytest = np.array(ytest)

X.shape, y.shape, X_test.shape, ytest.shape

((11314, 50000), (11314,), (7532, 50000), (7532,))

In [6]:
y_train = tt.from_numpy(y).float()

In [14]:
from allennlp.modules.elmo import Elmo, batch_to_ids

ModuleNotFoundError: No module named 'allennlp'

In [11]:
third = X.shape[0]//3
xq1_train = batch_to_ids(X[:third,])
xq2_train = batch_to_ids(X[third + 1:third*2,])
xq3_train = batch_to_ids(X[third*2 +1:,])

NameError: name 'batch_to_ids' is not defined

In [None]:
batch_size = 32
train_loader = DataLoader(TensorDataset(xq1_train, xq2_train, xq3_train, y_train), batch_size=batch_size)

Надо сделать сиамскую сеть с тремя инпутами и считать loss по формуле triplet loss (расстояние между anchor и positive в квадрате) минус ( расстояние между anchor и negative в квадрате ) плюс margin альфа, где margin говорит, насколько сильно должны отличаться positive от negative). 

https://arxiv.org/pdf/1412.6622.pdf

In [14]:
def triplet_loss(anchor_embed, pos_embed, neg_embed):
    return tt.max(tt.tensor(0.0), tt.sum(F.cosine_similarity(anchor, neg)) - tt.sum(F.cosine_similarity(anchor, pos)) + tt.tensor(0.4))

# F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed)
    
    
class Tripletnet(nn.Module):
    def __init__(self, criterion):
        super(Tripletnet, self).__init__()
        self.criterion = criterion
        self.fc = nn.Linear(11314, 128)
        
    def branch(self, x):
        x = self.fc(x)
        return x

    def forward(self, anchor, pos, neg):
        anchor = self.branch(anchor)
        pos = self.branch(pos)
        neg = self.branch(neg)
        return triplet_loss(anchor, pos, neg)

In [12]:
# some routines

def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [15]:
model = Tripletnet(triplet_loss)

optimizer = optim.Adam(model.parameters())

nn_train(model, train_loader, triplet_loss, optimizer, n_epochs=2)

NameError: name 'loader' is not defined