In [0]:
!pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/bb/bb/041115d8bad1447080e5d1e30097c95e4b66e36074277afce8620a61cee3/allennlp-0.9.0-py3-none-any.whl (7.6MB)
[K     |████████████████████████████████| 7.6MB 3.1MB/s 
[?25hCollecting ftfy
[?25l  Downloading https://files.pythonhosted.org/packages/ec/d8/5e877ac5e827eaa41a7ea8c0dc1d3042e05d7e337604dc2aedb854e7b500/ftfy-5.7.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 9.2MB/s 
[?25hCollecting pytorch-transformers==1.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/89/ad0d6bb932d0a51793eaabcf1617a36ff530dc9ab9e38f765a35dc293306/pytorch_transformers-1.1.0-py3-none-any.whl (158kB)
[K     |████████████████████████████████| 163kB 54.2MB/s 
Collecting flask-cors>=3.0.7
  Downloading https://files.pythonhosted.org/packages/78/38/e68b11daa5d613e3a91e4bf3da76c94ac9ee0d9cd515af9c1ab80d36f709/Flask_Cors-3.0.8-py2.py3-none-any.whl
Collecting word2number>=1.1
  Downloading https://files

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchtext import data

import re

In [0]:
from allennlp.modules.elmo import Elmo

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)

100%|██████████| 336/336 [00:00<00:00, 418931.67B/s]
100%|██████████| 374434792/374434792 [00:08<00:00, 44401828.66B/s]


In [0]:
#device = tt.device('cuda') if tt.cuda.is_available() else tt.device('cpu')

In [0]:
import spacy
from spacy.symbols import ORTH

spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

spacy_en.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])

In [0]:
def tokenizer(text): 
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [0]:
def get_news(text):
  text = re.sub(r'([<| ].*?(\.edu)|(\.com)|(\.se)(>)?)', '', text)
  text = re.sub('\W+', ' ', text)
  return text

In [0]:
news = fetch_20newsgroups(subset='all')

In [0]:
df = pd.Series(news.data)
df = pd.DataFrame(df) 
df.columns = ['Data'] + df.columns.tolist()[1:]
df['target'] = pd.Series(news.target)

In [0]:
df['Data'] = df.apply(lambda row: get_news(row['Data']), axis=1)

In [0]:
df.head()

Unnamed: 0,Data,target
0,From Subject Pens fans reactions Organization ...,10
1,From Matthew B Lawson Subject Which high perfo...,3
2,From hilmi er dsv su Hilmi Eren Subject Re ARM...,17
3,From guyd austin ibm Guy Dawson Subject Re IDE...,3
4,From Subject driver Organization Sophomore Mec...,4


In [0]:
train, test, y_train, y_test = train_test_split(df.Data.values, df.target.values, test_size=0.2, random_state=42)

train, valid, y_train, y_valid = train_test_split(train, y_train, test_size=0.2, random_state=42)

In [0]:
#X_train = batch_to_ids(train.Data.values)
#y_train = tt.from_numpy(df_train.target.values).float()

#X_valid = batch_to_ids(valid.Data.values)
#y_valid = tt.from_numpy(valid.target.values).float()

#X_test = batch_to_ids(test.Data.values)
#y_test = tt.from_numpy(test.target.values).float()

In [0]:
def get_ids(data):
    word2id = {}
    id2word = {}
    i = 1
    max_len = 0
    
    for dtype in data:
        for comment in dtype:
            tokens = tokenizer(comment)
            if len(tokens) > max_len:
                max_len = len(tokens)
            
            for t in tokens:
                if not t in word2id:
                    word2id[t] = i
                    id2word[i] = t
                    i += 1
                    
    return word2id, id2word, max_len

In [0]:
def batch_to_ids(dtype, word2id, max_len):
    list_of_ids = []
    
    for comment in tqdm(dtype):
        comment_ids = []
        tokens = tokenizer(comment)
        
        for t in tokens:
            comment_ids.append(word2id[t])
        while len(comment_ids) < max_len:
            comment_ids.append(0)
        list_of_ids.append(comment_ids)
        
    return list_of_ids

In [0]:
word2id, id2word, max_len = get_ids((train, test, valid))

In [0]:
train_id = batch_to_ids(train, word2id, max_len)
valid_id = batch_to_ids(valid, word2id, max_len)
test_id = batch_to_ids(test, word2id, max_len)

100%|██████████| 12060/12060 [01:39<00:00, 121.15it/s]
100%|██████████| 3016/3016 [00:26<00:00, 113.92it/s]
100%|██████████| 3770/3770 [00:31<00:00, 118.69it/s]


In [0]:
def triplets(data, y):
    positive = []
    negative = []
    for i, comment in enumerate(data):
        positive.append(data[np.random.choice(np.where(y == y[i])[0])])
        negative.append(data[np.random.choice(np.where(y != y[i])[0])])
    
    return tt.tensor(positive), tt.tensor(negative)

In [0]:
tt.cuda.empty_cache()

batch_size = 32

pos, neg = triplets(test_id, y_test)
test_loader = DataLoader(TensorDataset(tt.tensor(test_id), pos, neg), batch_size=batch_size, drop_last=True)

pos, neg = triplets(train_id, y_train)
train_loader = DataLoader(TensorDataset(tt.tensor(train_id), pos, neg), batch_size=batch_size, drop_last=True)

pos, neg = triplets(valid_id, y_valid)
valid_loader = DataLoader(TensorDataset(tt.tensor(valid_id), pos, neg), batch_size=batch_size, drop_last=True)

In [0]:
def triplet_loss(anchor_embed, pos_embed, neg_embed):
    return F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed)


class Tripletnet(nn.Module): 
    def __init__(self, vocab_size, embed_size, criterion, n_classes):
        super(Tripletnet, self).__init__()
        self.n_classes = n_classes
        self.embedding = nn.Embedding(vocab_size, embed_size)
        #self.elmo = elmo
        self.criterion = criterion
        #self.relu = nn. ReLU()
        #self.fc = nn.Linear(1024*2, 128)                          
        self.fc = nn.Sequential(nn.Linear(embed_size, 128*3),
                                nn.ReLU(),
                                nn.Linear(128*3, 128*3),
                                nn.ReLU(),
                                nn.Linear(128*3, n_classes)
                                )
        #self.out = nn.Linear(128*3, n_classes)

    def branch(self, x):
        x = self.embedding(x)
        x = x.mean(dim=1)
        #x = self.elmo(x)
        #x = tt.cat(x, dim=-1)
        #x = x.mean(dim=1)
        x = self.fc(x)

        return x       
        
    def forward(self, batch):     
        anchor = self.branch(batch[0])
        pos = self.branch(batch[1])
        neg = self.branch(batch[2])       
        
        return triplet_loss(anchor, pos, neg)


In [0]:
def _train_epoch(model, iterator, optimizer, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        loss = model(batch)
        loss = tt.abs(tt.mean(loss))
        #loss.mean().backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            loss = model(batch)
            loss = tt.abs(tt.mean(loss))
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch)
        valid_loss = _test_epoch(model, valid_iterator)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [0]:
n_classes = df.target.unique().shape[0]

In [0]:
n_classes

20

In [0]:
tt.cuda.empty_cache()

model = Tripletnet(len(word2id)+1, 512, nn.BCEWithLogitsLoss(), n_classes)
#model = Tripletnet(elmo, nn.BCEWithLogitsLoss(), n_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)
#model.to(device)

In [0]:
nn_train(model, train_loader, valid_loader, optimizer, n_epochs=2)

epoch 0: 100%|██████████| 376/376 [02:52<00:00,  2.21it/s, loss=0.00003]
epoch 1:   0%|          | 0/376 [00:00<?, ?it/s]

validation loss 0.00001


epoch 1: 100%|██████████| 376/376 [02:51<00:00,  2.20it/s, loss=0.00003]


validation loss 0.00001


In [0]:
for instance in list(tqdm._instances): 
    tqdm._decr_instances(instance)