In [1]:
import warnings
warnings.filterwarnings('ignore')

from TGA.utils import Dataset

from tqdm.notebook import tqdm
from TGA.utils import preprocessor
import copy

from time import time
import numpy as np
from itertools import repeat
from collections import Counter
from segtok import tokenizer as tk

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

In [4]:
dataset = Dataset('/home/datasets/webkb/')
fold = next(dataset.get_fold_instances(10, with_val=True))
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val'), 6553)

In [5]:
class Tokenizer(BaseEstimator, TransformerMixin):
    def __init__(self, mindf=2, stopwords='remove', model='list', lan='english', verbose=False):
        super(Tokenizer, self).__init__()
        self.mindf = mindf
        self.le = LabelEncoder()
        self.verbose = verbose
        self.stopwords = stopwords
        self.stopwordsSet = stop_words
        self.lan = lan
        self.model = model
        self.analyzer = TfidfVectorizer(preprocessor=preprocessor).build_analyzer()
        #self.analyzer = tk.web_tokenizer
    
    def fit(self, X, y):
        self.N = len(X)
        self.le.fit( y )
        self.n_class = len(self.le.classes_)

        self.term_freqs = Counter()
        docs = map(self.analyzer, X)
        for doc_in_terms in tqdm(docs, total=self.N, disable=not self.verbose):
            doc_in_terms = list(map( self._filter_fit_, doc_in_terms ))
            self.term_freqs.update(list(set(doc_in_terms)))
        self.node_mapper      = {'<BLANK>': 0}
        self.term_freqs       = { term:v for (term,v) in self.term_freqs.items() if v >= self.mindf }    
        self.node_mapper      = { term:self._get_idx_(term) for term in self.term_freqs.keys() if self._isrel_(term) }
        self.node_mapper['<UNK>'] = len(self.node_mapper)
        self.node_mapper['<BLANK>'] = 0
        self.vocab_size = len(self.node_mapper)
        
        return self
    def _isrel_(self, term):
        if self.stopwords == 'remove' and term in self.stopwordsSet:
            return False
        # put here your filter_functions
        return True
    def _get_idx_(self, term):
        # put here your idx_set_functions
        if self.stopwords == 'mark' and term in self.stopwordsSet:
            print('is stop', term)
            return self.node_mapper.setdefault('<STPW>', len(self.node_mapper))
        return self.node_mapper.setdefault(term, len(self.node_mapper))
    def _filter_transform_(self, term):
        if self.stopwords == 'mark' and term in self.stopwordsSet:
            return '<STPW>'
        if term not in self.node_mapper:
            return '<UNK>'
        return term
    def _filter_fit_(self, term):
        if self.stopwords == 'mark' and term in self.stopwordsSet:
            return '<STPW>'
        return term
    def _model_(self, doc):
        if self.model == 'set':
            return set(doc)
        return list(doc)
    def transform(self, X, verbose=None):
        verbose = verbose if verbose is not None else self.verbose
        n = len(X)
        doc_off = [0]
        terms_idx = []
        for i,doc_in_terms in tqdm(enumerate(map(self.analyzer, X)), total=n, disable=not verbose):
            doc_in_terms = filter( self._isrel_, doc_in_terms )
            doc_in_terms = map( self._filter_transform_, doc_in_terms )
            doc_in_terms = self._model_(doc_in_terms)
            doc_in_terms = [ self.node_mapper[tid] for tid in doc_in_terms ]
            if self.model == 'sorted':
                doc_in_terms = sorted(doc_in_terms)
            doc_off.append( len(doc_in_terms) )
            terms_idx.extend( doc_in_terms )
        return np.array( terms_idx ), np.array(doc_off)[:-1].cumsum()

In [6]:
tokenizer = Tokenizer(mindf=1, stopwords='keep', model='set', verbose=True)
tokenizer.fit(fold.X_train, fold.y_train)

  0%|          | 0/6553 [00:00<?, ?it/s]

Tokenizer(mindf=1, model='set', stopwords='keep', verbose=True)

In [7]:
y_train = tokenizer.le.transform( fold.y_train )
y_val   = tokenizer.le.transform( fold.y_val )
y_test  = tokenizer.le.transform( fold.y_test )

In [8]:
tokenizer.transform(fold.X_val[:2])

  0%|          | 0/2 [00:00<?, ?it/s]

(array([ 1427,    19,    38,  3165,  3221,  5618,    93,    95,  5017,
        17099,  6651, 21694,   809,  6580,  2402,    28, 10396,   426,
           45,    74,  5623,  4632,    75,   747,   235,   271, 11071,
          115,  6329,    14,  1719,   122, 19692,   125,  8617, 14274,
         1225,   129,   446,   307,   329,   211,    17,   476,  1095,
        11770,  7110,  2232,    93,     3,  1125,  1019, 23061,  5452,
        21403,  1589,  1021, 13806,  2142,  6867,  2500,  4847,   484,
         3436, 20145,   893,  2600,   144, 11996,  4725, 14190,   201,
         2012,  3980, 15824, 10603,  1112,   467,   203,  6134,  5481,
         5045,  8366, 10605,  3238,  7252,    15,  2246,   509,   511,
         7254,  2398,  1425,  7474,   282,    19,  1150,  2250,  6700,
         1751,  1972,  4409,  7137,  2255,  3785,  8187,    25,  1038,
         5418, 10127,  1100,  1620,    28,   103, 10610,   537,  6117,
         4615, 10182,  3659,   429,   921,  2100,  4807, 11614,  8709,
      

In [9]:
tokenizer.node_mapper['<BLANK>'], fold.X_val[0]

(0,
 'frank tropschuh 1 512 459 cs utexas usa 10 3 10 3 1071 clayton ln apt 333 15 austin tx 4310 curriculum vitae english deutsch links carnegie mellon university 1991 1995 undergraduate student school computer science universita erlangen nu 1993 1994 junior year abroad institut fu und department computer science iv operating systems university texas austin 1995 1996 graduate student department computer science frank tropschuh cs utexas \n')

In [10]:
def collate_train(param):
    X, y = zip(*param)
    terms_ids, docs_offsets = tokenizer.transform(X, verbose=False)
    return torch.LongTensor(terms_ids), torch.LongTensor(docs_offsets), torch.LongTensor(y)

In [11]:
class Mask(nn.Module):
    def __init__(self, negative_slope=1000, kappa=2.):
        super(Mask, self).__init__()
        self.negative_slope = negative_slope
        self.kappa = kappa
        self.sig = nn.Sigmoid()
    def forward(self, h):
        w = F.leaky_relu( h, negative_slope=self.negative_slope)
        w = self.sig(w-self.kappa)
        return w

In [12]:
help(torch.embedding_bag)

Help on built-in function embedding_bag:

embedding_bag(...)



In [13]:
class SimpleAttentionBag(nn.Module):
    def __init__(self, vocab_size, hiddens, nclass, drop=.5, initrange=.5, negative_slope=99.):
        super(SimpleAttentionBag, self).__init__()
        self.hiddens    = hiddens
        self.dt_emb     = nn.Embedding(vocab_size, hiddens)
        self.tt_emb     = nn.Embedding(vocab_size, hiddens)
        self.tt_dir_map = nn.Linear(hiddens, hiddens)
        self.fc         = nn.Linear(hiddens, nclass)
        self.initrange  = initrange 
        self.negative_slope = negative_slope
        self.drop       = nn.Dropout(drop)
        self.drop_      = drop
        self.sig        = nn.Sigmoid()
        self.init_weights()
    def forward(self, terms_idx, docs_offsets, return_mask=False):
        n = terms_idx.shape[0]
        batch_size = docs_offsets.shape[0]
        
        k         = [ terms_idx[ docs_offsets[i-1]:docs_offsets[i] ] for i in range(1, batch_size) ]
        k.append( terms_idx[ docs_offsets[-1]: ] )
        x_packed  = pad_sequence(k, batch_first=True, padding_value=0)

        bx_packed = x_packed == 0
        doc_sizes = bx_packed.logical_not().sum(dim=1).view(batch_size, 1)
        pad_mask  = bx_packed.logical_not()
        pad_mask  = pad_mask.view(*bx_packed.shape, 1)
        pad_mask  = pad_mask.logical_and(pad_mask.transpose(1, 2))
        
        dt_h     = self.dt_emb( x_packed )
        dt_h     = F.dropout( dt_h, p=self.drop_, training=self.training )
        #dt_h     = self.drop(dt_h)
        
        tt_h     = self.tt_emb( x_packed )
        tt_h     = F.dropout( tt_h, p=self.drop_, training=self.training )
        #tt_h     = self.drop(tt_h)
        dir_tt_h = self.tt_dir_map( tt_h )

        weights = torch.bmm( tt_h, dir_tt_h.transpose( 1, 2 ) )
        weights = F.leaky_relu( weights, negative_slope=self.negative_slope)
        
        weights[pad_mask.logical_not()] = float('-inf') # Set the 3D-pad mask values to -inf (=0 in softmax)
        weights = F.sigmoid(weights)
        #weights = F.softmax(weights, dim=2) # Normalize the neighbors weights
        weights = torch.where(torch.isnan(weights), torch.zeros_like(weights), weights) #replace nan to zero
        weights = weights.sum(axis=2) / doc_sizes
        weights[bx_packed] = float('-inf') # Set the 2D-pad mask values to -inf  (=0 in softmax)
        weights = F.softmax(weights, dim=1)
        #weights = F.sigmoid(weights)
        weights = weights.view( *weights.shape, 1 )
        
        docs_h = dt_h * weights
        docs_h = docs_h.sum(axis=1)
        docs_h = F.dropout( docs_h, p=self.drop_, training=self.training )
        #docs_h = self.drop(docs_h)
        return self.fc(docs_h), weights
    
    def init_weights(self):
        self.dt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.tt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.tt_dir_map.weight.data.uniform_(-self.initrange, self.initrange)
        self.fc.weight.data.uniform_(-self.initrange, self.initrange)

In [20]:
nepochs = 1000
max_epochs = 50
drop=0.5
device = torch.device('cuda:0')
batch_size = 8

In [21]:
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f9b6c03d3b0>

In [22]:
#sc = SimpleClassifier(tokenizer.vocab_size, 300, tokenizer.n_class, dropout=drop).to( device )
ab = SimpleAttentionBag(tokenizer.vocab_size, 300, tokenizer.n_class, drop=drop).to( device )

optimizer = optim.AdamW( ab.parameters(), lr=6e-3, weight_decay=5e-3)
loss_func_cel = nn.CrossEntropyLoss().to( device )
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=.95,
                                                       patience=10, verbose=True)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=.98, verbose=True)

In [23]:
best = 99999.
counter = 1
eps=0.6
old_loss_train = 1.
dl_val = DataLoader(list(zip(fold.X_val, y_val)), batch_size=batch_size,
                         shuffle=False, collate_fn=collate_train, num_workers=12)
for e in tqdm(range(nepochs), total=nepochs):
    dl_train = DataLoader(list(zip(fold.X_train, y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate_train, num_workers=12)
    loss_train  = 0.
    with tqdm(total=len(y_train)+len(y_val), smoothing=0., desc=f"Epoch {e+1}") as pbar:
        total = 0
        correct  = 0
        ab.train()
        for i, (terms_idx, docs_offsets, y) in enumerate(dl_train):
            terms_idx    = terms_idx.to( device )
            docs_offsets = docs_offsets.to( device )
            y            = y.to( device )
            
            pred_docs,_ = ab( terms_idx, docs_offsets)
            pred_docs = F.softmax(pred_docs)
            loss = loss_func_cel(pred_docs, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_train += loss.item()
            total      += len(y)
            y_pred      = pred_docs.argmax(axis=1)
            correct    += (y_pred == y).sum().item()
            
            toprint  = f"Train loss: {loss_train/(i+1):.5}/{loss.item():.5} "
            toprint += f'ACC: {correct/total:.5}'
            
            print(toprint, end=f"{' '*100}\r")
            
            pbar.update( len(y) )
            del pred_docs, loss
            del terms_idx, docs_offsets, y
            del y_pred
        loss_train = loss_train/(i+1)
        total = 0
        correct  = 0
        ab.eval()
        with torch.no_grad():
            loss_val = 0.
            print()
            for i, (terms_idx, docs_offsets, y) in enumerate(dl_val):
                terms_idx    = terms_idx.to( device )
                docs_offsets = docs_offsets.to( device )
                y            = y.to( device )

                pred_docs,weights = ab( terms_idx, docs_offsets )
                pred_docs   = F.softmax(pred_docs)

                y_pred      = pred_docs.argmax(axis=1)
                correct    += (y_pred == y).sum().item()
                total      += len(y)
                loss2       = loss_func_cel(pred_docs, y)
                loss_val   += loss2

                print(f'Val loss: {loss_val.item()/(i+1):.5} ACC: {correct/total:.5}', end=f"{' '*100}\r")
   
                pbar.update( len(y) )

            del terms_idx, docs_offsets, y
            del y_pred
            
            loss_val   = loss_val/(i+1)
            
            old_loss_train = (1.-eps)*old_loss_train + eps*loss_train
            
            #prop_val   = loss_val.cpu() / (old_loss_train+optimizer.param_groups[0]['lr'])
            prop_val   = loss_val.cpu() / old_loss_train
            ## if loss_train < loss_val, prop_val > 1 --  Overfitting
            ## if loss_train > loss_val, prop_val < 1 -- Underfitting
            
            prop_learn = old_loss_train / loss_train
            ## if loss_train < old_loss_train, prop_learn > 1 -- Aprendendo
            ## if loss_train > old_loss_train, prop_learn < 1 -- Desaprendendo
            
            #prop = (2.*prop_val*prop_learn)/(prop_val+prop_learn)
            prop = np.sqrt(prop_val*prop_learn)
            print()
            scheduler.step(loss_val)
            if scheduler.num_bad_epochs == (scheduler.patience-1):
                ab.drop_ *= prop
                ab.drop_ = max(0.05, min(ab.drop_, 0.95))
                print(f"Set dropout to {ab.drop_:.4} (prop={prop_val:.4},{prop_learn:.4}={prop:.4})")
                
            #scheduler.step()

            if best-loss_val > 0.0001 :
                best = loss_val.item()
                counter = 1
                print(f'New Best Val loss: {best:.5}', end=f"{' '*100}\n")
                best_model = copy.deepcopy(ab).to('cpu')
            elif counter > max_epochs:
                print()
                print(f'Best Val loss: {best:.5}', end=f"{' '*100}\n")
                break
            else:
                counter += 1
            del pred_docs, loss2

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.5128/2.0151 ACC: 0.67786                                                                                                    
Val loss: 1.4275 ACC: 0.74484                                                                                                    
New Best Val loss: 1.4275                                                                                                    


Epoch 2:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.3513/1.1783 ACC: 0.82191                                                                                                    
Val loss: 1.4001 ACC: 0.77643                                                                                                    
New Best Val loss: 1.4001                                                                                                    


Epoch 3:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.2875/1.1655 ACC: 0.88799                                                                                                    
Val loss: 1.3903 ACC: 0.78493                                                                                                    
New Best Val loss: 1.3903                                                                                                    


Epoch 4:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.2602/1.2088 ACC: 0.90783                                                                                                    
Val loss: 1.398 ACC: 0.77157                                                                                                     


Epoch 5:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.2425/1.1707 ACC: 0.92599                                                                                                    
Val loss: 1.3848 ACC: 0.79465                                                                                                    
New Best Val loss: 1.3848                                                                                                    


Epoch 6:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.2241/1.1656 ACC: 0.94384                                                                                                    
Val loss: 1.3929 ACC: 0.77764                                                                                                    


Epoch 7:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.2163/1.1655 ACC: 0.95025                                                                                                    
Val loss: 1.3892 ACC: 0.78493                                                                                                    


Epoch 8:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.2086/1.1655 ACC: 0.95864                                                                                                    
Val loss: 1.3969 ACC: 0.77035                                                                                                    


Epoch 9:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1989/1.1654 ACC: 0.96826                                                                                                    
Val loss: 1.3932 ACC: 0.77764                                                                                                    


Epoch 10:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1943/1.1654 ACC: 0.97192                                                                                                    
Val loss: 1.3995 ACC: 0.77157                                                                                                    


Epoch 11:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1915/1.1654 ACC: 0.97497                                                                                                    
Val loss: 1.3996 ACC: 0.76671                                                                                                    


Epoch 12:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1902/1.1654 ACC: 0.97543                                                                                                    
Val loss: 1.4038 ACC: 0.76306                                                                                                    


Epoch 13:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1901/1.1661 ACC: 0.97619                                                                                                    
Val loss: 1.4086 ACC: 0.75942                                                                                                    


Epoch 14:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1888/1.2061 ACC: 0.97711                                                                                                    
Val loss: 1.4084 ACC: 0.75942                                                                                                    
Set dropout to 0.5442 (prop=1.184,1.001=1.088)


Epoch 15:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1892/1.1654 ACC: 0.9765                                                                                                     
Val loss: 1.4075 ACC: 0.76306                                                                                                    


Epoch 16:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1887/1.1654 ACC: 0.97711                                                                                                    
Val loss: 1.4044 ACC: 0.76671                                                                                                    
Epoch    16: reducing learning rate of group 0 to 5.7000e-03.


Epoch 17:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1873/1.1654 ACC: 0.97848                                                                                                    
Val loss: 1.4035 ACC: 0.76549                                                                                                    


Epoch 18:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1876/1.1654 ACC: 0.97787                                                                                                    
Val loss: 1.4111 ACC: 0.75577                                                                                                    


Epoch 19:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1862/1.1654 ACC: 0.9794                                                                                                     
Val loss: 1.4069 ACC: 0.7582                                                                                                     


Epoch 20:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1866/1.1654 ACC: 0.97909                                                                                                    
Val loss: 1.4079 ACC: 0.75334                                                                                                    


Epoch 21:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1863/1.1654 ACC: 0.97909                                                                                                    
Val loss: 1.4032 ACC: 0.76549                                                                                                    


Epoch 22:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1856/1.1654 ACC: 0.98001                                                                                                    
Val loss: 1.4097 ACC: 0.75334                                                                                                    


Epoch 23:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1853/1.1654 ACC: 0.98047                                                                                                    
Val loss: 1.4066 ACC: 0.7582                                                                                                     


Epoch 24:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1852/1.1654 ACC: 0.98047                                                                                                    
Val loss: 1.4065 ACC: 0.75577                                                                                                    


Epoch 25:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1852/1.1654 ACC: 0.98047                                                                                                    
Val loss: 1.4028 ACC: 0.76549                                                                                                    
Set dropout to 0.5921 (prop=1.184,1.0=1.088)


Epoch 26:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1861/1.1654 ACC: 0.97955                                                                                                    
Val loss: 1.4056 ACC: 0.76063                                                                                                    


Epoch 27:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1845/1.1654 ACC: 0.98108                                                                                                    
Val loss: 1.4082 ACC: 0.75699                                                                                                    
Epoch    27: reducing learning rate of group 0 to 5.4150e-03.


Epoch 28:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1857/1.1654 ACC: 0.98001                                                                                                    
Val loss: 1.4054 ACC: 0.75334                                                                                                    


Epoch 29:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1837/1.1654 ACC: 0.98199                                                                                                    
Val loss: 1.4081 ACC: 0.75334                                                                                                    


Epoch 30:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.185/1.1654 ACC: 0.98031                                                                                                     
Val loss: 1.4117 ACC: 0.74605                                                                                                    


Epoch 31:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1835/1.1654 ACC: 0.98199                                                                                                    
Val loss: 1.4184 ACC: 0.73755                                                                                                    


Epoch 32:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1844/2.1654 ACC: 0.9823                                                                                                     
Val loss: 1.4085 ACC: 0.7497                                                                                                     


Epoch 33:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1835/1.1654 ACC: 0.98199                                                                                                    
Val loss: 1.4144 ACC: 0.74362                                                                                                    


Epoch 34:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1835/1.1654 ACC: 0.98199                                                                                                    
Val loss: 1.4121 ACC: 0.74848                                                                                                    


Epoch 35:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1826/1.1654 ACC: 0.98321                                                                                                    
Val loss: 1.4186 ACC: 0.74727                                                                                                    


Epoch 36:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1825/1.1654 ACC: 0.98306                                                                                                    
Val loss: 1.4157 ACC: 0.74605                                                                                                    
Set dropout to 0.6478 (prop=1.197,1.0=1.094)


Epoch 37:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1839/1.1654 ACC: 0.98184                                                                                                    
Val loss: 1.4127 ACC: 0.75334                                                                                                    


Epoch 38:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1838/1.1654 ACC: 0.98199                                                                                                    
Val loss: 1.4134 ACC: 0.75091                                                                                                    
Epoch    38: reducing learning rate of group 0 to 5.1442e-03.


Epoch 39:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1834/1.1654 ACC: 0.9823                                                                                                     
Val loss: 1.4151 ACC: 0.75213                                                                                                    


Epoch 40:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1835/1.1654 ACC: 0.98245                                                                                                    
Val loss: 1.4133 ACC: 0.75091                                                                                                    


Epoch 41:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1825/1.1654 ACC: 0.98306                                                                                                    
Val loss: 1.4158 ACC: 0.7497                                                                                                     


Epoch 42:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1829/1.1654 ACC: 0.98291                                                                                                    
Val loss: 1.4218 ACC: 0.74362                                                                                                    


Epoch 43:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1822/1.1654 ACC: 0.98337                                                                                                    
Val loss: 1.4213 ACC: 0.74241                                                                                                    


Epoch 44:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1829/1.1655 ACC: 0.9826                                                                                                     
Val loss: 1.418 ACC: 0.7497                                                                                                      


Epoch 45:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1816/1.1654 ACC: 0.98382                                                                                                    
Val loss: 1.4167 ACC: 0.75456                                                                                                    


Epoch 46:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1821/1.1654 ACC: 0.98352                                                                                                    
Val loss: 1.421 ACC: 0.7497                                                                                                      


Epoch 47:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1816/1.1654 ACC: 0.98413                                                                                                    
Val loss: 1.4267 ACC: 0.73998                                                                                                    
Set dropout to 0.7118 (prop=1.207,1.0=1.099)


Epoch 48:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1856/1.1654 ACC: 0.98016                                                                                                    
Val loss: 1.4214 ACC: 0.7497                                                                                                     


Epoch 49:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1844/1.1654 ACC: 0.98215                                                                                                    
Val loss: 1.423 ACC: 0.7497                                                                                                      
Epoch    49: reducing learning rate of group 0 to 4.8870e-03.


Epoch 50:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1841/1.1655 ACC: 0.98199                                                                                                    
Val loss: 1.4248 ACC: 0.74362                                                                                                    


Epoch 51:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1851/1.1654 ACC: 0.98062                                                                                                    
Val loss: 1.4254 ACC: 0.73998                                                                                                    


Epoch 52:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1879/1.1754 ACC: 0.97787                                                                                                    
Val loss: 1.4248 ACC: 0.73755                                                                                                    


Epoch 53:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1848/1.1654 ACC: 0.98123                                                                                                    
Val loss: 1.43 ACC: 0.73512                                                                                                      


Epoch 54:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1855/1.1654 ACC: 0.98108                                                                                                    
Val loss: 1.4313 ACC: 0.73269                                                                                                    


Epoch 55:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.1865/1.1654 ACC: 0.9794                                                                                                     
Val loss: 1.4273 ACC: 0.73633                                                                                                    


Epoch 56:   0%|          | 0/7376 [00:00<?, ?it/s]

Train loss: 1.187/1.1654 ACC: 0.97894                                                                                                     
Val loss: 1.4315 ACC: 0.73269                                                                                                    

Best Val loss: 1.3848                                                                                                    


In [18]:
terms_idx.shape, docs_offsets.shape

NameError: name 'terms_idx' is not defined

In [19]:
ab = copy.deepcopy(best_model).to(device)
loss_total = 0
correct_t = 0
total_t = 0
dl_test = DataLoader(list(zip(fold.X_test, y_test)), batch_size=batch_size,
                         shuffle=False, collate_fn=collate_train, num_workers=2)
for i, (terms_idx_t, docs_offsets_t, y_t) in enumerate(dl_test):
    terms_idx_t    = terms_idx_t.to( device )
    docs_offsets_t = docs_offsets_t.to( device )
    y_t            = y_t.to( device )

    pred_docs_t,weigths = ab( terms_idx_t, docs_offsets_t )
    pred_docs_t = F.softmax(pred_docs_t)

    y_pred_t    = pred_docs_t.argmax(axis=1)
    correct_t  += (y_pred_t == y_t).sum().item()
    total_t    += len(y_t)
    loss_total += loss_func_cel(pred_docs_t, y_t)

print(f'Test loss: {loss_total.item()/(i+1):.5} ACC: {correct_t/total_t:.5}', end=f"{' '*100}\r")

Test loss: 1.3754 ACC: 0.78979                                                                                                    

In [None]:
"""
acm ####################################################################################
Train loss: 1.6009/1.6089 ACC: 0.94886                                                                                                    
Val loss: 1.7718 ACC: 0.77475                                                                                                    
New Best Val loss: 1.7718                                                                                                    
Test loss: 1.7678 ACC: 0.78557  79.92

Train loss: 1.7209/1.6095 ACC: 0.82338                                                                                                    
Val loss: 1.7595 ACC: 0.78236                                                                                                    
New Best Val loss: 1.7595                                                                                             
Test loss: 1.7585 ACC: 0.78557                                                                                                    

20ng ####################################################################################
Train loss: 2.0907/2.0787 ACC: 0.98845                                                                                                    
Val loss: 2.1869 ACC: 0.90803                                                                                                    
New Best Val loss: 2.1869                                                                                                    
Test loss: 2.178 ACC: 0.91068   92.65

reut ####################################################################################
Train loss: 3.7735/3.5191 ACC: 0.74734                                                                                                    
Val loss: 3.8554 ACC: 0.6763                                                                                                    
New Best Val loss: 3.8554                                                                                                    
Test loss: 3.8493 ACC: 0.6837  72.67

Train loss: 3.7489/3.8717 ACC: 0.77265                                                                                                    
Val loss: 3.8084 ACC: 0.71037                                                                                                    
Adjusting learning rate of group 0 to 5.1046e-03.
New Best Val loss: 3.8084 

webkb ####################################################################################
Train loss: 1.2228/1.2037 ACC: 0.9504                                                                                                     
Val loss: 1.3787 ACC: 0.80316                                                                                                    
New Best Val loss: 1.3787                                                                                                    
Test loss: 1.3857 ACC: 0.78858   81.53

"""

In [None]:
class AttentionBag(nn.Module):
    def __init__(self, vocab_size, hiddens, nclass, drop=.5, initrange=.5):
        super(AttentionBag, self).__init__()
        self.hiddens    = hiddens
        self.mask       = Mask()
        self.dt_emb     = nn.Embedding(vocab_size, hiddens)
        self.dt_dir_map = nn.Linear(hiddens, hiddens)
        self.drop       = nn.Dropout(drop)
        self.ma_term    = nn.MultiheadAttention(hiddens, 1)
        self.fc         = nn.Linear(hiddens, nclass)
        self.initrange  = initrange 
        self.init_weights()
    def forward(self, terms_idx, docs_offsets):
        n = terms_idx.shape[0]
        batch_size = docs_offsets.shape[0]
        
        k         = [ terms_idx[ docs_offsets[i-1]:docs_offsets[i] ] for i in range(1, batch_size) ]
        k.append( terms_idx[ docs_offsets[-1]: ] )
        x_packed  = pad_sequence(k, batch_first=True, padding_value=0)

        bx_packed = x_packed == 0
        pad_mask  = bx_packed.logical_not()
        pad_mask  = pad_mask.view(*bx_packed.shape, 1)
        pad_mask  = pad_mask.logical_and(pad_mask.transpose(1, 2))
        
        dt_h      = self.dt_emb( x_packed )
        dt_h      = self.drop(dt_h)
        dir_dt_h  = self.dt_dir_map( dt_h )

        weights = torch.bmm( dt_h, dir_dt_h.transpose( 1, 2 ) )
        weights = self.mask(weights)
        
        weights_disc = (weights * pad_mask)
        weights_disc = weights_disc.sum(axis=1)
        weights_disc = F.softmax(weights_disc, dim=1)
        weights_disc = weights_disc.view( *weights_disc.shape, 1 )
        
        attn_mask = weights != 0
        attn_mask = attn_mask.logical_and( pad_mask ).logical_not()
        
        dt_h     = dt_h.transpose(0,1)
        dir_dt_h = dir_dt_h.transpose(0,1)
        docs_att, weigths_att = self.ma_term( dt_h, dir_dt_h, dt_h,
                                  key_padding_mask=bx_packed, 
                                  attn_mask=attn_mask )

        weigths_att = torch.where(torch.isnan(weigths_att), torch.zeros_like(weigths_att), weigths_att)
        weigths_att = (weigths_att * pad_mask)
        weigths_att = weigths_att.sum(axis=1)
        weigths_att = F.softmax(weigths_att, dim=1)
        weigths_att = weigths_att.view( *weigths_att.shape, 1 )
        
        weigths = weights_disc + weigths_att

        docs_att = docs_att.transpose(0,1)
        docs_att = torch.where(torch.isnan(docs_att), torch.zeros_like(docs_att), docs_att)
        
        docs_h = docs_att * weigths
        docs_h = docs_h.sum(axis=1)
        docs_h = docs_h / bx_packed.logical_not().sum(dim=1).view(batch_size, 1)
        docs_h = torch.where(torch.isnan(docs_h), torch.zeros_like(docs_h), docs_h)
        docs_h = self.drop(docs_h)
        return self.fc(docs_h)
    
    def init_weights(self):
        self.dt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.dt_dir_map.weight.data.uniform_(-self.initrange, self.initrange)
        self.fc.weight.data.uniform_(-self.initrange, self.initrange)
        self.ma_term.in_proj_weight.data.uniform_(-self.initrange, self.initrange)

In [None]:
y_pred_t      = pred_docs_t.argmax(axis=1)
correct_t     = (y_pred_t == y_t).sum().item()
total_t       = len(y_t)
correct_t/total_t

In [None]:
docs_offsets_t

In [None]:
terms_idx_t[:docs_offsets_t[batch_size]]

In [None]:
batch_off_test  = docs_offsets_t[batch_size]
batch_tidx_test = terms_idx_t[:batch_off_test]
h_terms_test    = sc.tt_emb( batch_tidx_test )
dirh_terms_test = sc.tt_dir_map( h_terms_test )

W = torch.matmul( h_terms_test, dirh_terms_test.T )
W = F.leaky_relu( W, negative_slope=sc.mask.negative_slope)
W = F.sigmoid(W)
W

In [None]:
k = [ batch_tidx_test[ docs_offsets_t[i-1]:docs_offsets_t[i] ] for i in range(1, batch_size) ]
k.append( batch_tidx_test[ docs_offsets_t[batch_size-1]:docs_offsets_t[batch_size] ] )
x_packed = pad_sequence(k, batch_first=True, padding_value=0)
tt_emb = sc.tt_emb( x_packed )
len(k)

In [None]:
x_packed = pad_sequence(k, batch_first=True, padding_value=0)
tt_emb = sc.tt_emb( x_packed )

In [None]:
tt_emb.transpose(0,1)

In [None]:
a = [terms_idx, terms_idx]
torch.stack(a)

In [None]:
F.softmax(pred_docs_t).argmax(axis=1)

In [None]:
y_val

In [None]:
(y_t == F.softmax(pred_docs_t).argmax(axis=1)).sum().item()/y_t.shape[0]

In [None]:
terms_idx_t, docs_offsets_t

In [None]:
shifts = sc._get_shift_(docs_offsets_t, terms_idx_t.shape[0])

In [None]:
zipado = zip(docs_offsets_t, shifts)
#next(zipado)
start,size = next(zipado)

In [None]:
w = sc.tt_emb( terms_idx_t[start:start+size] )
w1 = sc.tt_dir_map( w )
w = torch.matmul( w, w1.T )
w = F.leaky_relu( w, negative_slope=sc.negative_slope)
w = F.sigmoid(w)
#w = F.tanh(w)
#w = F.relu(w)
#w = w.mean(axis=1)
#w = F.softmax(w)
w,w1

In [None]:
w.round().sum() / (w.shape[0]*w.shape[1])

In [None]:
inv_mapper = { v:k for (k,v) in tokenizer.node_mapper.items() }

In [None]:
terms_idx[start:start+size]

In [None]:
fold.X_val[0]

In [None]:
bla = w.mean(axis=1)
bla = F.softmax(bla)
#bla = bla/torch.clamp(bla.sum(), 0.0001)
bla

In [None]:
[ (i, tid.item(),inv_mapper[tid.item()], wei.item()) for i, (tid, wei) in enumerate(zip(terms_idx[start:start+size], bla)) ]

In [None]:
w

In [None]:
w.mean(axis=1)

In [None]:
w.shape

In [None]:
norm = nn.BatchNorm1d(num_features=1).to(device)

bla2 = norm(bla.view(-1, 1)).squeeze()
bla2 = F.sigmoid(bla2)
bla2

In [None]:
1

In [None]:
ma = nn.MultiheadAttention(300, 300).to(device)
ma

In [None]:
torch.__version__

In [None]:
a,b = w.shape
w_ = w.view(a,1,b)
w1_ = w1.view(a,1,b)

attn_output = ma(w_, w1_, w_, need_weights=False)

attn_output.view(a,b)
attn_output_weights.view(a,a)

In [None]:
attn_output.view(a,b).shape

In [None]:
attn_output_weights.view(a,a)

In [None]:
F.softmax(torch.Tensor([[0.0718, 0.0716, 0.0712, 0.0714, 0.0721, 0.0710, 0.0712, 0.0714, 0.0710,
         0.0719, 0.0711, 0.0712, 0.0718, 0.0714],
        [0.0722, 0.0709, 0.0710, 0.0709, 0.0728, 0.0723, 0.0709, 0.0709, 0.0719,
         0.0712, 0.0709, 0.0707, 0.0725, 0.0707],
        [0.0711, 0.0715, 0.0710, 0.0713, 0.0710, 0.0712, 0.0718, 0.0713, 0.0709,
         0.0725, 0.0716, 0.0719, 0.0710, 0.0719],
        [0.0721, 0.0717, 0.0714, 0.0713, 0.0717, 0.0714, 0.0711, 0.0710, 0.0713,
         0.0717, 0.0710, 0.0712, 0.0720, 0.0711],
        [0.0722, 0.0711, 0.0710, 0.0710, 0.0737, 0.0716, 0.0709, 0.0705, 0.0714,
         0.0719, 0.0707, 0.0707, 0.0731, 0.0702],
        [0.0714, 0.0716, 0.0708, 0.0711, 0.0718, 0.0713, 0.0712, 0.0717, 0.0712,
         0.0724, 0.0713, 0.0712, 0.0716, 0.0714],
        [0.0712, 0.0714, 0.0713, 0.0713, 0.0722, 0.0716, 0.0709, 0.0714, 0.0714,
         0.0717, 0.0713, 0.0713, 0.0716, 0.0713],
        [0.0716, 0.0707, 0.0712, 0.0714, 0.0717, 0.0715, 0.0714, 0.0715, 0.0712,
         0.0721, 0.0711, 0.0714, 0.0717, 0.0715],
        [0.0717, 0.0713, 0.0708, 0.0710, 0.0725, 0.0717, 0.0714, 0.0709, 0.0712,
         0.0712, 0.0712, 0.0717, 0.0720, 0.0714],
        [0.0709, 0.0712, 0.0713, 0.0712, 0.0713, 0.0713, 0.0714, 0.0719, 0.0712,
         0.0724, 0.0715, 0.0715, 0.0717, 0.0713],
        [0.0715, 0.0716, 0.0712, 0.0708, 0.0725, 0.0717, 0.0712, 0.0714, 0.0714,
         0.0717, 0.0713, 0.0706, 0.0718, 0.0712],
        [0.0726, 0.0711, 0.0713, 0.0706, 0.0737, 0.0721, 0.0709, 0.0707, 0.0714,
         0.0708, 0.0706, 0.0705, 0.0730, 0.0707],
        [0.0718, 0.0711, 0.0714, 0.0715, 0.0725, 0.0707, 0.0707, 0.0711, 0.0712,
         0.0728, 0.0711, 0.0713, 0.0719, 0.0709],
        [0.0712, 0.0716, 0.0713, 0.0712, 0.0717, 0.0707, 0.0708, 0.0721, 0.0709,
         0.0728, 0.0713, 0.0716, 0.0711, 0.0715]]).sum(axis=0))

In [None]:
class NotTooSimpleClassifier(nn.Module):
    def __init__(self, vocab_size, hidden_l, nclass, dropout1=0.1, dropout2=0.1, negative_slope=99,
                 initrange = 0.5, scale_grad_by_freq=False, device='cuda:0'):
        super(NotTooSimpleClassifier, self).__init__()
        
        self.dt_emb = nn.Embedding(vocab_size, hidden_l, scale_grad_by_freq=scale_grad_by_freq)
        self.tt_emb = nn.Embedding(vocab_size, hidden_l, scale_grad_by_freq=scale_grad_by_freq)
        
        self.undirected_map = nn.Linear(hidden_l, hidden_l)
        
        self.fc = nn.Linear(hidden_l, nclass)
        self.drop1 = nn.Dropout(dropout1)
        self.drop2 = nn.Dropout(dropout2)
        
        self.norm = nn.BatchNorm1d(1)
        
        self.initrange = initrange
        self.nclass = nclass
        self.negative_slope = negative_slope
        
        self.init_weights()
        
        #self.labls_emb = nn.Embedding(graph_builder.n_class, 300)
    
    def forward(self, terms_idxs, docs_offsets):
        n = terms_idxs.shape[0]
        weights = []
        shifts = self._get_shift_(docs_offsets, n)
        
        terms_h1 = self.tt_emb(terms_idxs)
        terms_h1 = self.drop1(terms_h1)
        
        terms_h2 = self.undirected_map( terms_h1 )
        #terms_h2 = self.drop1( terms_h2 )
        for start,size in zip(docs_offsets, shifts):
            w  = terms_h1[start:start+size]
            w1 = terms_h2[start:start+size]
            w = torch.matmul( w, w1.T )
            w = F.leaky_relu( w, negative_slope=self.negative_slope)
            w = F.sigmoid(w-5.5)
            w = w.mean(axis=1)
            w = F.softmax(w)
            #w = w / torch.clamp(w.sum(), 0.0001)
            weights.append( w )
        
        weights = torch.cat(weights)
        #weights = self.norm(weights.view(-1, 1)).squeeze()
        #weights = F.sigmoid(weights)
        
        h_docs  = F.embedding_bag(self.dt_emb.weight, terms_idxs, docs_offsets, per_sample_weights=weights, mode='sum')
        h_docs = self.drop2( h_docs )
        pred_docs = self.fc( h_docs )
        return pred_docs

    def init_weights(self):
        self.dt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.tt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        
    def _get_shift_(self, offsets, lenght):
        shifts = offsets[1:] - offsets[:-1]
        last = torch.LongTensor([lenght - offsets[-1]]).to( offsets.device )
        return torch.cat([shifts, last])