In [1]:
from TGA.tokenizer import Tokenizer
from TGA.utils import Dataset
from torch.nn import Transformer
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
tokenizer = Tokenizer(stopwordsSet=None, vocab_max_size=200000, k=256, ngram_range=(1,1))
dataset = Dataset('/home/mangaravite/Documentos/datasets/classification/datasets/acm/')
fold = next( dataset.get_fold_instances(10, with_val=True) )
tokenizer.fit( fold.X_train, fold.y_train )

  t = np.log2(fc/(c * f))
  t = np.log2(fnc/((1-c)*f))


Tokenizer(k=256, stopwordsSet=[], vocab_max_size=200000)

In [3]:
tidf,tfs,dfs = tokenizer.transform(fold.X_val)

In [4]:
def collate(X):
    tidf, tfs, dfs = tokenizer.transform(X)
    docs_tidf = pad_sequence(list(map(torch.LongTensor, tidf)), batch_first=True, padding_value=0)
    
    docs_tfs  = pad_sequence(list(map(torch.LongTensor, tfs)), batch_first=True, padding_value=0)
    docs_tfs = torch.LongTensor(torch.log2(docs_tfs+1).round().long())
    
    docs_dfs  = pad_sequence(list(map(torch.LongTensor, dfs)), batch_first=True, padding_value=0)
    docs_dfs = torch.LongTensor(torch.log2(docs_dfs+1).round().long())
    return docs_tidf, docs_tfs, docs_dfs
def collate_train(param):
    X, y = zip(*param)
    return collate(X), torch.LongTensor(y)
def sim_matrix(a, b, eps=1e-8):
    """
    added eps for numerical stability
    """
    a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
    a_norm = a / torch.clamp(a_n, min=eps)
    b_norm = b / torch.clamp(b_n, min=eps)
    sim_mt = torch.bmm(a_norm, b_norm.transpose(1, 2))
    return torch.cos(sim_mt)

In [5]:
from TGA.masked_batch_norm import MaskedBatchNorm1d

In [15]:
class TFIDFClassifier(nn.Module):
    def __init__(self, vocab_size, hiddens, nclass, maxF=20, drop=.3 ,
                 negative_slope=99., mask_layer=True):
        super(TFIDFClassifier, self).__init__()
        
        self.hiddens        = hiddens
        self.maxF           = maxF
        self.negative_slope = negative_slope
        self.vocab_size     = vocab_size
        self.drop_          = drop
        self.nclass         = nclass
        
        self.TF_emb         = nn.Embedding(maxF, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.DF_emb         = nn.Embedding(maxF, hiddens, scale_grad_by_freq=True, padding_idx=0)
        
        self.term_emb       = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.key_lin        = nn.Sequential(nn.Linear(hiddens, hiddens),
                                            nn.Tanh())
        self.query_lin      = nn.Sequential(nn.Linear(hiddens, hiddens),
                                            nn.Tanh())
        self.value_lin      = nn.Sequential(nn.Linear(hiddens, hiddens),
                                            nn.Tanh())
        
        self.fc             = nn.Sequential(nn.Linear(hiddens, hiddens),
                                            nn.ReLU(),
                                            nn.Linear(hiddens, nclass))
        
        
        self.latend_labels  = nn.Parameter( torch.rand(nclass, hiddens) )
        if mask_layer:
            self.mask_layers = self._build_mask(hiddens, nclass)
        else:
            self.mask_layers = 1.
        
        self.batchnorm = MaskedBatchNorm1d(1)
        self.gelu = nn.GELU()
        self.softmax = nn.Softmax(dim=1)
        
        self.init_weights()
    def _build_mask(self, hiddens, nclass):
        q = hiddens // nclass
        a = []
        list(map(a.extend, [ q*[vc+1] for vc in range(nclass) ]))
        a.extend((h-len(a))*[0])
        a = np.array(a)
        a = np.matrix([ a == (vc+1) for vc in range(nclass) ])
        a[:,-(hiddens-(nclass*q)):] = True
        
        return torch.Tensor(a)
    def forward(self, doc_tids, TFs, DFs, padding=None):
        
        if padding is None:
            padding = doc_tids == 0
            padding = padding.unsqueeze(2)
        
        TFs      = torch.clamp( TFs, max=self.maxF-1 )
        TFs_h    = self.TF_emb( TFs ) # [B, S, H]
        b_size, s_size, h_size = TFs_h.shape # Batch_size(B), Set_size(S), Hidden_size(H)
        doc_sizes = torch.logical_not(padding).sum(dim=1).view(b_size, 1)
        
        DFs      = torch.clamp( DFs, max=self.maxF-1 )
        DFs_h    = self.DF_emb(DFs) # [B, S, H]
        
        doc_tids = torch.clamp( doc_tids, max=self.vocab_size-1 )
        term_h   = self.term_emb(doc_tids)
        term_h   = term_h + TFs_h + DFs_h # [B, S, H]
        
        key_h      = self.key_lin(term_h)                  # [B, S, H]
        key_h   = key_h.reshape( b_size, 1, s_size, h_size )         # [B, 1, S, H]
        key_h   = F.dropout(key_h, self.drop_)
        
        query_h    = self.query_lin(term_h)                # [B, S, H]
        query_h = query_h.reshape( b_size, s_size, 1, h_size )       # [B, S, 1, H]
        query_h = F.dropout(query_h, self.drop_)
        
        coocc_h = (key_h + query_h) / 2
        
        value_h    = self.value_lin(term_h)                # [B, S, H]
        value_h    = F.dropout(value_h, self.drop_)
        
        
        padding2d = torch.logical_or(padding, padding.transpose(2,1))
        padding = padding2d.reshape(b_size, 1, s_size*s_size)
        padding = torch.logical_not(padding)
        
        # Depois posso tentar K-representações das Classes (L)
        L = self.latend_labels#.weight         # [L, H]
        L = L * self.mask_layers
        l_size, _ = L.shape
        L = L.unsqueeze(dim=0) # [1, L, H]
        L = L.repeat(b_size, 1, 1) #            [B, L, H]

        coocc_h = coocc_h.reshape( b_size, s_size*s_size, h_size  ) # [B, S*S, H]
        term_dom = sim_matrix( coocc_h, L ) # [B, S*S, L]
        coocc_h = coocc_h.reshape( b_size, s_size, s_size, h_size  ) # [B, S, S, H]
        
        p = term_dom.softmax(dim=2)                       # Convert to Distribution     [B, S*S, L]
        logp = torch.log(p)                               # Compute log
        cweight = 1./torch.sum(-p*logp, dim=2)                # Entropy of the Distribution [B, S*S]
        cweight = cweight.reshape(b_size, s_size, s_size)   # [ B, S, S ]
        cweight[padding2d] = -self.negative_slope          # zero padding # [ B, S, S ]
        cweight = cweight.reshape(b_size, 1, s_size*s_size) # [ B, 1, S*S ]
        
        cweight = self.batchnorm(cweight, input_mask=padding) # [ B, 1, S*S ]
        cweight = cweight.reshape(b_size, s_size, s_size) # [ B, S, S ]
        cweight = torch.exp(cweight)
        cweight = cweight / torch.clamp(cweight.sum(dim=1).unsqueeze(dim=1), min=0.00001)
        cweight[padding2d] = 0 # zero padding # [ B, S, S ]
        weight = cweight.sum(dim=2) # [ B, S ]
        weight = weight.unsqueeze(dim=2) # [ B, S, 1 ]
        
        value_h = (value_h * weight).sum(dim=1)# [ B, S, H ] Representação media dos documentos
        value_h = F.dropout(value_h, self.drop_)
        docs_pred = self.fc(value_h)
        
        return docs_pred, cweight # [ B, S ]
    
    def init_weights(self):
        
        return
        nn.init.xavier_normal_(self.DF_emb.weight.data)
        nn.init.xavier_normal_(self.TF_emb.weight.data)
        nn.init.xavier_normal_(self.key_emb.weight.data)
        nn.init.xavier_normal_(self.query_emb.weight.data)
        nn.init.xavier_normal_(self.fc.weight.data)

In [16]:
device = torch.device('cuda:0')
batch_size=16
num_workers=4
k = 128
nepochs = 100

In [17]:
tfidfc = TFIDFClassifier(vocab_size=tokenizer.vocab_size, hiddens=300, nclass=tokenizer.n_class).to( device )
tfidfc.mask_layers = tfidfc.mask_layers.to(device)

tokenizer.k = k
optimizer = optim.AdamW( tfidfc.parameters(), lr=5e-3, weight_decay=5e-3)
loss_func_cel = nn.CrossEntropyLoss().to( device )
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=.95,
                                                       patience=3, verbose=True)

TypeError: cannot assign 'torch.FloatTensor' as parameter 'latend_labels' (torch.nn.Parameter or None expected)

In [12]:
tfidfc._build_mask(300, 11).shape

torch.Size([11, 300])

In [13]:
dl_val = DataLoader(list(zip(fold.X_val, fold.y_val)), batch_size=batch_size,
                         shuffle=False, collate_fn=collate_train, num_workers=num_workers)
total=1
correct=1
for e in tqdm(range(nepochs), total=nepochs):
    dl_train = DataLoader(list(zip(fold.X_train, fold.y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate_train, num_workers=num_workers)
    loss_train  = 0.
    with tqdm(total=len(fold.y_train), smoothing=0., desc=f"Epoch {e+1}") as pbar:
        for i, (data, y) in enumerate(dl_train):
            doc_tids, TFs, DFs = data
            doc_tids = doc_tids.to( device )
            TFs      = TFs.to( device )
            DFs      = DFs.to( device )
            y        = y.to( device )
            
            pred_docs,weights = tfidfc( doc_tids, TFs, DFs )
            pred_docs = torch.softmax(pred_docs, dim=1)
            loss      = loss_func_cel(pred_docs, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_train += loss.item()
            total      += len(y)
            y_pred      = pred_docs.argmax(axis=1)
            correct    += (y_pred == y).sum().item()
            
            toprint  = f"Train loss: {loss_train/(i+1):.5}/{loss.item():.5} "
            toprint += f'ACC: {correct/total:.5} '
            
            print(toprint, end=f"{' '*100}\r")
            pbar.update( len(y) )
            
            #break
            del doc_tids, TFs
            del DFs, y, pred_docs
            del loss, y_pred
            
    break

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/19907 [00:00<?, ?it/s]

Train loss: 2.2797/2.2097 ACC: 0.26306                                                                                                     

In [14]:
weights

tensor([[[0.0164, 0.0164, 0.0164,  ..., 0.0000, 0.0000, 0.0000],
         [0.0164, 0.0164, 0.0164,  ..., 0.0000, 0.0000, 0.0000],
         [0.0164, 0.0164, 0.0164,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.0116, 0.0116, 0.0116,  ..., 0.0116, 0.0116, 0.0116],
         [0.0116, 0.0116, 0.0116,  ..., 0.0116, 0.0116, 0.0116],
         [0.0116, 0.0116, 0.0116,  ..., 0.0116, 0.0116, 0.0116],
         ...,
         [0.0116, 0.0116, 0.0116,  ..., 0.0116, 0.0116, 0.0116],
         [0.0116, 0.0116, 0.0116,  ..., 0.0116, 0.0116, 0.0116],
         [0.0116, 0.0116, 0.0116,  ..., 0.0116, 0.0116, 0.0116]],

        [[0.0166, 0.0166, 0.0166,  ..., 0.0000, 0.0000, 0.0000],
         [0.0166, 0.0166, 0.0166,  ..., 0.0000, 0.0000, 0.0000],
         [0.0166, 0.0166, 0.0166,  ..., 0.0000, 0.0000, 0.

In [None]:
h = 300
c = 11
q = h // c
a = []
list(map(a.extend, [ q*[vc+1] for vc in range(c) ]))
a.extend((h-len(a))*[0])
a = np.array(a)
a = np.matrix([ a == (vc+1) for vc in range(c) ])
a[:,-(h-(c*q)):] = True
a.shape

In [None]:
-(h-(c*q))