In [1]:
import warnings
warnings.filterwarnings('ignore')

from TGA.utils import Dataset

from tqdm.notebook import tqdm
from TGA.utils import preprocessor

from time import time
import numpy as np
from itertools import repeat
from collections import Counter
from segtok import tokenizer as tk

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

In [3]:
dataset = Dataset('/home/Documentos/datasets/classification/datasets/acm/')
fold = next(dataset.get_fold_instances(10, with_val=True))
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val'), 19907)

In [4]:
class Tokenizer(BaseEstimator, TransformerMixin):
    def __init__(self, mindf=2, stopwords='remove', model='list', lan='english', verbose=False):
        super(Tokenizer, self).__init__()
        self.mindf = mindf
        self.le = LabelEncoder()
        self.verbose = verbose
        self.stopwords = stopwords
        self.stopwordsSet = set(stop_words.ENGLISH_STOP_WORDS)
        self.lan = lan
        self.model = model
        self.analyzer = TfidfVectorizer(preprocessor=preprocessor).build_analyzer()
        #self.analyzer = tk.web_tokenizer
    
    def fit(self, X, y):
        self.N = len(X)
        self.le.fit( y )
        self.n_class = len(self.le.classes_)

        self.term_freqs = Counter()
        docs = map(self.analyzer, X)
        for doc_in_terms in tqdm(docs, total=self.N, disable=not self.verbose):
            doc_in_terms = list(map( self._filter_fit_, doc_in_terms ))
            self.term_freqs.update(list(set(doc_in_terms)))
        self.node_mapper      = {}
        self.term_freqs       = { term:v for (term,v) in self.term_freqs.items() if v >= self.mindf }    
        self.node_mapper      = { term:self._get_idx_(term) for term in self.term_freqs.keys() if self._isrel_(term) }
        self.node_mapper['<UNK>'] = len(self.node_mapper)
        self.vocab_size = len(self.node_mapper)
        
        return self
    def _isrel_(self, term):
        if self.stopwords == 'remove' and term in self.stopwordsSet:
            return False
        # put here your filter_functions
        return True
    def _get_idx_(self, term):
        # put here your idx_set_functions
        if self.stopwords == 'mark' and term in self.stopwordsSet:
            print('is stop', term)
            return self.node_mapper.setdefault('<STPW>', len(self.node_mapper))
        return self.node_mapper.setdefault(term, len(self.node_mapper))
    def _filter_transform_(self, term):
        if self.stopwords == 'mark' and term in self.stopwordsSet:
            return '<STPW>'
        if term not in self.node_mapper:
            return '<UNK>'
        return term
    def _filter_fit_(self, term):
        if self.stopwords == 'mark' and term in self.stopwordsSet:
            return '<STPW>'
        return term
    def _model_(self, doc):
        if self.model == 'set':
            return set(doc)
        return list(doc)
    def transform(self, X, verbose=None):
        verbose = verbose if verbose is not None else self.verbose
        n = len(X)
        doc_off = [0]
        terms_idx = []
        for i,doc_in_terms in tqdm(enumerate(map(self.analyzer, X)), total=n, disable=not verbose):
            doc_in_terms = filter( self._isrel_, doc_in_terms )
            doc_in_terms = map( self._filter_transform_, doc_in_terms )
            doc_in_terms = self._model_(doc_in_terms)
            doc_in_terms = [ self.node_mapper[tid] for tid in doc_in_terms ]
            if self.model == 'sorted':
                doc_in_terms = sorted(doc_in_terms)
            doc_off.append( len(doc_in_terms) )
            terms_idx.extend( doc_in_terms )
        return np.array( terms_idx ), np.array(doc_off)[:-1].cumsum()

In [5]:
tokenizer = Tokenizer(mindf=1, stopwords='keep', model='set', verbose=True)
tokenizer.fit(fold.X_train, fold.y_train)

HBox(children=(FloatProgress(value=0.0, max=19907.0), HTML(value='')))




Tokenizer(mindf=1, model='set', stopwords='keep', verbose=True)

In [6]:
y_train = tokenizer.le.transform( fold.y_train )
y_val   = tokenizer.le.transform( fold.y_val )

In [7]:
tokenizer.transform(fold.X_val[:2])

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




(array([   32, 12218,   194,  4523,   641,  1734,  2965, 49334,   240,
         1635,    32,   382,   957,   183,   511,  2152,    14,   148,
           34,   635,  2661,  2069,    38,    39,   394,  3046,  3075,
          965,   810,   120,  1383,    18,  2816,  3110,  7313,    19,
           47,   239,    50,  7125,   327,   131,    62,  7654,  1409,
           64,  5053,    24,   300,   104,  2459, 11006, 49334,   344,
          476,  8639, 11922,   305,   859, 15091,  4143,    84]),
 array([ 0, 10]))

In [8]:
tokenizer.node_mapper['<UNK>'], fold.X_val[0]

(49334,
 'how do computer science lecturers create modules? (poster) john traxler  \n')

In [9]:
def collate_train(param):
    X, y = zip(*param)
    terms_ids, docs_offsets = tokenizer.transform(X, verbose=False)
    return torch.LongTensor(terms_ids), torch.LongTensor(docs_offsets), torch.LongTensor(y)

In [52]:
class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, hidden_l, nclass, dropout1=0.1, dropout2=0.1, initrange = 0.5, device='cuda:0'):
        super(SimpleClassifier, self).__init__()
        
        self.doc_terms_emb = nn.EmbeddingBag(vocab_size, hidden_l, mode='mean', scale_grad_by_freq=False)
        
        self.fc = nn.Linear(hidden_l, nclass)
        self.drop1 = nn.Dropout(dropout1)
        self.drop2 = nn.Dropout(dropout2)
        
        self.initrange = initrange
        self.nclass = nclass
        
        self.init_weights()
        
        #self.labls_emb = nn.Embedding(graph_builder.n_class, 300)
    
    def forward(self, terms_idxs, docs_offsets):
        h_docs = self.doc_terms_emb( terms_idxs, docs_offsets )
        h_docs = self.drop( h_docs )
        pred_docs = self.fc( h_docs )
        return pred_docs

    def init_weights(self):
        self.doc_terms_emb.weight.data.uniform_(-self.initrange, self.initrange)
        
class NotTooSimpleClassifier(nn.Module):
    def __init__(self, vocab_size, hidden_l, nclass, dropout1=0.1, dropout2=0.1, negative_slope=99,
                 initrange = 0.5, scale_grad_by_freq=False, device='cuda:0'):
        super(NotTooSimpleClassifier, self).__init__()
        
        self.dt_emb = nn.Embedding(vocab_size, hidden_l, scale_grad_by_freq=scale_grad_by_freq)
        self.tt_emb = nn.Embedding(vocab_size, hidden_l, scale_grad_by_freq=scale_grad_by_freq)
        
        self.undirected_map = nn.Linear(hidden_l, hidden_l)
        
        self.fc = nn.Linear(hidden_l, nclass)
        self.drop1 = nn.Dropout(dropout1)
        self.drop2 = nn.Dropout(dropout2)
        
        self.norm = nn.BatchNorm1d(hidden_l)
        
        self.initrange = initrange
        self.nclass = nclass
        self.negative_slope = negative_slope
        
        self.init_weights()
        
        #self.labls_emb = nn.Embedding(graph_builder.n_class, 300)
    
    def forward(self, terms_idxs, docs_offsets):
        n = terms_idxs.shape[0]
        weights = []
        shifts = self._get_shift_(docs_offsets, n)
        
        terms_h1 = self.tt_emb(terms_idx)
        terms_h1 = self.drop1(terms_h1)
        
        terms_h2 = self.undirected_map( terms_h1 )
        terms_h2 = self.drop1( terms_h2 )
        for start,size in zip(docs_offsets, shifts):
            w  = terms_h1[start:start+size]
            w1 = terms_h2[start:start+size]
            w = torch.matmul( w, w1.T )
            w = F.leaky_relu( w, negative_slope=self.negative_slope)
            w = F.sigmoid(w)
            w = w.mean(axis=1)
            w = F.softmax(w)
            #w = w / torch.clamp(w.sum(), 0.0001)
            weights.append( w )
        
        weights = torch.cat(weights)
        
        h_docs  = F.embedding_bag(self.dt_emb.weight, terms_idxs, docs_offsets, per_sample_weights=weights, mode='sum')
        h_docs = self.drop2( h_docs )
        pred_docs = self.fc( h_docs )
        return pred_docs

    def init_weights(self):
        self.dt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.tt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        
    def _get_shift_(self, offsets, lenght):
        shifts = offsets[1:] - offsets[:-1]
        last = torch.LongTensor([lenght - offsets[-1]]).to( offsets.device )
        return torch.cat([shifts, last])

In [53]:
nepochs = 50
max_epochs = 5
drop1=0.8
drop2=0.8
device = torch.device('cuda:0')
batch_size = 32

In [54]:
#sc = SimpleClassifier(tokenizer.vocab_size, 300, tokenizer.n_class, dropout=drop).to( device )
sc = NotTooSimpleClassifier(tokenizer.vocab_size, 300, tokenizer.n_class, negative_slope=1000,
                            dropout1=drop1, dropout2=drop2).to( device )

optimizer = optim.AdamW( sc.parameters(), lr=5e-3, weight_decay=5e-3)
loss_func_cel = nn.CrossEntropyLoss().to( device )
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [None]:
best = 99999.
counter = 1
for e in tqdm(range(nepochs), total=nepochs):
    dl_train = DataLoader(list(zip(fold.X_train, y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate_train, num_workers=5)
    dl_val = DataLoader(list(zip(fold.X_val, y_val)), batch_size=len(y_val),
                             shuffle=False, collate_fn=collate_train, num_workers=5)
    total_loss  = 0.
    with tqdm(total=len(y_train)+len(y_val), smoothing=0., desc=f"Epoch {e+1}") as pbar:
        total = 0
        correct  = 0
        sc.train()
        for i, (terms_idx, docs_offsets, y) in enumerate(dl_train):
            terms_idx    = terms_idx.to( device )
            docs_offsets = docs_offsets.to( device )
            y            = y.to( device )
            
            pred_docs = sc( terms_idx, docs_offsets)
            pred_docs = F.softmax(pred_docs)
            loss = loss_func_cel(pred_docs, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            total      += len(y)
            y_pred      = pred_docs.argmax(axis=1)
            correct    += (y_pred == y).sum().item()
            
            toprint  = f"Train loss: {total_loss/(i+1):.5}/{loss.item():.5} "
            toprint += f'ACC: {correct/total:.5}'
            
            print(toprint, end=f"{' '*100}\r")
            
            pbar.update( len(y) )
            
        scheduler.step()
        total = 0
        correct  = 0
        sc.eval()
        print()
        for i, (terms_idx, docs_offsets, y) in enumerate(dl_val):
            terms_idx    = terms_idx.to( device )
            docs_offsets = docs_offsets.to( device )
            y            = y.to( device )
            
            pred_docs = sc( terms_idx, docs_offsets )
            pred_docs = F.softmax(pred_docs)
            
            y_pred      = pred_docs.argmax(axis=1)
            correct    += (y_pred == y).sum().item()
            total      += len(y)
            loss2 = loss_func_cel(pred_docs, y)
            
            print(f'Val loss: {loss2.item():.5} ACC: {correct/total:.5}', end=f"{' '*100}\r")
            
            pbar.update( len(y) )
        if best-loss2.item() > 0.001 :
            best = loss2.item()
            counter = 1
            print()
            print(f'New Best Val loss: {best:.5}', end=f"{' '*100}\n")
        elif counter > max_epochs:
            print()
            print(f'Best Val loss: {best:.5}', end=f"{' '*100}\n")
            break
        else:
            counter += 1

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=22402.0, style=ProgressStyle(description_wi…

Train loss: 2.1723/2.0135 ACC: 0.40167                                                                                                     

In [None]:
docs_offsets

In [None]:
"""
acm AdamW
Train loss: 1.671/1.826 ACC: 0.8789
Val loss: 1.775 ACC: 0.78
New Best Val loss: 1.775

20ng AdamW
Train loss: 2.094/2.078 ACC: 0.9844
Val loss: 2.186 ACC: 0.9017
New Best Val loss: 2.186

reut AdamW
Train loss: 3.747/3.852 ACC: 0.776                                                                                                     
Val loss: 3.821 ACC: 0.7096                                                                                                    
New Best Val loss: 3.821

webkb
Train loss: 1.2312/1.1705 ACC: 0.94262                                                                                                    
Val loss: 1.3784 ACC: 0.79587                                                                                                    
New Best Val loss: 1.3784  

"""

In [None]:
F.softmax(pred_docs).argmax(axis=1)

In [None]:
y_val

In [None]:
(y == F.softmax(pred_docs).argmax(axis=1)).sum().item()/y.shape[0]

In [None]:
terms_idx, docs_offsets

In [None]:
shifts = sc._get_shift_(docs_offsets, terms_idx.shape[0])

In [None]:
zipado = zip(docs_offsets, shifts)
next(zipado)
start,size = next(zipado)

In [None]:
w = sc.tt_emb( terms_idx[start:start+size] )
w1 = sc.undirected_map( w )
w = torch.matmul( w, w1.T )
w = F.leaky_relu( w, negative_slope=sc.negative_slope)
w = F.sigmoid(w-5.)
#w = F.tanh(w)
#w = F.relu(w)
#w = w.mean(axis=1)
#w = F.softmax(w)

In [None]:
inv_mapper = { v:k for (k,v) in tokenizer.node_mapper.items() }

In [None]:
terms_idx[start:start+size]

In [None]:
fold.X_val[1]

In [None]:
bla = w.mean(axis=1)
#bla = F.softmax(bla)
bla = bla/torch.clamp(bla.sum(), 0.0001)
bla

In [None]:
[ (i, tid.item(),inv_mapper[tid.item()], wei.item()) for i, (tid, wei) in enumerate(zip(terms_idx[start:start+size], bla)) ]

In [None]:
w

In [None]:
w.mean(axis=1)

In [None]:
w.shape