In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from multiprocessing import Pool
from collections import namedtuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import networkx as nx
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords as stopwords_by_lang
import copy

from tqdm.notebook import tqdm

import re
from collections import Counter
import scipy.sparse as sp
import numpy as np

In [2]:
from TGA.utils import Dataset

In [3]:
dataset = Dataset('/home/Documents/datasets/20ng/')
fold = next(dataset.get_fold_instances(10, with_val=True))
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val'), 15062)

In [4]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

replace_patterns = [
    ('<[^>]*>', ''),                                    # remove HTML tags
    ('(\D)\d\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D\D)\d\d\d\D\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedZipcodePlusFour \\2'),
    ('(\D)\d(\D)', '\\1ParsedOneDigit\\2'),
    ('(\D)\d\d(\D)', '\\1ParsedTwoDigits\\2'),
    ('(\D)\d\d\d(\D)', '\\1ParsedThreeDigits\\2'),
    ('(\D)\d\d\d\d(\D)', '\\1ParsedFourDigits\\2'),
    ('(\D)\d\d\d\d\d(\D)', '\\1ParsedFiveDigits\\2'),
    ('(\D)\d\d\d\d\d\d(\D)', '\\1ParsedSixDigits\\2'),
    ('\d+', 'ParsedDigits')
]

compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]

def generate_preprocessor(replace_patterns):
    compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]
    def preprocessor(text):
        for pattern, replace in compiled_replace_patterns:
            text = re.sub(pattern, replace, text)
        text = text.lower()
        return text
    return preprocessor

generated_patters=generate_preprocessor(replace_patterns)

def preprocessor(text):
    # For each pattern, replace it with the appropriate string
    for pattern, replace in compiled_replace_patterns:
        text = re.sub(pattern, replace, text)
    text = text.lower()
    return text

class Tokenizer(BaseEstimator, TransformerMixin):
    def __init__(self, mindf=2, lan='english', model='topk', k=500, verbose=False):
        super(Tokenizer, self).__init__()
        self.mindf = mindf
        self.le = LabelEncoder()
        self.verbose = verbose
        self.stopwordsSet = stop_words
        self.model =  model
        self.k     = k
        self.lan = lan
        self.analyzer = TfidfVectorizer(preprocessor=preprocessor, min_df=mindf)#.build_analyzer()
        self.local_analyzer = self.analyzer.build_analyzer()
        self.analyzer.set_params( analyzer=self.local_analyzer )
        self.node_mapper      = {}
        
    def analyzer_doc(self, doc):
        return self.local_analyzer(doc)
    def fit(self, X, y):
        self.N = len(X)
        y = self.le.fit_transform( y )
        self.n_class = len(self.le.classes_)
        docs_in_terms = []
        
        with Pool(processes=18) as p:
            #docs = map(self.local_analyzer, X)
            for doc_in_terms in tqdm(p.imap(self.analyzer_doc, X), total=self.N, disable=not self.verbose):
                doc_in_terms = list(set(map( self._filter_fit_, list(doc_in_terms) ))) 
                docs_in_terms.extend(doc_in_terms)
        
        self.term_freqs       = Counter(docs_in_terms)
        self.term_freqs       = { term:v for (term,v) in self.term_freqs.items() if v >= self.mindf }
        self.node_mapper      = { term: self.node_mapper.setdefault(term, len(self.node_mapper)+1)
                                 for term in self.term_freqs.keys() }
        self.node_mapper['<BLANK>'] = 0
        self.term_freqs['<BLANK>']  = self.N
        
        self.node_mapper['<UNK>']   = len(self.node_mapper)
        self.term_freqs['<UNK>']  = self.N
        self.vocab_size = len(self.node_mapper)
        
        self.term_array = [ term for (term,term_id) in sorted(self.node_mapper.items(), key=lambda x: x[1]) ]
        
        self.fi_ = np.array([ np.log2( (self.N+1)/(self.term_freqs[term]+1) ) for term in self.term_array ])
            
        return self
    def _get_idx_(self, term):
        return 
    def _filter_transform_(self, term):
        if term in self.stopwordsSet:
            return '<STPW>'
        if term not in self.node_mapper:
            return '<UNK>'
        return term
    def _filter_fit_(self, term):
        if term in self.stopwordsSet:
            return '<STPW>'
        return term
    def _model_(self, doc):
        doc_counter = Counter(doc)
        doc = np.array(list(doc_counter.keys()))
        if len(doc) > self.k:
            weigths = np.array([ self.fi_[t] for t in doc ])
            weigths = softmax(weigths)
            if self.model == 'topk':
                doc = doc[(-weigths).argsort()[:self.k]]
            elif self.model == 'sample':
                doc = np.random.choice(doc, size=self.k, replace=False, p=weigths)
        return doc, np.array([ doc_counter[t] for t in doc ])
    def transform(self, X, verbose=None):
        verbose = verbose if verbose is not None else self.verbose
        n = len(X)
        doc_off = [0]
        terms_idx = []
        terms_frq = []
        for i,doc_in_terms in tqdm(enumerate(map(self.analyzer_doc, X)), total=n, disable=not verbose):
            doc_in_terms = map( self._filter_transform_, doc_in_terms )
            doc_in_terms = filter( lambda x: x != '<STPW>', doc_in_terms )
            doc_in_terms = [ self.node_mapper[tid] for tid in doc_in_terms ]
            doc_in_terms, freqs = self._model_(doc_in_terms)
            doc_off.append( len(doc_in_terms) )
            terms_idx.extend( doc_in_terms )
            terms_frq.extend( freqs )
        return np.array( terms_idx ), np.array( terms_frq ), np.array(doc_off)[:-1].cumsum()

In [5]:
tokenizer = Tokenizer(mindf=1, verbose=True)
tokenizer.fit(fold.X_train, fold.y_train)
tokenizer.vocab_size, tokenizer.N

  0%|          | 0/15062 [00:00<?, ?it/s]

(98702, 15062)

In [6]:
terms_idx, terms_frq, doc_offs = tokenizer.transform( fold.X_val )
terms_idx

  0%|          | 0/1892 [00:00<?, ?it/s]

array([  127, 34792,   172, ..., 10369,  5056,  7184])

In [7]:
y_train = tokenizer.le.transform( fold.y_train )
y_val   = tokenizer.le.transform( fold.y_val )
y_test  = tokenizer.le.transform( fold.y_test )

In [8]:
def collate_train(param):
    X, y = zip(*param)
    terms_ids, terms_frq, docs_offsets = tokenizer.transform(X, verbose=False)
    return torch.LongTensor(terms_ids), torch.LongTensor(terms_frq), torch.LongTensor(docs_offsets), torch.LongTensor(y)

In [9]:
class SimpleAttentionBagO(nn.Module):
    def __init__(self, vocab_size, hiddens, nclass, drop=.5, maxfreq=10, initrange=.5, negative_slope=99.):
        super(SimpleAttentionBag, self).__init__()
        self.hiddens        = hiddens
        self.maxfreq        = maxfreq
        self.dt_emb         = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.tt_s_emb       = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.tt_t_emb       = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.freq_emb       = nn.Embedding(maxfreq, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.fc             = nn.Linear(hiddens, nclass)
        self.initrange      = initrange 
        self.negative_slope = negative_slope
        self.drop           = nn.Dropout(drop)
        self.norm           = nn.BatchNorm1d(hiddens)
        self.drop_          = drop
        self.sig            = nn.Sigmoid()
        self.init_weights()
    def forward(self, terms_idx, docs_offsets, terms_frq=None):
        n = terms_idx.shape[0]
        batch_size = docs_offsets.shape[0]
        
        k         = [ terms_idx[ docs_offsets[i-1]:docs_offsets[i] ] for i in range(1, batch_size) ]
        k.append( terms_idx[ docs_offsets[-1]: ] )
        x_packed  = pad_sequence(k, batch_first=True, padding_value=0)
        if terms_frq is not None:
            j           = [ terms_frq[ docs_offsets[i-1]:docs_offsets[i] ] for i in range(1, batch_size) ]
            j.append( terms_frq[ docs_offsets[-1]: ] )
            terms_frq   = pad_sequence(j, batch_first=True, padding_value=0)
            terms_frq   = torch.clamp(terms_frq, 0, self.maxfreq-1)
            terms_frq_h = self.freq_emb(terms_frq)

        bx_packed = x_packed == 0
        doc_sizes = bx_packed.logical_not().sum(dim=1).view(batch_size, 1)
        pad_mask  = bx_packed.logical_not()
        pad_mask  = pad_mask.view(*bx_packed.shape, 1)
        pad_mask  = pad_mask.logical_and(pad_mask.transpose(1, 2))
        
        tt_h     = self.tt_s_emb( x_packed )
        tt_dir_h = self.tt_t_emb( x_packed ) + terms_frq_h
        
        dt_h     = tt_dir_h + terms_frq_h
        dt_h     = F.dropout( dt_h, p=self.drop_, training=self.training )
        
        tt_h = torch.tanh(tt_h)
        tt_h = F.dropout( tt_h, p=self.drop_, training=self.training )
        
        tt_dir_h = torch.tanh(tt_dir_h)
        tt_dir_h = F.dropout( tt_dir_h, p=self.drop_, training=self.training )
        
        co_weights = torch.bmm( tt_h, tt_dir_h.transpose( 1, 2 ) )
        co_weights = F.leaky_relu( co_weights, negative_slope=self.negative_slope)
        
        co_weights[pad_mask.logical_not()] = float('-inf') # Set the 3D-pad mask values to -inf (=0 in sigmoid)
        co_weights = F.sigmoid(co_weights)
        
        weights = co_weights.sum(axis=2) / doc_sizes
        weights[bx_packed] = float('-inf') # Set the 2D-pad mask values to -inf  (=0 in softmax)
        
        weights = torch.softmax(weights, dim=1)
        weights = torch.where(torch.isnan(weights), torch.zeros_like(weights), weights)
        weights = weights.view( *weights.shape, 1 )
        #if terms_frq is not None:
        #    terms_frq = terms_frq.view( *terms_frq.shape, 1 )
        #    weights = weights * terms_frq
        
        docs_h = dt_h * weights
        docs_h = docs_h.sum(axis=1)
        docs_h = F.dropout( docs_h, p=self.drop_, training=self.training )
        docs_h = self.fc(docs_h)
        return docs_h, weights, co_weights
    
    def init_weights(self):
        self.freq_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.dt_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.tt_s_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.tt_t_emb.weight.data.uniform_(-self.initrange, self.initrange)

In [10]:
class SimpleAttentionBag(nn.Module):
    def __init__(self, vocab_size, hiddens, nclass, drop=.5, maxfreq=10, initrange=.5, negative_slope=99.):
        super(SimpleAttentionBag, self).__init__()
        self.hiddens        = hiddens
        self.maxfreq        = maxfreq
        self.term_query_emb = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.term_key_emb   = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.term_value_emb = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.freq_emb       = nn.Embedding(maxfreq, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.repr_          = nn.Linear(hiddens, hiddens)
        self.fc             = nn.Linear(hiddens, nclass)
        self.initrange      = initrange 
        self.negative_slope = negative_slope
        self.drop_          = drop
        self.init_weights()
    def forward(self, terms_idx, docs_offsets, terms_frq):
        n = terms_idx.shape[0]
        batch_size = docs_offsets.shape[0]
        
        k         = [ terms_idx[ docs_offsets[i-1]:docs_offsets[i] ] for i in range(1, batch_size) ]
        k.append( terms_idx[ docs_offsets[-1]: ] )
        x_packed  = pad_sequence(k, batch_first=True, padding_value=0)
        
        j           = [ terms_frq[ docs_offsets[i-1]:docs_offsets[i] ] for i in range(1, batch_size) ]
        j.append( terms_frq[ docs_offsets[-1]: ] )
        
        terms_frq   = pad_sequence(j, batch_first=True, padding_value=0)
        terms_frq   = torch.clamp(terms_frq, 0, self.maxfreq)
        terms_frq_h = self.freq_emb(terms_frq)
        
        bx_packed = x_packed == 0
        doc_sizes = bx_packed.logical_not().sum(dim=1).view(batch_size, 1)
        
        pad_mask  = bx_packed.logical_not()
        
        pad_mask  = pad_mask.view(*bx_packed.shape, 1)
        
        pad_mask  = pad_mask.logical_and(pad_mask.transpose(1, 2))
        
        query_hidden = self.term_query_emb( x_packed )
        query_hidden = query_hidden + terms_frq_h
        query_hidden = torch.tanh(query_hidden)
        query_hidden = F.dropout( query_hidden, p=self.drop_, training=self.training )
        
        key_hidden   = self.term_key_emb( x_packed )
        key_hidden   = key_hidden + terms_frq_h
        key_hidden   = torch.tanh(key_hidden)
        key_hidden   = F.dropout( key_hidden, p=self.drop_, training=self.training )
        
        trmdc_hidden = self.term_value_emb( x_packed )
        trmdc_hidden = trmdc_hidden + terms_frq_h
        #trmdc_hidden = F.leaky_relu( trmdc_hidden, negative_slope=self.negative_slope)
        trmdc_hidden = F.dropout( trmdc_hidden, p=self.drop_, training=self.training )
        
        co_weights = torch.bmm( key_hidden, query_hidden.transpose( 1, 2 ) )
        co_weights = F.leaky_relu( co_weights, negative_slope=self.negative_slope)
        
        co_weights[pad_mask.logical_not()] = float('-inf') # Set the 3D-pad mask values to -inf (=0 in sigmoid)
        co_weights = torch.sigmoid(co_weights)
        
        weights = co_weights.sum(axis=2) / doc_sizes
        weights[bx_packed] = float('-inf') # Set the 2D-pad mask values to -inf  (=0 in softmax)
        
        weights = torch.softmax(weights, dim=1)
        #weights = torch.sigmoid(weights)
        weights = torch.where(torch.isnan(weights), torch.zeros_like(weights), weights)
        weights = weights.view( *weights.shape, 1 )
        
        docs_h = trmdc_hidden * weights
        docs_h = docs_h.sum(axis=1)
        docs_h = F.dropout( docs_h, p=self.drop_, training=self.training )
        docs_h = self.fc(docs_h)
        return docs_h, weights, co_weights
    
    def init_weights(self):
        self.term_query_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.term_key_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.freq_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.repr_.weight.data.uniform_(-self.initrange, self.initrange)

In [11]:
nepochs = 1000
max_epochs = 30
drop=0.7
max_drop=0.75
device = torch.device('cuda:0')
batch_size = 64
k = 32

In [12]:
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7fa7bcf87070>

In [13]:
torch.device('cuda:0')

device(type='cuda', index=0)

In [14]:
#sc = SimpleClassifier(tokenizer.vocab_size, 300, tokenizer.n_class, dropout=drop).to( device )
ab = SimpleAttentionBag(tokenizer.vocab_size, 300, tokenizer.n_class, drop=drop).to( device )
#ab = AttentionBag(tokenizer.vocab_size, 300, tokenizer.n_class, drop=drop).to( device )
#ab = NotTooSimpleClassifier(tokenizer.vocab_size, 300, tokenizer.n_class, dropout1=drop, dropout2=drop).to( device )
tokenizer.k = k
optimizer = optim.AdamW( ab.parameters(), lr=5e-3, weight_decay=5e-3)
loss_func_cel = nn.CrossEntropyLoss().to( device )
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=.95,
                                                       patience=10, verbose=True)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=.98, verbose=True)

In [15]:
num_workers=64

In [16]:
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [18]:
best = 99999.
counter = 1
loss_val = 1.
eps = .9
dl_val = DataLoader(list(zip(fold.X_val, y_val)), batch_size=batch_size,
                         shuffle=False, collate_fn=collate_train, num_workers=num_workers)
for e in tqdm(range(nepochs), total=nepochs):
    dl_train = DataLoader(list(zip(fold.X_train, y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate_train, num_workers=num_workers)
    loss_train  = 0.
    with tqdm(total=len(y_train)+len(y_val), smoothing=0., desc=f"Epoch {e+1}") as pbar:
        total = 0
        correct  = 0
        ab.train()
        tokenizer.model = 'sample'
        tokenizer.k = k
        for i, (terms_idx, terms_frq, docs_offsets, y) in enumerate(dl_train):
            terms_idx    = terms_idx.to( device )
            docs_offsets = docs_offsets.to( device )
            terms_frq = terms_frq.to( device )
            y            = y.to( device )
            
            pred_docs,_,_ = ab( terms_idx, docs_offsets, terms_frq)
            pred_docs     = torch.softmax(pred_docs, dim=1)
            loss          = loss_func_cel(pred_docs, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_train += loss.item()
            total      += len(y)
            y_pred      = pred_docs.argmax(axis=1)
            correct    += (y_pred == y).sum().item()
            #ab.drop_ =  np.power((correct/total),loss_val)
            #ab.drop_ =  np.power((correct/total),4)
            ab.drop_ =  (correct/total)*max_drop
            
            toprint  = f"Train loss: {loss_train/(i+1):.5}/{loss.item():.5} "
            toprint += f'Drop: {ab.drop_:.5} '
            toprint += f'ACC: {correct/total:.5} '
            
            print(toprint, end=f"{' '*100}\r")
            
            pbar.update( len(y) )
            del pred_docs, loss
            del terms_idx, docs_offsets, y
            del y_pred
        loss_train = loss_train/(i+1)
        print()
        #print(ab.drop_)
        total = 0
        correct  = 0
        ab.eval()
        tokenizer.model = 'topk'
        tokenizer.k = 512
        with torch.no_grad():
            loss_val = 0.
            for i, (terms_idx, terms_frq, docs_offsets, y) in enumerate(dl_val):
                terms_idx    = terms_idx.to( device )
                docs_offsets = docs_offsets.to( device )
                y            = y.to( device )
                terms_frq = terms_frq.to( device )

                pred_docs, weights, co_weights = ab( terms_idx, docs_offsets, terms_frq)
                pred_docs   = torch.softmax(pred_docs, dim=1)

                y_pred      = pred_docs.argmax(axis=1)
                correct    += (y_pred == y).sum().item()
                total      += len(y)
                loss2       = loss_func_cel(pred_docs, y)
                loss_val   += loss2

                print(f'Val loss: {loss_val.item()/(i+1):.5} ACC: {correct/total:.5}', end=f"{' '*100}\r")
   
                pbar.update( len(y) )
            print()

            del terms_idx, docs_offsets, y
            del y_pred
            
            loss_val   = (loss_val/(i+1)).cpu()
            scheduler.step(loss_val)

            if best-loss_val > 0.0001 :
                best = loss_val.item()
                counter = 1
                print(f'New Best Val loss: {best:.5}', end=f"{' '*100}\n")
                best_model = copy.deepcopy(ab).to('cpu')
            elif counter > max_epochs:
                print(f'Best Val loss: {best:.5}', end=f"{' '*100}\n")
                break
            else:
                counter += 1
            del pred_docs, loss2

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/16954 [00:00<?, ?it/s]

RuntimeError: CUDA error: device-side assert triggered

In [None]:
device_test = 'cpu'
ab = copy.deepcopy(best_model).to(device_test)
ab.eval()
loss_total = 0
correct_t = 0
total_t = 0
dl_test = DataLoader(list(zip(fold.X_test, y_test)), batch_size=128,
                         shuffle=False, collate_fn=collate_train, num_workers=num_workers)
tokenizer.k = 512
for i, (terms_idx_t, terms_frq_t, docs_offsets_t, y_t) in enumerate(dl_test):
    terms_idx_t    = terms_idx_t.to( device_test )
    docs_offsets_t = docs_offsets_t.to( device_test )
    terms_frq_t = terms_frq_t.to( device_test )
    y_t            = y_t.to( device_test )

    pred_docs_t,weigths,coweights = ab( terms_idx_t, docs_offsets_t, terms_frq_t )
    sofmax_docs_t = torch.softmax(pred_docs_t, dim=1)

    y_pred_t    = sofmax_docs_t.argmax(axis=1)
    correct_t  += (y_pred_t == y_t).sum().item()
    total_t    += len(y_t)
    loss_total += loss_func_cel(sofmax_docs_t, y_t)

    print(f'Test loss: {loss_total.item()/(i+1):.5} ACC: {correct_t/total_t:.5}', end=f"{' '*100}\r")

In [None]:
j           = [ terms_frq_t[ docs_offsets_t[i-1]:docs_offsets_t[i] ] for i in range(1, docs_offsets_t.shape[0]) ]
j.append( terms_frq_t[ docs_offsets_t[-1]: ] )
terms_frq   = pad_sequence(j, batch_first=True, padding_value=0)
terms_frq   = torch.clamp(terms_frq, 0, ab.maxfreq-1)
terms_frq_h = ab.freq_emb(terms_frq)

In [None]:
terms_frq_h.shape

In [None]:
terms_frq_h.transpose(0,2).shape

In [None]:
torch.cat([terms_frq_h.transpose(0,2), terms_frq_h.transpose(0,2)]).transpose(0,2).shape