In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from multiprocessing import Pool
from collections import namedtuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import networkx as nx
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords as stopwords_by_lang

import copy

from tqdm.notebook import tqdm

import re
from collections import Counter
import scipy.sparse as sp
import numpy as np

In [2]:
from TGA.utils import Dataset

In [3]:
dataset = Dataset('/home/Documents/datasets/acm/')
g = dataset.get_fold_instances(10, with_val=True)
fold = next(g)
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val'), 19907)

In [4]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

replace_patterns = [
    ('<[^>]*>', ''),                                    # remove HTML tags
    ('(\D)\d\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D\D)\d\d\d\D\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedZipcodePlusFour \\2'),
    ('(\D)\d(\D)', '\\1ParsedOneDigit\\2'),
    ('(\D)\d\d(\D)', '\\1ParsedTwoDigits\\2'),
    ('(\D)\d\d\d(\D)', '\\1ParsedThreeDigits\\2'),
    ('(\D)\d\d\d\d(\D)', '\\1ParsedFourDigits\\2'),
    ('(\D)\d\d\d\d\d(\D)', '\\1ParsedFiveDigits\\2'),
    ('(\D)\d\d\d\d\d\d(\D)', '\\1ParsedSixDigits\\2'),
    ('\d+', 'ParsedDigits')
]

compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]

def generate_preprocessor(replace_patterns):
    compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]
    def preprocessor(text):
        for pattern, replace in compiled_replace_patterns:
            text = re.sub(pattern, replace, text)
        text = text.lower()
        return text
    return preprocessor

generated_patters=generate_preprocessor(replace_patterns)

def preprocessor(text):
    # For each pattern, replace it with the appropriate string
    for pattern, replace in compiled_replace_patterns:
        text = re.sub(pattern, replace, text)
    text = text.lower()
    return text

class Tokenizer(BaseEstimator, TransformerMixin):
    def __init__(self, mindf=2, lan='english', stopwords='nltk', model='topk', k=500, verbose=False):
        super(Tokenizer, self).__init__()
        self.mindf = mindf
        self.le = LabelEncoder()
        self.verbose = verbose
        self.lan = lan
        if stopwords == 'nltk':
            self.stopwordsSet = stopwords_by_lang.words(lan)
        elif stopwords == 'scikit':
            self.stopwordsSet = stop_words
        else:
            self.stopwordsSet = []
        self.model =  model
        self.k     = k
        self.analyzer = TfidfVectorizer(preprocessor=preprocessor, min_df=mindf)#.build_analyzer()
        self.local_analyzer = self.analyzer.build_analyzer()
        self.analyzer.set_params( analyzer=self.local_analyzer )
        self.node_mapper      = {}
        
    def analyzer_doc(self, doc):
        return self.local_analyzer(doc)
    def fit(self, X, y):
        self.N = len(X)
        y = self.le.fit_transform( y )
        self.n_class = len(self.le.classes_)
        docs_in_terms = []
        
        with Pool(processes=18) as p:
            #docs = map(self.local_analyzer, X)
            for doc_in_terms in tqdm(p.imap(self.analyzer_doc, X), total=self.N, disable=not self.verbose):
                doc_in_terms = list(set(map( self._filter_fit_, list(doc_in_terms) ))) 
                docs_in_terms.extend(doc_in_terms)
        
        self.term_freqs       = Counter(docs_in_terms)
        self.term_freqs       = { term:v for (term,v) in self.term_freqs.items() if v >= self.mindf }
        self.node_mapper      = { term: self.node_mapper.setdefault(term, len(self.node_mapper)+1)
                                 for term in self.term_freqs.keys() }
        self.node_mapper['<BLANK>'] = 0
        self.term_freqs['<BLANK>']  = self.N
        
        self.node_mapper['<UNK>']   = len(self.node_mapper)
        self.term_freqs['<UNK>']  = self.N
        self.vocab_size = len(self.node_mapper)
        
        self.term_array = [ term for (term,term_id) in sorted(self.node_mapper.items(), key=lambda x: x[1]) ]
        
        self.fi_ = np.array([ np.log2( (self.N+1)/(self.term_freqs[term]+1) ) for term in self.term_array ])
            
        return self
    def _filter_transform_(self, term):
        if term in self.stopwordsSet:
            return '<STPW>'
        if term not in self.node_mapper:
            return '<UNK>'
        return term
    def _filter_fit_(self, term):
        if term in self.stopwordsSet:
            return '<STPW>'
        return term
    def _model_(self, doc):
        doc_counter = Counter(doc)
        doc = np.array(list(doc_counter.keys()))
        if len(doc) > self.k:
            weigths = np.array([ self.fi_[t] for t in doc ])
            weigths = softmax(weigths)
            if self.model == 'topk':
                doc = doc[(-weigths).argsort()[:self.k]]
            elif self.model == 'sample':
                doc = np.random.choice(doc, size=self.k, replace=False, p=weigths)
        TFs = np.array([ doc_counter[tid] for tid in doc ])
        DFs = np.array([ self.term_freqs[self.term_array[tid]] for tid in doc ])
        return doc, TFs, DFs
    def transform(self, X, verbose=None):
        verbose = verbose if verbose is not None else self.verbose
        n = len(X)
        terms_ = []
        for i,doc_in_terms in tqdm(enumerate(map(self.analyzer_doc, X)), total=n, disable=not verbose):
            doc_in_terms = map( self._filter_transform_, doc_in_terms )
            #doc_in_terms = filter( lambda x: x != '<STPW>', doc_in_terms )
            doc_tids = [ self.node_mapper[tid] for tid in doc_in_terms ]
            doc_tids, TFs, DFs = self._model_(doc_tids)
            terms_.append( (doc_tids, TFs, DFs) )
        doc_tids, TFs, DFs = list(zip(*terms_))
        return list(doc_tids), list(TFs), list(DFs)

In [5]:
tokenizer = Tokenizer(mindf=1, verbose=True, stopwords=None)
tokenizer.fit(fold.X_train, fold.y_train)
tokenizer.vocab_size, tokenizer.N

  0%|          | 0/19907 [00:00<?, ?it/s]

(49336, 19907)

In [6]:
doc_tids, TFs, DFs =  tokenizer.transform( fold.X_val )

  0%|          | 0/2495 [00:00<?, ?it/s]

In [7]:
pad_sequence(list(map(torch.LongTensor, TFs)), batch_first=True, padding_value=0)

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 4,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 3, 4,  ..., 0, 0, 0],
        [4, 3, 2,  ..., 0, 0, 0],
        [1, 1, 2,  ..., 0, 0, 0]])

In [8]:
tokenizer.term_array[15749]

'meting'

In [9]:
y_train = tokenizer.le.transform( fold.y_train )
y_val   = tokenizer.le.transform( fold.y_val )
y_test  = tokenizer.le.transform( fold.y_test )

In [10]:
class AttentionTFIDF_V1(nn.Module):
    def __init__(self, vocab_size, hiddens, nclass, maxF=20, drop=.5,
                 initrange=.5, negative_slope=99.):
        super(AttentionTFIDF_V1, self).__init__()
        self.hiddens        = hiddens
        self.maxF           = maxF
        self.value_emb      = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.query_emb      = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.key_emb        = nn.Embedding(vocab_size, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.TF_emb         = nn.Embedding(maxF, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.DF_emb         = nn.Embedding(maxF, hiddens, scale_grad_by_freq=True, padding_idx=0)
        self.fc             = nn.Linear(hiddens, nclass)
        self.initrange      = initrange 
        self.negative_slope = negative_slope
        self.drop_          = drop
        self.init_weights()
    def forward(self, doc_tids, TFs, DFs):
        batch_size = doc_tids.size(0)
        bx_packed  = doc_tids == 0
        pad_mask   = bx_packed.logical_not()
        doc_sizes  = pad_mask.sum(dim=1).view(batch_size, 1)
        pad_mask   = pad_mask.view(*bx_packed.shape, 1)
        pad_mask   = pad_mask.logical_and(pad_mask.transpose(1, 2))
        
        TFs     = torch.clamp( TFs, max=self.maxF-1 )
        h_TFs   = self.TF_emb( TFs )
        h_TFs   = F.dropout( h_TFs, p=self.drop_, training=self.training )
        
        DFs     = torch.clamp( DFs, max=self.maxF-1 )
        h_DFs   = self.DF_emb( DFs )
        h_DFs   = F.dropout( h_DFs, p=self.drop_, training=self.training )
        
        h_query = self.query_emb( doc_tids )
        h_query = h_query + h_TFs + h_DFs
        #h_query = torch.tanh( h_query )
        h_query = F.dropout( h_query, p=self.drop_, training=self.training )
        
        h_key = self.key_emb( doc_tids )
        h_key = h_key + h_TFs + h_DFs
        #h_key = torch.tanh( h_key )
        h_key = F.dropout( h_key, p=self.drop_, training=self.training )
        
        co_weights  = torch.bmm( h_key, h_query.transpose( 1, 2 ) )
        #co_weights = torch.tanh( co_weights )
        #co_weights  = co_weights / torch.pow(1.+co_weights, 2.)
        co_weights  = F.leaky_relu( co_weights, negative_slope=self.negative_slope)
        
        #co_weights[pad_mask.logical_not()] = 0. # Set the 3D-pad mask values to
        #co_weights = torch.tanh(co_weights)
        
        co_weights[pad_mask.logical_not()] = float('-inf') # Set the 3D-pad mask values to -inf (=0 in sigmoid)
        co_weights = torch.sigmoid(co_weights)
        
        weights = co_weights.sum(axis=2) / doc_sizes
        weights[bx_packed] = float('-inf') # Set the 2D-pad mask values to -inf  (=0 in softmax)
        
        weights = torch.softmax(weights, dim=1)
        weights = torch.where(torch.isnan(weights), torch.zeros_like(weights), weights)
        weights = weights.view( *weights.shape, 1 )
        
        h_value = self.value_emb( doc_tids )
        h_value = h_value + h_TFs + h_DFs
        h_value = F.dropout( h_value, p=self.drop_, training=self.training )
        
        docs_h = h_value * weights
        docs_h = docs_h.sum(axis=1)
        docs_h = F.dropout( docs_h, p=self.drop_, training=self.training )
        docs_h = self.fc(docs_h)
        return docs_h, weights, co_weights
    
    def init_weights(self):
        self.TF_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.DF_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.query_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.key_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.value_emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.fc.weight.data.uniform_(-self.initrange, self.initrange)
        

In [11]:
nepochs = 1000
max_epochs = 30
drop=0.8
max_drop=.8 # default .8
device = torch.device('cuda:0')
batch_size = 32 # default 32
k = 128 #

In [12]:
def collate_train(param):
    X, y = zip(*param)
    doc_tids, TFs, DFs = tokenizer.transform(X, verbose=False)
    doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)), batch_first=True, padding_value=0)
    
    TFs = pad_sequence(list(map(torch.tensor, TFs)), batch_first=True, padding_value=0)
    TFs = torch.LongTensor(torch.log2(TFs+1).round().long())
    
    DFs = pad_sequence(list(map(torch.tensor, DFs)), batch_first=True, padding_value=0)
    DFs = torch.LongTensor(torch.log2(DFs+1).round().long())
    
    return doc_tids, TFs, DFs, torch.LongTensor(y)

In [13]:
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f4294136070>

In [14]:
torch.device('cuda:0')

device(type='cuda', index=0)

In [15]:
#sc = SimpleClassifier(tokenizer.vocab_size, 300, tokenizer.n_class, dropout=drop).to( device )
ab = AttentionTFIDF_V1(tokenizer.vocab_size, 300, tokenizer.n_class,
                       initrange=0.2, drop=drop).to( device )
#ab = AttentionBag(tokenizer.vocab_size, 300, tokenizer.n_class, drop=drop).to( device )
#ab = NotTooSimpleClassifier(tokenizer.vocab_size, 300, tokenizer.n_class, dropout1=drop, dropout2=drop).to( device )
tokenizer.k = k
optimizer = optim.AdamW( ab.parameters(), lr=5e-2, weight_decay=5e-4)
#optimizer = optim.AdamW( ab.parameters(), lr=5e-3, weight_decay=5e-3)
loss_func_cel = nn.CrossEntropyLoss().to( device )
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=.95,
                                                       patience=10, verbose=True)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=.98, verbose=True)

In [16]:
num_workers=32

In [None]:
best = 99999.
counter = 1
loss_val = 1.
eps = .9
dl_val = DataLoader(list(zip(fold.X_val, y_val)), batch_size=batch_size,
                         shuffle=False, collate_fn=collate_train, num_workers=num_workers)
for e in tqdm(range(nepochs), total=nepochs):
    dl_train = DataLoader(list(zip(fold.X_train, y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate_train, num_workers=num_workers)
    loss_train  = 0.
    with tqdm(total=len(y_train)+len(y_val), smoothing=0., desc=f"Epoch {e+1}") as pbar:
        total = 0
        correct  = 0
        ab.train()
        tokenizer.model = 'sample'
        tokenizer.k = k
        for i, (doc_tids, TFs, DFs, y) in enumerate(dl_train):
            doc_tids = doc_tids.to( device )
            TFs      = TFs.to( device )
            DFs      = DFs.to( device )
            y        = y.to( device )
            
            pred_docs,_,_ = ab( doc_tids, TFs, DFs )
            pred_docs     = torch.softmax(pred_docs, dim=1)
            loss          = loss_func_cel(pred_docs, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_train += loss.item()
            total      += len(y)
            y_pred      = pred_docs.argmax(axis=1)
            correct    += (y_pred == y).sum().item()
            #ab.drop_ =  np.power((correct/total),loss_val)
            #ab.drop_ =  np.power((correct/total),4)
            ab.drop_ =  (correct/total)*max_drop
            
            toprint  = f"Train loss: {loss_train/(i+1):.5}/{loss.item():.5} "
            toprint += f'Drop: {ab.drop_:.5} '
            toprint += f'ACC: {correct/total:.5} '
            
            print(toprint, end=f"{' '*100}\r")
            
            pbar.update( len(y) )
            del doc_tids, TFs
            del DFs, y, pred_docs
            del loss, y_pred
        loss_train = loss_train/(i+1)
        print()
        #print(ab.drop_)
        total = 0
        correct  = 0
        ab.eval()
        tokenizer.model = 'topk'
        tokenizer.k = 512
        with torch.no_grad():
            loss_val = 0.
            for i, (doc_tids, TFs, DFs, y) in enumerate(dl_val):
                doc_tids = doc_tids.to( device )
                TFs      = TFs.to( device )
                DFs      = DFs.to( device )
                y        = y.to( device )

                pred_docs,_,_ = ab( doc_tids, TFs, DFs )
                pred_docs     = torch.softmax(pred_docs, dim=1)
                loss          = loss_func_cel(pred_docs, y)

                loss_val   += loss.item()
                total      += len(y)
                y_pred      = pred_docs.argmax(axis=1)
                correct    += (y_pred == y).sum().item()
                
                print(f'Val loss: {loss_val/(i+1):.5} ACC: {correct/total:.5}', end=f"{' '*100}\r")
                pbar.update( len(y) )
                
                del doc_tids, TFs, DFs, y
                del pred_docs, loss
            print()
            loss_val   = (loss_val/(i+1))
            scheduler.step(loss_val)

            if best-loss_val > 0.0001 :
                best = loss_val
                counter = 1
                print(f'New Best Val loss: {best:.5}', end=f"{' '*100}\n")
                best_model = copy.deepcopy(ab).to('cpu')
            elif counter > max_epochs:
                print(f'Best Val loss: {best:.5}', end=f"{' '*100}\n")
                break
            else:
                counter += 1

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/22402 [00:00<?, ?it/s]

Train loss: 2.28/2.543 Drop: 0.2107 ACC: 0.26337                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
Val loss: 2.2793 ACC: 0.26373                                                                                                    

In [None]:
device_test = 'cpu'
ab = copy.deepcopy(best_model).to(device_test)
ab.eval()
loss_total = 0
correct_t = 0
total_t = 0
dl_test = DataLoader(list(zip(fold.X_test, y_test)), batch_size=128,
                         shuffle=False, collate_fn=collate_train, num_workers=num_workers)
tokenizer.k = 256
for i, (docs_tids_t, TFs_t, DFs_t, y_t) in enumerate(dl_test):
    docs_tids_t = docs_tids_t.to( device_test )
    TFs_t       = TFs_t.to( device_test )
    DFs_t       = DFs_t.to( device_test )
    y_t         = y_t.to( device_test )

    pred_docs_t,weigths,coweights = ab( docs_tids_t, TFs_t, DFs_t )
    sofmax_docs_t = torch.softmax(pred_docs_t, dim=1)

    y_pred_t    = sofmax_docs_t.argmax(axis=1)
    correct_t  += (y_pred_t == y_t).sum().item()
    total_t    += len(y_t)
    loss_total += loss_func_cel(sofmax_docs_t, y_t)

    print(f'Test loss: {loss_total.item()/(i+1):.5} ACC: {correct_t/total_t:.5}', end=f"{' '*100}\r")

In [None]:
v1 = torch.FloatTensor([[1,0,0],[0,1,0],[0,0,1]])