In [1]:
from utils import Dataset, Graphsize
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from time import time
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [113]:
webkb = Dataset('/home/mangaravite/Documents/datasets/topics/webkb/')
_20ng = Dataset('/home/mangaravite/Documents/datasets/topics/20ng/')
acm   = Dataset('/home/mangaravite/Documents/datasets/topics/acm/')
reut  = Dataset('/home/mangaravite/Documents/datasets/topics/reut/')

dataset = reut

In [114]:
fold = next(dataset.get_fold_instances(5))
fold._fields

('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val')

In [115]:
%%time
max_feats = 25000
graph_builder = Graphsize(w=5, verbose=True, max_feat=max_feats, feature_type='full_weight_prob')
Gs_train = graph_builder.fit_transform(fold.X_train)
Gs_val   = graph_builder.transform(fold.X_val)

100%|██████████| 7923/7923 [00:13<00:00, 590.32it/s]
100%|██████████| 7923/7923 [00:04<00:00, 1607.15it/s]
100%|██████████| 7923/7923 [01:15<00:00, 104.52it/s]
100%|██████████| 7923/7923 [00:14<00:00, 542.23it/s]
100%|██████████| 2702/2702 [00:01<00:00, 1782.50it/s]
100%|██████████| 2702/2702 [00:25<00:00, 107.61it/s]
100%|██████████| 2702/2702 [00:05<00:00, 508.13it/s]


CPU times: user 2min 19s, sys: 1.41 s, total: 2min 21s
Wall time: 2min 20s


In [116]:
Gs_test  = graph_builder.transform(fold.X_test)

100%|██████████| 2702/2702 [00:01<00:00, 1722.08it/s]
100%|██████████| 2702/2702 [00:26<00:00, 102.88it/s]
100%|██████████| 2702/2702 [00:04<00:00, 569.50it/s]


In [117]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

import torch.optim as optim
from torch.utils.data import DataLoader

In [154]:
class SimpleClassifierGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, drop=.5):
        super(SimpleClassifierGCN, self).__init__()

        self.layers = nn.ModuleList([
            #GraphConv(in_dim, hidden_dim, activation=F.leaky_relu).to(torch.device('cuda:0')),
            #GraphConv(hidden_dim, hidden_dim, activation=F.leaky_relu).to(torch.device('cuda:0'))
            GraphConv(in_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0')),
            GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0'))
        ])
        
        self.lin = nn.Linear(hidden_dim, 1).to(torch.device('cuda:0'))
        
        self.norm = nn.BatchNorm1d( hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.classify = nn.Linear(hidden_dim, n_classes).to(torch.device('cuda:0'))

    def forward(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            #h = h.view(h.shape[0], -1)
        G.ndata['h'] = h
        w = self.lin( h )
        G.ndata['w'] = w
        hg = dgl.mean_nodes(G, 'h', weight='w')
        
        pred = self.classify( hg )
        #pred = torch.softmax(pred, 1)
        return pred
    def transform(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            #h = h.view(h.shape[0], -1)
        hg = self.norm( h )
        hg = self.drop( hg )
        hg = self.pooling(G, hg)
        return hg
class ClassifierGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, drop=.5):
        super(ClassifierGCN, self).__init__()

        self.encoder = nn.Linear(in_dim, hidden_dim).to(torch.device('cuda:0'))
        
        self.layers = nn.ModuleList([
            GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0')),
            GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0'))
        ])
        
        self.lin = nn.Linear(2*hidden_dim, 1).to(torch.device('cuda:0'))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device('cuda:0'))
        
        self.norm = nn.BatchNorm1d( hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.classify = nn.Linear(2*hidden_dim, n_classes).to(torch.device('cuda:0'))

    def forward(self, G):
        h = G.ndata['f']
        he = self.encoder(h)
        h = he
        for conv in self.layers:
            h = conv(G, h)
        hg = self.norm( h )
        hg = self.drop( hg )
        
        # CONCAT he E hg
        hg = torch.cat((hg,he), 1)
        hg = self.pooling(G, hg)
        
        pred = self.classify( hg )
        return pred
    def transform(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
        hg = self.norm( h )
        hg = self.drop( hg )
        hg = self.pooling(G, hg)
        return hg
    
class ClassifierGAT(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, n_heads=16, drop=.5):
        super(ClassifierGAT, self).__init__()

        self.layers = nn.ModuleList([
            GATConv(in_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=drop).to(torch.device('cuda:0')),
            GATConv(n_heads*hidden_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=drop).to(torch.device('cuda:0'))
        ])
        
        self.lin = nn.Linear(n_heads*hidden_dim, 1).to(torch.device('cuda:0'))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device('cuda:0'))
        
        self.norm = nn.BatchNorm1d( n_heads*hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.classify = nn.Linear(n_heads*hidden_dim, n_classes).to(torch.device('cuda:0'))

    def forward(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        hg = self.norm( h )
        hg = self.drop( hg )
        hg = self.pooling(G, hg)
        pred = self.classify( hg )
        return pred
    def transform(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        hg = self.norm( h )
        hg = self.drop( hg )
        hg = self.pooling(G, hg)
        return hg

In [155]:
a = torch.tensor( [[1,2,3,4,5],[5,4,3,2,1]] )
b = torch.tensor( [[-1,-2,-3,-4,-5],[-5,-4,-3,-2,-1]] )

In [156]:
def collate(samples):
    Gs_Fs, labels = map(list, zip(*samples))
    graphs = []
    for g, f in Gs_Fs:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g)
        g_dgl.ndata['f'] = torch.FloatTensor(f.A).to(torch.device('cuda:0'))
        g_dgl.to(torch.device('cuda:0'))
        graphs.append(g_dgl)
    batched_graph = dgl.batch(graphs)
    batched_graph.to(torch.device('cuda:0'))
    labels = torch.tensor(labels).to(torch.device('cuda:0'))
    return batched_graph, labels

In [157]:
class FocalLoss(nn.Module):
    # https://github.com/mbsariyildiz/focal-loss.pytorch
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)): self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = logpt.exp()

        if self.alpha is not None:
            if self.alpha.type() != input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0, target.data.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [180]:
PATH = 'best_param_simple_reut.pth'
n_epochs = 100
patience = 25
hidden_dim = 300
train_batch_size = 16
test_val_batch_size = 256

#model = SimpleClassifierGCN(len(graph_builder.vocab), hidden_dim, dataset.nclass, drop=.5).to(torch.device('cuda:0'))
#model = ClassifierGAT(len(graph_builder.vocab), hidden_dim, dataset.nclass, n_heads=2, drop=.5).to(torch.device('cuda:0'))
model = ClassifierGCN(len(graph_builder.vocab), hidden_dim, dataset.nclass, drop=.3).to(torch.device('cuda:0'))

In [181]:
loss_func = FocalLoss(gamma=2).to(torch.device('cuda:0'))
#loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))

#optimizer = optim.Adam( model.parameters(), lr=1e-2, weight_decay=1e-3)
optimizer = optim.AdamW( model.parameters(), lr=1e-2, weight_decay=1e-4)

#optimizer = optim.RMSprop( model.parameters(), lr=1e-2, weight_decay=1e-4)
#optimizer = optim.RMSprop( model.parameters(), lr=0.0001 )

model.train()
torch.cuda.synchronize()
epoch_losses = []

In [182]:
data_loader_val  = DataLoader(list(zip(Gs_val,  fold.y_val )), batch_size=test_val_batch_size,
                              shuffle=True, collate_fn=collate)

In [183]:
best_score = None
n_iters = 0

for epoch in range(n_epochs):
    data_loader = DataLoader(list(zip(Gs_train, fold.y_train)), batch_size=train_batch_size,
                             shuffle=True, collate_fn=collate)
    epoch_loss = 0
    with tqdm(total=len(data_loader.dataset), smoothing=0.) as pbar:
        t0 = time()
        total = 0
        correct = 0
        model.train()
        for i, (bg, label) in enumerate(data_loader):
            outputs = model(bg)
            probs_Y = torch.softmax(outputs, 1)
            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
            
            # Train eval phase
            total += label.size(0)
            correct += (sampled_Y == label).sum().item()
            
            # NN backprop phase
            loss = loss_func(outputs, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            
            del loss, outputs, bg, probs_Y, sampled_Y
            pbar.update( len(label) )
            pbar.set_description_str('iter {}, train acc {:.3f} train loss {:.2f}'.format(epoch, (correct/total), epoch_loss / (epoch + 1)))
        
        score_train = correct/total
    with tqdm(total=len(data_loader_val.dataset), smoothing=0.) as pbar:
        model.eval()
        total = 0
        correct = 0
        epoch_loss = 0.
        for bg, label in data_loader_val:
            with torch.no_grad():
                outputs = model(bg)
            
            probs_Y = torch.softmax(outputs, 1)
            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)

            # Validation eval phase
            total += label.size(0)
            correct += (sampled_Y == label).sum().item()
            
            #break
            
            del probs_Y, outputs, bg, sampled_Y
            pbar.update( label.size(0) )
            score_val = correct/total

            pbar.set_description_str('iter {}, val   acc {:.3f} ( over: {:.3} )'.format(epoch, score_val, score_val/score_train))
            
        #break
        pbar.set_description_str('iter {}, val  acc {:.3f} ( over: {:.3}/{} )'.format(epoch, score_val, score_val/score_train, n_iters))
        score = correct/total
        if best_score is None or score > best_score:
            torch.save(model, PATH)
            best_score = score
            n_iters = 0
        else:
            n_iters += 1
            if n_iters >= patience:
                print()
                print('BEST val acc {:.3f}'.format(best_score), end='\r')
                break
        pbar.set_description_str('iter {}, val   acc {:.3f} ( over: {:.3}/{} )'.format(epoch, score_val, score_val/score_train, n_iters))
        epoch_loss /= (epoch + 1)
        epoch_losses.append(epoch_loss)

iter 0, train acc 0.581 train loss 365.28: 100%|██████████| 7923/7923 [00:37<00:00, 209.66it/s]
iter 0, val   acc 0.602 ( over: 1.04 ):  66%|██████▋   | 1792/2702 [00:08<00:04, 206.59it/s]


KeyboardInterrupt: 

In [None]:
# leaky_relu

#ReLU
#BEST val acc 0.702 reut  w=5 | full_weight_prob
#BEST val acc 0.622 acm   w=5 | full_weight_prob

#BEST val acc 0.732 webkb w=5 | full_weight
#BEST val acc 0.700 reut  w=5 | full_weight
#BEST val acc 0.754 20ng  w=5 | full_weight

In [None]:
#BEST val acc 0.623: 
PATH = 'best_param_simple_acm.pth'
n_epochs = 100
patience = 10
hidden_dim = 300
train_batch_size = 16
test_val_batch_size = 256
loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))
optimizer = optim.Adam( model.parameters(), lr=1e-2, weight_decay=1e-3)

In [61]:
model = torch.load(PATH)
with torch.no_grad():
    total = 0
    correct = 0
    epoch_loss = 0.
    model.eval()
    for bg, label in data_loader_val:
        outputs = model(bg)
        probs_Y = torch.softmax(outputs, 1)
        sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
        #print(probs_Y.shape, sampled_Y.shape, label.shape)

        qtd_docs = label.size(0)
        qtd_correct = (sampled_Y == label).sum().item()
        total += qtd_docs
        correct += qtd_correct

        del probs_Y, outputs, bg, sampled_Y
        pbar.update( label.size(0) )
        score_val = correct/total

        print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
    #break

val acc 0.613 val loss 0.0 ( 3057/4983. over: 0.674 )

In [None]:
print( len(Gs_train), '+', len(Gs_val), end=' = ' )
Gs_train_val = Gs_train + Gs_val
print( len(Gs_train_val) )

print( len(fold.y_train), '+', len(fold.y_val), end=' = ' )
y_train_val = fold.y_train + fold.y_val
print( len(y_train_val) )

In [None]:
data_loader_train_val_t  = DataLoader(list(zip(Gs_train_val, y_train_val)), batch_size=test_val_batch_size,
                              shuffle=False, collate_fn=collate)
X_train_val_all = []
model.eval()
with torch.no_grad():
    with tqdm_notebook(total=len(data_loader_train_val_t.dataset), smoothing=0.) as pbar:
        for G, label in data_loader_train_val_t:
            X_train_val_t = model.transform( G ).cpu().numpy()
            X_train_val_all.append( X_train_val_t )
            pbar.update( len(label) )

In [None]:
X_train_val_all2 = np.concatenate( X_train_val_all )
X_train_val_all2

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

param = {'loss': 'squared_hinge', 'C': 1, 'verbose': 0,
         'intercept_scaling': 1, 'fit_intercept': True,
         'max_iter': 1000, 'penalty': 'l2', 'multi_class': 'ovr',
         'random_state': None, 'dual': False,'tol': 0.001,
         'class_weight': None}
estimator = LinearSVC(**param)
tunning = [{'C': 2.0 ** np.arange(-5, 9, 2)}]

gs = GridSearchCV(estimator, tunning,
                n_jobs=64, refit=False,
                cv=5, iid=True,
                verbose=2, scoring='f1_micro')

gs.fit( X_train_val_all2, y_train_val )

In [None]:
Gs_test  = graph_builder.transform(fold.X_test)
data_loader_test = DataLoader(list(zip(Gs_test, fold.y_test)), batch_size=test_val_batch_size,
                              shuffle=False, collate_fn=collate)
X_test_all = []
model.eval()
with torch.no_grad():
    with tqdm_notebook(total=len(data_loader_test.dataset), smoothing=0.) as pbar:
        for G, label in data_loader_test:
            X_test_t = model.transform( G ).cpu().numpy()
            X_test_all.append( X_test_t )
            pbar.update( len(label) )
X_test_all = np.concatenate( X_test_all )

In [None]:
lsvm = LinearSVC( **gs.best_params_ )
lsvm.fit( X_train_val_all2, y_train_val )

y_pred = lsvm.predict( X_test_all )

In [None]:
sum(y_pred == fold.y_test)/len(y_pred)

In [None]:
model = torch.load(PATH)
with torch.no_grad():
    total = 0
    correct = 0
    epoch_loss = 0.
    model.eval()
    for bg, label in data_loader_test:
        outputs = model(bg)
        probs_Y = torch.softmax(outputs, 1)
        sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
        #print(probs_Y.shape, sampled_Y.shape, label.shape)

        qtd_docs = label.size(0)
        qtd_correct = (sampled_Y == label).sum().item()
        total += qtd_docs
        correct += qtd_correct

        del probs_Y, outputs, bg, sampled_Y
        pbar.update( label.size(0) )
        score_val = correct/total

        print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
    #break

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

class FocalLoss(nn.Module):
    # https://github.com/mbsariyildiz/focal-loss.pytorch
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)): self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = logpt.exp()

        if self.alpha is not None:
            if self.alpha.type() != input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0, target.data.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

class ClassifierGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, drop=.5, device='cuda:0'):
        super(ClassifierGCN, self).__init__()

        self.layers = nn.ModuleList([
            GraphConv(in_dim, hidden_dim, activation=F.leaky_relu).to(torch.device(device)),
            GraphConv(hidden_dim, hidden_dim, activation=F.leaky_relu).to(torch.device(device))
        ])
        
        self.lin = nn.Linear(hidden_dim, 1).to(torch.device(device))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device(device))
        
        self.norm = nn.BatchNorm1d( hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.classify = nn.Linear(hidden_dim, n_classes).to(torch.device(device))

    def forward(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
        hg = self.norm( h )
        hg = self.drop( hg )
        hg = self.pooling(G, h)
        pred = self.classify( hg )
        return pred
    def transform(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
        hg = self.norm( h )
        hg = self.drop( hg )
        hg = self.pooling(G, h)
        return hg

PATH = 'best_param_simple.pth'
class TGA(BaseEstimator, TransformerMixin, ClassifierMixin):
    def __init__(self, hidden_dim=300,
                 drop=.5, n_epochs=100, patience=10, hidden_dim,
                 train_batch_size=16, transform_batch_size=256,
                 lr=1e-2, weight_decay=1e-4,
                 optim_func='adam', loss_func='focal', 
                 device='cuda:0', verbose=False):
        self.hidden_dim=hidden_dim
        self.drop=drop
        self.n_epochs=n_epochs
        self.patience=patience
        self.train_batch_size=train_batch_size
        self.transform_batch_size=transform_batch_size
        self.lr=lr
        self.weight_decay=weight_decay
        self.device=device
        
        if loss_func.lower() == 'focal':
            self.loss_func = FocalLoss().to(torch.device(device))  
        elif loss_func.lower() == 'cross_entropy':
            self.loss_func = nn.CrossEntropyLoss().to(torch.device(device))
        
        if optim_func.lower() == 'focal':
            self.optimizer = optim.Adam( model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        elif optim_func.lower() == 'rmsprop':
            self.optimizer = optim.RMSprop( model.parameters(), lr=self.lr, weight_decay=self.weight_decay )
        
    def fit(self, X_train, y_train, X_val=None, y_val=None):
        
        in_dim= #Tamanho Vocab
        self.nclass= #Quantidade de Classes
        
        model.train()
        torch.cuda.synchronize()
        
        self.model = ClassifierGCN(in_dim, self.hidden_dim, self.nclass, drop=self.drop).to(torch.device(self.device))
        
        best_score = None
        n_iters = 0

        for epoch in range(self.n_epochs):
            data_loader = DataLoader(list(zip(Gs_train, y_train)), batch_size=self.train_batch_size,
                                     shuffle=True, collate_fn=collate)
            epoch_loss = 0
            with tqdm_notebook(total=len(data_loader.dataset), smoothing=0.) as pbar:
                t0 = time()
                total = 0
                correct = 0
                model.train()
                for i, (bg, label) in enumerate(data_loader):
                    outputs = model(bg)
                    probs_Y = torch.softmax(outputs, 1)
                    sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)

                    # Train eval phase
                    total += label.size(0)
                    correct += (sampled_Y == label).sum().item()

                    # NN backprop phase
                    loss = loss_func(outputs, label)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.detach().item()

                    del loss, outputs, bg, probs_Y, sampled_Y
                    pbar.update( len(label) )
                    print('iter {}, train loss {:.2f} train acc: {:.3f}'.format(epoch, epoch_loss / (epoch + 1), (correct/total)), end='\r')

                score_train = correct/total
            with tqdm_notebook(total=len(data_loader_val.dataset), smoothing=0.) as pbar:
                model.eval()
                total = 0
                correct = 0
                epoch_loss = 0.
                for bg, label in data_loader_val:
                    with torch.no_grad():
                        outputs = model(bg)

                    probs_Y = torch.softmax(outputs, 1)
                    sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)

                    # Validation eval phase
                    total += label.size(0)
                    correct += (sampled_Y == label).sum().item()

                    #break

                    del probs_Y, outputs, bg, sampled_Y
                    pbar.update( label.size(0) )
                    score_val = correct/total

                    print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')

                #break
                print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
                score = correct/total
                if best_score is None or score > best_score:
                    torch.save(model, PATH)
                    best_score = score
                    n_iters = 0
                else:
                    n_iters += 1
                    if n_iters >= patience:
                        print()
                        print('BEST val acc {:.3f}'.format(best_score), end='\r')
                        break
                print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
                epoch_loss /= (epoch + 1)
                epoch_losses.append(epoch_loss)
            print()
        
        
        epoch_losses = []
    def transform(self, X):
        pass
    def predict(self, X):
        pass