In [1]:
from utils import Dataset, Graphsize
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from time import time
import numpy as np

In [2]:
dataset = Dataset('/home/mangaravite/Documents/datasets/topics/acm/')

In [3]:
fold = next(dataset.get_fold_instances(10))
fold._fields

('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val')

In [5]:
%%time
max_feats = 250000
graph_builder = Graphsize(w=5, verbose=True, max_feat=max_feats)
Gs_train = graph_builder.fit_transform(fold.X_train)
Gs_val   = graph_builder.transform(fold.X_val)
#Gs_test  = graph_builder.transform(fold.X_test)

100%|██████████| 19907/19907 [00:13<00:00, 1449.35it/s]
100%|██████████| 19907/19907 [00:05<00:00, 3861.63it/s]
100%|██████████| 19907/19907 [01:06<00:00, 299.39it/s]
100%|██████████| 19907/19907 [00:12<00:00, 1577.99it/s]
100%|██████████| 2495/2495 [00:00<00:00, 3950.38it/s]
100%|██████████| 2495/2495 [00:07<00:00, 325.67it/s]
100%|██████████| 2495/2495 [00:01<00:00, 1630.88it/s]

CPU times: user 1min 47s, sys: 932 ms, total: 1min 48s
Wall time: 1min 47s





In [6]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

import torch.optim as optim
from torch.utils.data import DataLoader

In [7]:
class Classifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_heads, n_classes, drop=0.5, k=2):
        super(Classifier, self).__init__()

        self.layers = nn.ModuleList([
            #GraphConv(in_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0')),
            #GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0'))])
            GATConv(in_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=drop).to(torch.device('cuda:0')),
            GATConv(n_heads*hidden_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=drop).to(torch.device('cuda:0'))
        ])
        self.lin = nn.Linear(n_heads*hidden_dim, 1).to(torch.device('cuda:0'))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device('cuda:0'))
        
        self.norm = nn.BatchNorm1d( n_heads*hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.conv = nn.Linear(n_heads*hidden_dim, k*hidden_dim).to(torch.device('cuda:0'))
        
        self.classify = nn.Linear(k*hidden_dim, n_classes).to(torch.device('cuda:0'))

    def forward(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        hg = self.pooling(G, h)
        #G.ndata['h'] = h
        #w = self.lin( h )
        #G.ndata['w'] = w
        #hg = dgl.mean_nodes(G, 'h', weight='w')
        hg = self.norm( hg )
        hg = self.drop( hg )
        hg = self.conv( hg )
        pred = self.classify( hg )
        return pred
    def transform(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        hg = self.pooling(G, h)
        hg = self.norm( hg )
        hg = self.drop( hg )
        hg = self.conv(hg)
        return hg
    def predict(self, G):
        hg = self.transform( G )
        pred = self.classify( hg )
        probs_Y = torch.softmax(pred, 1)
        sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
        return sampled_Y
        

In [8]:
def collate(samples):
    Gs_Fs, labels = map(list, zip(*samples))
    graphs = []
    for g, f in Gs_Fs:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g)
        g_dgl.ndata['f'] = torch.FloatTensor(f.A).to(torch.device('cuda:0'))
        g_dgl.to(torch.device('cuda:0'))
        graphs.append(g_dgl)
    batched_graph = dgl.batch(graphs)
    batched_graph.to(torch.device('cuda:0'))
    labels = torch.tensor(labels).to(torch.device('cuda:0'))
    return batched_graph, labels

In [108]:
class FocalLoss(nn.Module):
    # https://github.com/mbsariyildiz/focal-loss.pytorch
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)): self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = logpt.exp()

        if self.alpha is not None:
            if self.alpha.type() != input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0, target.data.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [109]:
data_loader_val  = DataLoader(list(zip(Gs_val,  fold.y_val )), batch_size=test_val_batch_size,
                              shuffle=True, collate_fn=collate)
best_score = None
for (noised,qtd_noised) in [ (False, 0), (True, 3), (True, 5) ]:
    for lr in [ .01, .001, .0001 ]:
        for drop in [ .1, .3, .5, .7 ]:
            for l2 in [ 5e-1, 5e-3, 5e-5 ]:
                # hyper-params
                PATH = 'best_param.pth'
                n_epochs = 100
                patience = 10
                hidden_dim = 300
                n_heads = 16
                train_batch_size = 16
                test_val_batch_size = 256

                model = Classifier(len(graph_builder.vocab), hidden_dim, n_heads, dataset.nclass, drop=drop).to(torch.device('cuda:0'))

                #loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))
                loss_func = FocalLoss().to(torch.device('cuda:0'))
                loss_eval_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))

                optimizer = optim.Adam( model.parameters(), lr=lr, weight_decay=l2)

                model.train()
                torch.cuda.synchronize()
                epoch_losses = []
                n_iters = 0
                
                print("####### lr:{:.3} drop:{:.3} l2:{:.3}".format(lr, drop, l2))
                for epoch in range(n_epochs):
                    data_loader = DataLoader(list(zip(Gs_train, fold.y_train)), batch_size=train_batch_size,
                                             shuffle=True, collate_fn=collate)
                    epoch_loss = 0
                    with tqdm_notebook(total=len(data_loader.dataset), smoothing=0., position=0) as pbar:
                        t0 = time()
                        total = 0
                        correct = 0
                        model.train()
                        for i, (bg, label) in enumerate(data_loader):
                            if noised and i % qtd_noised == 0:
                                np.random.shuffle(label)
                            outputs = model(bg)
                            probs_Y = torch.softmax(outputs, 1)
                            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
                            loss = loss_func(outputs, label)
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()
                            epoch_loss += loss.detach().item()

                            qtd_docs = label.size(0)
                            qtd_correct = (sampled_Y == label).sum().item()
                            total += qtd_docs
                            correct += qtd_correct

                            del loss, outputs, bg, probs_Y, sampled_Y
                            pbar.update( len(label) )
                            print('iter {}, train loss {:.2f} train acc: {:.3f}'.format(epoch, epoch_loss / (epoch + 1), (correct/total)), end='\r')
                        score_train = correct/total
                    with tqdm_notebook(total=len(data_loader_val.dataset), smoothing=0., position=0) as pbar:
                        with torch.no_grad():
                            total = 0
                            correct = 0
                            epoch_loss = 0
                            model.eval()
                            for bg, label in data_loader_val:
                                outputs = model(bg)
                                probs_Y = torch.softmax(outputs, 1)
                                sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
                                #print(probs_Y.shape, sampled_Y.shape, label.shape)
                                loss = loss_eval_func(outputs, label)
                                epoch_loss += loss.item()

                                qtd_docs = label.size(0)
                                qtd_correct = (sampled_Y == label).sum().item()
                                total += qtd_docs
                                correct += qtd_correct

                                del probs_Y, outputs, bg, sampled_Y
                                pbar.update( label.size(0) )
                                score_val = correct/total

                                print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
                                #break
                            #break
                            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
                            score = correct/total
                            if best_score is None or score > best_score:
                                torch.save(model, PATH)
                                best_score = score
                                n_iters = 0
                            else:
                                n_iters += 1
                                if n_iters >= patience:
                                    print()
                                    print('BEST val acc {:.3f}'.format(best_score), end='\r')
                                    break
                            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
                        epoch_loss /= (epoch + 1)
                        epoch_losses.append(epoch_loss)
                    print()

####### lr:0.01 drop:0.1 l2:0.5


iter 0, train loss 3492.64 train acc: 0.256


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.03/0 )

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.03/0 )



iter 1, train loss 1428.55 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/1 )



iter 2, train loss 952.25 train acc: 0.262


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.01/2 )



iter 3, train loss 714.11 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/3 )



iter 4, train loss 571.44 train acc: 0.263


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/4 )



iter 5, train loss 476.12 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 6, train loss 408.10 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/6 )



iter 7, train loss 357.10 train acc: 0.263


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/7 )



iter 8, train loss 317.40 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/8 )



iter 9, train loss 285.71 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/9 )



iter 10, train loss 259.71 train acc: 0.263


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/9 )
BEST val acc 0.263
####### lr:0.01 drop:0.1 l2:0.005


iter 0, train loss 5400.06 train acc: 0.254


val acc 0.258 val loss 21.1 ( 644/2495. over: 1.01/1 )



iter 1, train loss 1230.97 train acc: 0.271


val acc 0.214 val loss 29.7 ( 534/2495. over: 0.789/2 )



iter 2, train loss 823.04 train acc: 0.271


val acc 0.261 val loss 21.1 ( 651/2495. over: 0.964/3 )



iter 3, train loss 631.36 train acc: 0.262


val acc 0.190 val loss 22.6 ( 475/2495. over: 0.727/4 )



iter 4, train loss 533.55 train acc: 0.233


val acc 0.171 val loss 21.7 ( 427/2495. over: 0.733/5 )



iter 5, train loss 1772.59 train acc: 0.228


val acc 0.215 val loss 1.39e+02 ( 536/2495. over: 0.944/6 )



iter 6, train loss 396.59 train acc: 0.261


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.01/7 )



iter 7, train loss 313.01 train acc: 0.266


val acc 0.263 val loss 20.5 ( 656/2495. over: 0.988/8 )



iter 8, train loss 278.21 train acc: 0.266


val acc 0.259 val loss 21.0 ( 645/2495. over: 0.972/9 )



iter 9, train loss 247.84 train acc: 0.273


val acc 0.255 val loss 21.0 ( 636/2495. over: 0.934/9 )
BEST val acc 0.263
####### lr:0.01 drop:0.1 l2:5e-05


iter 0, train loss 9437.40 train acc: 0.272


val acc 0.239 val loss 27.1 ( 597/2495. over: 0.881/1 )



iter 1, train loss 1067.84 train acc: 0.432


val acc 0.203 val loss 24.6 ( 507/2495. over: 0.471/2 )



iter 2, train loss 684.70 train acc: 0.453


val acc 0.177 val loss 27.5 ( 442/2495. over: 0.391/3 )



iter 3, train loss 565.63 train acc: 0.419


val acc 0.138 val loss 28.4 ( 344/2495. over: 0.329/4 )



iter 4, train loss 2214.78 train acc: 0.353


val acc 0.201 val loss 28.1 ( 501/2495. over: 0.569/5 )



iter 5, train loss 340.77 train acc: 0.451


val acc 0.232 val loss 25.2 ( 579/2495. over: 0.515/6 )



iter 6, train loss 292.82 train acc: 0.445


val acc 0.219 val loss 26.3 ( 546/2495. over: 0.492/7 )



iter 7, train loss 2024.67 train acc: 0.343


val acc 0.176 val loss 28.8 ( 440/2495. over: 0.515/8 )



iter 8, train loss 239.09 train acc: 0.427


val acc 0.192 val loss 24.6 ( 480/2495. over: 0.45/9 )



iter 9, train loss 206.03 train acc: 0.446


val acc 0.222 val loss 24.4 ( 554/2495. over: 0.497/9 )
BEST val acc 0.263
####### lr:0.01 drop:0.3 l2:0.5


iter 0, train loss 3381.73 train acc: 0.254


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.04/1 )



iter 1, train loss 1428.57 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/2 )



iter 2, train loss 952.36 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/3 )



iter 3, train loss 714.25 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/4 )



iter 4, train loss 571.40 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.23 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.12 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.11 train acc: 0.263


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/8 )



iter 8, train loss 317.46 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.70 train acc: 0.262


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/9 )
BEST val acc 0.263
####### lr:0.01 drop:0.3 l2:0.005


iter 0, train loss 6403.12 train acc: 0.237


val acc 0.261 val loss 20.4 ( 652/2495. over: 1.1/1 )



iter 1, train loss 1258.01 train acc: 0.265


val acc 0.236 val loss 25.7 ( 589/2495. over: 0.89/2 )



iter 2, train loss 841.44 train acc: 0.264


val acc 0.263 val loss 20.3 ( 657/2495. over: 0.998/3 )



iter 3, train loss 632.56 train acc: 0.262


val acc 0.263 val loss 20.5 ( 657/2495. over: 1.01/4 )



iter 4, train loss 3762.84 train acc: 0.221


val acc 0.263 val loss 20.8 ( 655/2495. over: 1.19/5 )



iter 5, train loss 422.15 train acc: 0.261


val acc 0.193 val loss 20.5 ( 482/2495. over: 0.739/6 )



iter 6, train loss 360.43 train acc: 0.263


val acc 0.259 val loss 20.6 ( 646/2495. over: 0.986/7 )



iter 7, train loss 314.44 train acc: 0.264


val acc 0.260 val loss 20.7 ( 648/2495. over: 0.983/8 )



iter 8, train loss 280.38 train acc: 0.262


val acc 0.263 val loss 20.5 ( 656/2495. over: 1.0/9 )



iter 9, train loss 1374.24 train acc: 0.228


val acc 0.192 val loss 43.5 ( 479/2495. over: 0.842/9 )
BEST val acc 0.263
####### lr:0.01 drop:0.3 l2:5e-05


iter 0, train loss 12446.76 train acc: 0.205


val acc 0.241 val loss 24.5 ( 601/2495. over: 1.17/1 )



iter 1, train loss 1272.90 train acc: 0.297


val acc 0.262 val loss 24.5 ( 653/2495. over: 0.881/2 )



iter 2, train loss 805.58 train acc: 0.330


val acc 0.224 val loss 26.3 ( 558/2495. over: 0.678/3 )



iter 3, train loss 5202.36 train acc: 0.253


val acc 0.253 val loss 36.6 ( 631/2495. over: 1.0/4 )



iter 4, train loss 612.15 train acc: 0.260


val acc 0.239 val loss 22.1 ( 596/2495. over: 0.917/5 )



iter 5, train loss 404.29 train acc: 0.307


val acc 0.257 val loss 22.4 ( 640/2495. over: 0.836/6 )



iter 6, train loss 341.05 train acc: 0.311


val acc 0.238 val loss 24.7 ( 595/2495. over: 0.766/7 )



iter 7, train loss 3741.72 train acc: 0.236


val acc 0.195 val loss 43.6 ( 487/2495. over: 0.827/8 )



iter 8, train loss 326.47 train acc: 0.259


val acc 0.255 val loss 23.1 ( 636/2495. over: 0.985/9 )



iter 9, train loss 242.92 train acc: 0.309


val acc 0.234 val loss 22.7 ( 585/2495. over: 0.758/9 )
BEST val acc 0.263
####### lr:0.01 drop:0.5 l2:0.5


iter 0, train loss 3417.00 train acc: 0.255


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.03/1 )



iter 1, train loss 1428.39 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/2 )



iter 2, train loss 952.41 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/3 )



iter 3, train loss 714.18 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/4 )



iter 4, train loss 571.43 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.16 train acc: 0.262


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/6 )



iter 6, train loss 408.13 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/7 )



iter 7, train loss 357.12 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/8 )



iter 8, train loss 317.46 train acc: 0.263


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/9 )



iter 9, train loss 285.62 train acc: 0.263


val acc 0.189 val loss 23.0 ( 472/2495. over: 0.718/9 )
BEST val acc 0.263
####### lr:0.01 drop:0.5 l2:0.005


iter 0, train loss 6778.13 train acc: 0.223


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.18/1 )



iter 1, train loss 1263.67 train acc: 0.263


val acc 0.264 val loss 20.3 ( 658/2495. over: 1.0/0 )



iter 2, train loss 844.07 train acc: 0.264


val acc 0.263 val loss 20.4 ( 657/2495. over: 0.999/1 )



iter 3, train loss 652.64 train acc: 0.249


val acc 0.259 val loss 35.0 ( 647/2495. over: 1.04/2 )



iter 4, train loss 4115.37 train acc: 0.223


val acc 0.263 val loss 20.7 ( 655/2495. over: 1.18/3 )



iter 5, train loss 425.45 train acc: 0.256


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.03/4 )



iter 6, train loss 362.29 train acc: 0.260


val acc 0.263 val loss 20.4 ( 656/2495. over: 1.01/5 )



iter 7, train loss 316.52 train acc: 0.262


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.01/6 )



iter 8, train loss 282.84 train acc: 0.256


val acc 0.259 val loss 21.3 ( 645/2495. over: 1.01/7 )



iter 9, train loss 2409.20 train acc: 0.212


val acc 0.263 val loss 20.7 ( 656/2495. over: 1.24/8 )



iter 10, train loss 231.34 train acc: 0.263


val acc 0.248 val loss 21.7 ( 620/2495. over: 0.947/9 )



iter 11, train loss 210.75 train acc: 0.263


val acc 0.262 val loss 20.8 ( 654/2495. over: 0.996/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.5 l2:5e-05


iter 0, train loss 23177.93 train acc: 0.161


val acc 0.245 val loss 28.3 ( 612/2495. over: 1.52/1 )



iter 1, train loss 1631.39 train acc: 0.198


val acc 0.190 val loss 21.9 ( 474/2495. over: 0.959/2 )



iter 2, train loss 930.20 train acc: 0.208


val acc 0.224 val loss 22.2 ( 560/2495. over: 1.08/3 )



iter 3, train loss 756.52 train acc: 0.203


val acc 0.038 val loss 88.1 ( 96/2495. over: 0.19/4 )



iter 4, train loss 8140.30 train acc: 0.170


val acc 0.192 val loss 27.4 ( 478/2495. over: 1.13/5 )



iter 5, train loss 550.84 train acc: 0.200


val acc 0.139 val loss 25.3 ( 348/2495. over: 0.699/6 )



iter 6, train loss 403.16 train acc: 0.217


val acc 0.241 val loss 21.8 ( 602/2495. over: 1.11/7 )



iter 7, train loss 335.46 train acc: 0.228


val acc 0.175 val loss 23.0 ( 437/2495. over: 0.769/8 )



iter 8, train loss 4793.54 train acc: 0.185


val acc 0.255 val loss 78.4 ( 636/2495. over: 1.38/9 )



iter 9, train loss 533.77 train acc: 0.179


val acc 0.256 val loss 23.5 ( 638/2495. over: 1.43/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.7 l2:0.5


iter 0, train loss 3185.40 train acc: 0.255


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.03/1 )



iter 1, train loss 1428.35 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/2 )



iter 2, train loss 952.29 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/3 )



iter 3, train loss 714.24 train acc: 0.263


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/4 )



iter 4, train loss 571.38 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.23 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.09 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/7 )



iter 7, train loss 357.08 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.46 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.65 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.7 l2:0.005


iter 0, train loss 12668.91 train acc: 0.200


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.31/1 )



iter 1, train loss 1265.31 train acc: 0.261


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.01/2 )



iter 2, train loss 843.59 train acc: 0.263


val acc 0.263 val loss 20.3 ( 657/2495. over: 0.999/3 )



iter 3, train loss 633.75 train acc: 0.263


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.0/4 )



iter 4, train loss 8449.65 train acc: 0.203


val acc 0.264 val loss 27.8 ( 659/2495. over: 1.3/0 )



iter 5, train loss 476.01 train acc: 0.240


val acc 0.263 val loss 21.0 ( 657/2495. over: 1.1/1 )



iter 6, train loss 370.62 train acc: 0.246


val acc 0.264 val loss 21.1 ( 658/2495. over: 1.07/2 )



iter 7, train loss 321.05 train acc: 0.250


val acc 0.261 val loss 20.8 ( 651/2495. over: 1.04/3 )



iter 8, train loss 282.19 train acc: 0.260


val acc 0.261 val loss 21.1 ( 652/2495. over: 1.0/4 )



iter 9, train loss 253.61 train acc: 0.262


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.01/5 )



iter 10, train loss 3620.58 train acc: 0.204


val acc 0.263 val loss 32.3 ( 655/2495. over: 1.29/6 )



iter 11, train loss 224.33 train acc: 0.248


val acc 0.259 val loss 20.9 ( 646/2495. over: 1.04/7 )



iter 12, train loss 198.49 train acc: 0.249


val acc 0.263 val loss 20.7 ( 657/2495. over: 1.06/8 )



iter 13, train loss 182.38 train acc: 0.254


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.04/9 )



iter 14, train loss 168.96 train acc: 0.263


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.0/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.7 l2:5e-05


iter 0, train loss 46652.60 train acc: 0.149


val acc 0.190 val loss 70.1 ( 473/2495. over: 1.28/1 )



iter 1, train loss 3280.12 train acc: 0.171


val acc 0.258 val loss 23.5 ( 644/2495. over: 1.51/2 )



iter 2, train loss 1106.67 train acc: 0.186


val acc 0.142 val loss 26.6 ( 355/2495. over: 0.765/3 )



iter 3, train loss 10962.55 train acc: 0.159


val acc 0.179 val loss 3.14e+02 ( 446/2495. over: 1.12/4 )



iter 4, train loss 4680.55 train acc: 0.156


val acc 0.149 val loss 33.2 ( 373/2495. over: 0.958/5 )



iter 5, train loss 726.54 train acc: 0.172


val acc 0.240 val loss 45.2 ( 598/2495. over: 1.39/6 )



iter 6, train loss 914.82 train acc: 0.162


val acc 0.075 val loss 75.5 ( 188/2495. over: 0.465/7 )



iter 7, train loss 6368.24 train acc: 0.156


val acc 0.198 val loss 72.8 ( 493/2495. over: 1.26/8 )



iter 8, train loss 755.38 train acc: 0.162


val acc 0.253 val loss 26.0 ( 630/2495. over: 1.56/9 )



iter 9, train loss 495.26 train acc: 0.167


val acc 0.255 val loss 27.6 ( 635/2495. over: 1.53/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.1 l2:0.5


iter 0, train loss 2903.65 train acc: 0.255


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.03/1 )



iter 1, train loss 1428.41 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/2 )



iter 2, train loss 952.32 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/3 )



iter 3, train loss 713.97 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/4 )



iter 4, train loss 571.36 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.07 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.03 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.06 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.39 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.70 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.1 l2:0.005


iter 0, train loss 2301.84 train acc: 0.437


val acc 0.207 val loss 25.6 ( 516/2495. over: 0.474/1 )



iter 1, train loss 910.56 train acc: 0.535


val acc 0.204 val loss 26.3 ( 508/2495. over: 0.381/2 )



iter 2, train loss 553.62 train acc: 0.578


val acc 0.189 val loss 29.2 ( 471/2495. over: 0.326/3 )



iter 3, train loss 384.97 train acc: 0.612


val acc 0.179 val loss 29.9 ( 446/2495. over: 0.292/4 )



iter 4, train loss 290.82 train acc: 0.641


val acc 0.177 val loss 30.7 ( 442/2495. over: 0.276/5 )



iter 5, train loss 230.71 train acc: 0.659


val acc 0.178 val loss 29.3 ( 445/2495. over: 0.271/6 )



iter 6, train loss 192.45 train acc: 0.670


val acc 0.175 val loss 32.9 ( 437/2495. over: 0.262/7 )



iter 7, train loss 164.31 train acc: 0.679


val acc 0.179 val loss 32.5 ( 447/2495. over: 0.264/8 )



iter 8, train loss 142.00 train acc: 0.689


val acc 0.173 val loss 33.8 ( 432/2495. over: 0.251/9 )



iter 9, train loss 126.12 train acc: 0.697


val acc 0.174 val loss 33.4 ( 435/2495. over: 0.25/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.1 l2:5e-05


iter 0, train loss 2328.77 train acc: 0.467


val acc 0.186 val loss 38.6 ( 464/2495. over: 0.398/1 )



iter 1, train loss 798.57 train acc: 0.634


val acc 0.181 val loss 34.9 ( 451/2495. over: 0.285/2 )



iter 2, train loss 434.03 train acc: 0.702


val acc 0.192 val loss 33.5 ( 478/2495. over: 0.273/3 )



iter 3, train loss 286.35 train acc: 0.735


val acc 0.191 val loss 31.9 ( 476/2495. over: 0.26/4 )



iter 4, train loss 208.43 train acc: 0.754


val acc 0.184 val loss 40.8 ( 459/2495. over: 0.244/5 )



iter 5, train loss 164.40 train acc: 0.763


val acc 0.177 val loss 38.5 ( 441/2495. over: 0.232/6 )



iter 6, train loss 128.24 train acc: 0.783


val acc 0.194 val loss 37.2 ( 483/2495. over: 0.247/7 )



iter 7, train loss 105.55 train acc: 0.794


val acc 0.181 val loss 41.9 ( 451/2495. over: 0.228/8 )



iter 8, train loss 90.27 train acc: 0.805


val acc 0.197 val loss 33.0 ( 491/2495. over: 0.244/9 )



iter 9, train loss 76.37 train acc: 0.812


val acc 0.171 val loss 46.6 ( 426/2495. over: 0.21/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.3 l2:0.5


iter 0, train loss 2901.20 train acc: 0.259


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.02/1 )



iter 1, train loss 1428.19 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/2 )



iter 2, train loss 952.09 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/3 )



iter 3, train loss 714.24 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/4 )



iter 4, train loss 571.34 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.08 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.00 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.08 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.45 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.64 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.3 l2:0.005


iter 0, train loss 2406.18 train acc: 0.405


val acc 0.199 val loss 25.4 ( 496/2495. over: 0.491/1 )



iter 1, train loss 955.17 train acc: 0.499


val acc 0.214 val loss 24.8 ( 535/2495. over: 0.43/2 )



iter 2, train loss 588.59 train acc: 0.548


val acc 0.180 val loss 28.3 ( 449/2495. over: 0.328/3 )



iter 3, train loss 418.34 train acc: 0.574


val acc 0.186 val loss 29.6 ( 464/2495. over: 0.324/4 )



iter 4, train loss 318.10 train acc: 0.601


val acc 0.184 val loss 28.0 ( 458/2495. over: 0.306/5 )



iter 5, train loss 256.20 train acc: 0.616


val acc 0.179 val loss 31.4 ( 446/2495. over: 0.29/6 )



iter 6, train loss 214.74 train acc: 0.626


val acc 0.182 val loss 28.0 ( 454/2495. over: 0.291/7 )



iter 7, train loss 182.91 train acc: 0.639


val acc 0.183 val loss 30.4 ( 457/2495. over: 0.287/8 )



iter 8, train loss 161.83 train acc: 0.635


val acc 0.187 val loss 31.8 ( 467/2495. over: 0.295/9 )



iter 9, train loss 143.47 train acc: 0.646


val acc 0.172 val loss 32.4 ( 430/2495. over: 0.267/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.3 l2:5e-05


iter 0, train loss 2529.05 train acc: 0.420


val acc 0.193 val loss 38.3 ( 482/2495. over: 0.461/1 )



iter 1, train loss 981.82 train acc: 0.561


val acc 0.200 val loss 35.2 ( 500/2495. over: 0.357/2 )



iter 2, train loss 571.94 train acc: 0.621


val acc 0.200 val loss 33.2 ( 499/2495. over: 0.322/3 )



iter 3, train loss 406.02 train acc: 0.642


val acc 0.188 val loss 33.7 ( 469/2495. over: 0.293/4 )



iter 4, train loss 306.27 train acc: 0.661


val acc 0.191 val loss 32.4 ( 477/2495. over: 0.289/5 )



iter 5, train loss 248.54 train acc: 0.669


val acc 0.180 val loss 42.0 ( 450/2495. over: 0.27/6 )



iter 6, train loss 204.11 train acc: 0.683


val acc 0.196 val loss 39.9 ( 489/2495. over: 0.287/7 )



iter 7, train loss 171.70 train acc: 0.693


val acc 0.193 val loss 43.7 ( 482/2495. over: 0.279/8 )



iter 8, train loss 146.89 train acc: 0.699


val acc 0.192 val loss 39.7 ( 480/2495. over: 0.275/9 )



iter 9, train loss 127.29 train acc: 0.711


val acc 0.200 val loss 37.7 ( 499/2495. over: 0.281/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.5 l2:0.5


iter 0, train loss 2909.42 train acc: 0.256


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.03/1 )



iter 1, train loss 1428.31 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/2 )



iter 2, train loss 952.11 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/3 )



iter 3, train loss 714.11 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/4 )



iter 4, train loss 571.26 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.12 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.10 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.07 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.40 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.65 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.5 l2:0.005


iter 0, train loss 2453.14 train acc: 0.383


val acc 0.208 val loss 24.5 ( 519/2495. over: 0.543/1 )



iter 1, train loss 1042.54 train acc: 0.431


val acc 0.222 val loss 24.3 ( 555/2495. over: 0.516/2 )



iter 2, train loss 664.17 train acc: 0.469


val acc 0.198 val loss 25.1 ( 493/2495. over: 0.421/3 )



iter 3, train loss 476.67 train acc: 0.495


val acc 0.194 val loss 26.6 ( 484/2495. over: 0.392/4 )



iter 4, train loss 367.76 train acc: 0.519


val acc 0.201 val loss 25.0 ( 501/2495. over: 0.387/5 )



iter 5, train loss 293.39 train acc: 0.542


val acc 0.221 val loss 26.6 ( 552/2495. over: 0.408/6 )



iter 6, train loss 246.71 train acc: 0.559


val acc 0.200 val loss 25.8 ( 499/2495. over: 0.358/7 )



iter 7, train loss 210.90 train acc: 0.569


val acc 0.178 val loss 29.8 ( 445/2495. over: 0.313/8 )



iter 8, train loss 186.18 train acc: 0.577


val acc 0.170 val loss 27.2 ( 424/2495. over: 0.294/9 )



iter 9, train loss 165.37 train acc: 0.580


val acc 0.187 val loss 29.4 ( 467/2495. over: 0.322/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.5 l2:5e-05


iter 0, train loss 2647.15 train acc: 0.375


val acc 0.210 val loss 25.2 ( 525/2495. over: 0.561/1 )



iter 1, train loss 1156.90 train acc: 0.477


val acc 0.222 val loss 25.2 ( 553/2495. over: 0.464/2 )



iter 2, train loss 735.93 train acc: 0.513


val acc 0.210 val loss 35.7 ( 524/2495. over: 0.41/3 )



iter 3, train loss 533.38 train acc: 0.533


val acc 0.210 val loss 30.1 ( 523/2495. over: 0.393/4 )



iter 4, train loss 409.55 train acc: 0.550


val acc 0.214 val loss 29.8 ( 535/2495. over: 0.39/5 )



iter 5, train loss 330.82 train acc: 0.563


val acc 0.220 val loss 30.5 ( 549/2495. over: 0.391/6 )



iter 6, train loss 272.67 train acc: 0.573


val acc 0.192 val loss 31.4 ( 478/2495. over: 0.334/7 )



iter 7, train loss 233.09 train acc: 0.576


val acc 0.194 val loss 30.7 ( 484/2495. over: 0.337/8 )



iter 8, train loss 204.19 train acc: 0.580


val acc 0.204 val loss 36.4 ( 508/2495. over: 0.351/9 )



iter 9, train loss 178.11 train acc: 0.587


val acc 0.186 val loss 31.8 ( 465/2495. over: 0.317/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.7 l2:0.5


iter 0, train loss 2918.30 train acc: 0.255


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.03/1 )



iter 1, train loss 1428.37 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/2 )



iter 2, train loss 952.17 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/3 )



iter 3, train loss 714.06 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/4 )



iter 4, train loss 571.35 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.13 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.06 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.08 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.41 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.61 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.7 l2:0.005


iter 0, train loss 2643.73 train acc: 0.305


val acc 0.246 val loss 21.5 ( 615/2495. over: 0.807/1 )



iter 1, train loss 1154.56 train acc: 0.341


val acc 0.242 val loss 21.9 ( 605/2495. over: 0.711/2 )



iter 2, train loss 777.99 train acc: 0.325


val acc 0.250 val loss 20.8 ( 623/2495. over: 0.767/3 )



iter 3, train loss 583.53 train acc: 0.327


val acc 0.216 val loss 21.0 ( 538/2495. over: 0.66/4 )



iter 4, train loss 468.05 train acc: 0.324


val acc 0.242 val loss 21.4 ( 603/2495. over: 0.746/5 )



iter 5, train loss 391.12 train acc: 0.322


val acc 0.237 val loss 20.9 ( 592/2495. over: 0.738/6 )



iter 6, train loss 333.20 train acc: 0.329


val acc 0.212 val loss 21.1 ( 528/2495. over: 0.643/7 )



iter 7, train loss 289.03 train acc: 0.335


val acc 0.244 val loss 20.8 ( 610/2495. over: 0.729/8 )



iter 8, train loss 259.93 train acc: 0.324


val acc 0.204 val loss 21.3 ( 509/2495. over: 0.63/9 )



iter 9, train loss 232.70 train acc: 0.331


val acc 0.250 val loss 21.1 ( 623/2495. over: 0.755/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.7 l2:5e-05


iter 0, train loss 2969.91 train acc: 0.266


val acc 0.206 val loss 22.5 ( 514/2495. over: 0.773/1 )



iter 1, train loss 1507.00 train acc: 0.269


val acc 0.262 val loss 21.4 ( 654/2495. over: 0.975/2 )



iter 2, train loss 1030.83 train acc: 0.250


val acc 0.255 val loss 21.9 ( 635/2495. over: 1.02/3 )



iter 3, train loss 777.99 train acc: 0.246


val acc 0.250 val loss 22.0 ( 623/2495. over: 1.02/4 )



iter 4, train loss 618.35 train acc: 0.241


val acc 0.206 val loss 21.6 ( 515/2495. over: 0.857/5 )



iter 5, train loss 506.46 train acc: 0.246


val acc 0.251 val loss 21.3 ( 626/2495. over: 1.02/6 )



iter 6, train loss 423.84 train acc: 0.253


val acc 0.252 val loss 22.1 ( 628/2495. over: 0.995/7 )



iter 7, train loss 363.73 train acc: 0.258


val acc 0.257 val loss 21.8 ( 642/2495. over: 0.999/8 )



iter 8, train loss 313.73 train acc: 0.266


val acc 0.249 val loss 21.9 ( 621/2495. over: 0.937/9 )



iter 9, train loss 272.88 train acc: 0.277


val acc 0.231 val loss 21.9 ( 577/2495. over: 0.836/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.1 l2:0.5


iter 0, train loss 2783.04 train acc: 0.301


val acc 0.196 val loss 25.5 ( 490/2495. over: 0.652/1 )



iter 1, train loss 1374.44 train acc: 0.302


val acc 0.195 val loss 24.0 ( 486/2495. over: 0.644/2 )



iter 2, train loss 906.46 train acc: 0.321


val acc 0.218 val loss 23.7 ( 543/2495. over: 0.677/3 )



iter 3, train loss 685.44 train acc: 0.319


val acc 0.206 val loss 23.2 ( 514/2495. over: 0.646/4 )



iter 4, train loss 557.71 train acc: 0.302


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.872/5 )



iter 5, train loss 477.09 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.59 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.29 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.50 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.65 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.1 l2:0.005


iter 0, train loss 2080.19 train acc: 0.498


val acc 0.182 val loss 29.7 ( 454/2495. over: 0.365/1 )



iter 1, train loss 645.77 train acc: 0.689


val acc 0.168 val loss 34.4 ( 420/2495. over: 0.244/2 )



iter 2, train loss 363.57 train acc: 0.730


val acc 0.166 val loss 40.4 ( 413/2495. over: 0.227/3 )



iter 3, train loss 250.91 train acc: 0.752


val acc 0.163 val loss 40.7 ( 407/2495. over: 0.217/4 )



iter 4, train loss 184.63 train acc: 0.776


val acc 0.163 val loss 41.9 ( 407/2495. over: 0.21/5 )



iter 5, train loss 136.46 train acc: 0.801


val acc 0.179 val loss 42.7 ( 447/2495. over: 0.224/6 )



iter 6, train loss 103.59 train acc: 0.822


val acc 0.164 val loss 45.7 ( 410/2495. over: 0.2/7 )



iter 7, train loss 81.30 train acc: 0.844


val acc 0.165 val loss 44.2 ( 411/2495. over: 0.195/8 )



iter 8, train loss 62.09 train acc: 0.868


val acc 0.170 val loss 46.3 ( 423/2495. over: 0.195/9 )



iter 9, train loss 52.37 train acc: 0.873


val acc 0.164 val loss 46.6 ( 409/2495. over: 0.188/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.1 l2:5e-05


iter 0, train loss 2072.40 train acc: 0.509


val acc 0.166 val loss 35.6 ( 415/2495. over: 0.327/1 )



iter 1, train loss 510.24 train acc: 0.759


val acc 0.168 val loss 43.8 ( 419/2495. over: 0.221/2 )



iter 2, train loss 220.35 train acc: 0.846


val acc 0.166 val loss 55.9 ( 414/2495. over: 0.196/3 )



iter 3, train loss 122.51 train acc: 0.886


val acc 0.156 val loss 48.8 ( 390/2495. over: 0.177/4 )



iter 4, train loss 86.36 train acc: 0.899


val acc 0.167 val loss 60.5 ( 416/2495. over: 0.185/5 )



iter 5, train loss 58.04 train acc: 0.920


val acc 0.168 val loss 66.1 ( 420/2495. over: 0.183/6 )



iter 6, train loss 45.02 train acc: 0.927


val acc 0.160 val loss 73.4 ( 400/2495. over: 0.173/7 )



iter 7, train loss 36.80 train acc: 0.931


val acc 0.174 val loss 68.5 ( 435/2495. over: 0.187/8 )



iter 8, train loss 30.28 train acc: 0.937


val acc 0.167 val loss 75.1 ( 417/2495. over: 0.178/9 )



iter 9, train loss 24.58 train acc: 0.944


val acc 0.166 val loss 71.3 ( 415/2495. over: 0.176/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.3 l2:0.5


iter 0, train loss 2786.08 train acc: 0.296


val acc 0.166 val loss 24.7 ( 414/2495. over: 0.561/1 )



iter 1, train loss 1376.97 train acc: 0.305


val acc 0.195 val loss 23.9 ( 487/2495. over: 0.639/2 )



iter 2, train loss 921.01 train acc: 0.312


val acc 0.213 val loss 24.0 ( 532/2495. over: 0.683/3 )



iter 3, train loss 693.00 train acc: 0.307


val acc 0.263 val loss 23.1 ( 657/2495. over: 0.858/4 )



iter 4, train loss 571.56 train acc: 0.266


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.989/5 )



iter 5, train loss 476.77 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.38 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.21 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.43 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.68 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.3 l2:0.005


iter 0, train loss 2153.89 train acc: 0.485


val acc 0.181 val loss 32.5 ( 452/2495. over: 0.374/1 )



iter 1, train loss 704.55 train acc: 0.659


val acc 0.172 val loss 34.1 ( 429/2495. over: 0.261/2 )



iter 2, train loss 397.74 train acc: 0.711


val acc 0.172 val loss 35.7 ( 429/2495. over: 0.242/3 )



iter 3, train loss 279.98 train acc: 0.724


val acc 0.169 val loss 36.9 ( 421/2495. over: 0.233/4 )



iter 4, train loss 210.74 train acc: 0.739


val acc 0.172 val loss 37.6 ( 429/2495. over: 0.233/5 )



iter 5, train loss 170.83 train acc: 0.743


val acc 0.165 val loss 39.2 ( 412/2495. over: 0.222/6 )



iter 6, train loss 132.46 train acc: 0.770


val acc 0.160 val loss 39.4 ( 400/2495. over: 0.208/7 )



iter 7, train loss 106.21 train acc: 0.792


val acc 0.168 val loss 43.1 ( 420/2495. over: 0.213/8 )



iter 8, train loss 85.81 train acc: 0.812


val acc 0.174 val loss 42.6 ( 434/2495. over: 0.214/9 )



iter 9, train loss 70.91 train acc: 0.825


val acc 0.170 val loss 42.3 ( 424/2495. over: 0.206/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.3 l2:5e-05


iter 0, train loss 2145.44 train acc: 0.490


val acc 0.183 val loss 30.0 ( 457/2495. over: 0.374/1 )



iter 1, train loss 593.23 train acc: 0.717


val acc 0.176 val loss 36.3 ( 438/2495. over: 0.245/2 )



iter 2, train loss 281.42 train acc: 0.802


val acc 0.174 val loss 45.8 ( 435/2495. over: 0.217/3 )



iter 3, train loss 168.69 train acc: 0.840


val acc 0.170 val loss 50.9 ( 424/2495. over: 0.202/4 )



iter 4, train loss 115.69 train acc: 0.862


val acc 0.167 val loss 53.3 ( 417/2495. over: 0.194/5 )



iter 5, train loss 82.71 train acc: 0.881


val acc 0.169 val loss 48.7 ( 422/2495. over: 0.192/6 )



iter 6, train loss 63.07 train acc: 0.894


val acc 0.173 val loss 66.8 ( 432/2495. over: 0.194/7 )



iter 7, train loss 51.54 train acc: 0.905


val acc 0.164 val loss 63.5 ( 408/2495. over: 0.181/8 )



iter 8, train loss 42.73 train acc: 0.909


val acc 0.164 val loss 71.3 ( 409/2495. over: 0.18/9 )



iter 9, train loss 34.50 train acc: 0.918


val acc 0.181 val loss 70.2 ( 451/2495. over: 0.197/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.5 l2:0.5


iter 0, train loss 2827.43 train acc: 0.275


val acc 0.159 val loss 23.8 ( 397/2495. over: 0.578/1 )



iter 1, train loss 1404.28 train acc: 0.284


val acc 0.222 val loss 23.2 ( 553/2495. over: 0.78/2 )



iter 2, train loss 933.68 train acc: 0.294


val acc 0.263 val loss 23.1 ( 657/2495. over: 0.895/3 )



iter 3, train loss 716.62 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/4 )



iter 4, train loss 572.40 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.52 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.26 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.19 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.42 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.69 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.5 l2:0.005


iter 0, train loss 2284.50 train acc: 0.452


val acc 0.162 val loss 28.7 ( 405/2495. over: 0.359/1 )



iter 1, train loss 781.40 train acc: 0.620


val acc 0.175 val loss 32.7 ( 436/2495. over: 0.282/2 )



iter 2, train loss 453.26 train acc: 0.670


val acc 0.180 val loss 32.1 ( 449/2495. over: 0.268/3 )



iter 3, train loss 321.78 train acc: 0.681


val acc 0.167 val loss 36.1 ( 417/2495. over: 0.245/4 )



iter 4, train loss 250.66 train acc: 0.687


val acc 0.164 val loss 36.7 ( 409/2495. over: 0.239/5 )



iter 5, train loss 200.54 train acc: 0.696


val acc 0.166 val loss 39.4 ( 414/2495. over: 0.239/6 )



iter 6, train loss 167.67 train acc: 0.708


val acc 0.179 val loss 33.3 ( 447/2495. over: 0.253/7 )



iter 7, train loss 140.15 train acc: 0.717


val acc 0.161 val loss 38.5 ( 401/2495. over: 0.224/8 )



iter 8, train loss 119.26 train acc: 0.733


val acc 0.165 val loss 35.0 ( 411/2495. over: 0.225/9 )



iter 9, train loss 103.46 train acc: 0.740


val acc 0.172 val loss 35.6 ( 430/2495. over: 0.233/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.5 l2:5e-05


iter 0, train loss 2278.96 train acc: 0.454


val acc 0.174 val loss 29.3 ( 433/2495. over: 0.382/1 )



iter 1, train loss 713.66 train acc: 0.656


val acc 0.180 val loss 32.6 ( 449/2495. over: 0.274/2 )



iter 2, train loss 364.07 train acc: 0.739


val acc 0.163 val loss 40.9 ( 407/2495. over: 0.221/3 )



iter 3, train loss 234.82 train acc: 0.776


val acc 0.167 val loss 39.1 ( 416/2495. over: 0.215/4 )



iter 4, train loss 165.76 train acc: 0.804


val acc 0.174 val loss 37.0 ( 435/2495. over: 0.217/5 )



iter 5, train loss 121.61 train acc: 0.825


val acc 0.187 val loss 34.6 ( 467/2495. over: 0.227/6 )



iter 6, train loss 98.05 train acc: 0.832


val acc 0.179 val loss 47.1 ( 446/2495. over: 0.215/7 )



iter 7, train loss 79.87 train acc: 0.843


val acc 0.175 val loss 41.0 ( 436/2495. over: 0.207/8 )



iter 8, train loss 67.55 train acc: 0.855


val acc 0.173 val loss 51.7 ( 431/2495. over: 0.202/9 )



iter 9, train loss 55.65 train acc: 0.865


val acc 0.166 val loss 56.7 ( 414/2495. over: 0.192/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.7 l2:0.5


iter 0, train loss 2907.48 train acc: 0.243


val acc 0.207 val loss 23.4 ( 516/2495. over: 0.852/1 )



iter 1, train loss 1448.21 train acc: 0.256


val acc 0.263 val loss 23.2 ( 657/2495. over: 1.03/2 )



iter 2, train loss 959.13 train acc: 0.264


val acc 0.263 val loss 23.1 ( 657/2495. over: 0.999/3 )



iter 3, train loss 716.80 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/4 )



iter 4, train loss 572.41 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/5 )



iter 5, train loss 476.50 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/6 )



iter 6, train loss 408.29 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/7 )



iter 7, train loss 357.12 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/8 )



iter 8, train loss 317.46 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/9 )



iter 9, train loss 285.68 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.999/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.7 l2:0.005


iter 0, train loss 2651.56 train acc: 0.359


val acc 0.200 val loss 23.0 ( 500/2495. over: 0.559/1 )



iter 1, train loss 997.43 train acc: 0.507


val acc 0.195 val loss 23.1 ( 486/2495. over: 0.384/2 )



iter 2, train loss 577.38 train acc: 0.567


val acc 0.186 val loss 26.6 ( 463/2495. over: 0.327/3 )



iter 3, train loss 404.25 train acc: 0.594


val acc 0.186 val loss 26.7 ( 464/2495. over: 0.313/4 )



iter 4, train loss 310.29 train acc: 0.610


val acc 0.183 val loss 26.1 ( 456/2495. over: 0.3/5 )



iter 5, train loss 250.96 train acc: 0.617


val acc 0.184 val loss 26.0 ( 460/2495. over: 0.299/6 )



iter 6, train loss 214.58 train acc: 0.621


val acc 0.192 val loss 26.9 ( 480/2495. over: 0.31/7 )



iter 7, train loss 184.21 train acc: 0.625


val acc 0.183 val loss 29.0 ( 457/2495. over: 0.293/8 )



iter 8, train loss 161.81 train acc: 0.628


val acc 0.183 val loss 29.7 ( 457/2495. over: 0.292/9 )



iter 9, train loss 142.08 train acc: 0.639


val acc 0.174 val loss 29.4 ( 433/2495. over: 0.272/9 )
BEST val acc 0.264
####### lr:0.0001 drop:0.7 l2:5e-05


iter 0, train loss 2708.67 train acc: 0.350


val acc 0.225 val loss 22.1 ( 561/2495. over: 0.643/1 )



iter 1, train loss 1004.34 train acc: 0.509


val acc 0.219 val loss 22.5 ( 547/2495. over: 0.431/2 )



iter 2, train loss 570.64 train acc: 0.584


val acc 0.218 val loss 22.6 ( 545/2495. over: 0.374/3 )



iter 3, train loss 373.36 train acc: 0.633


val acc 0.199 val loss 26.3 ( 497/2495. over: 0.315/4 )



iter 4, train loss 274.47 train acc: 0.664


val acc 0.196 val loss 27.7 ( 489/2495. over: 0.295/5 )



iter 5, train loss 215.97 train acc: 0.679


val acc 0.184 val loss 29.1 ( 460/2495. over: 0.272/6 )



iter 6, train loss 172.94 train acc: 0.701


val acc 0.202 val loss 27.2 ( 503/2495. over: 0.287/7 )



iter 7, train loss 144.63 train acc: 0.715


val acc 0.192 val loss 30.2 ( 480/2495. over: 0.269/8 )



iter 8, train loss 123.77 train acc: 0.725


val acc 0.184 val loss 30.8 ( 459/2495. over: 0.254/9 )



iter 9, train loss 108.97 train acc: 0.729


val acc 0.194 val loss 28.9 ( 484/2495. over: 0.266/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.1 l2:0.5


iter 0, train loss 3541.11 train acc: 0.255


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.03/1 )



iter 1, train loss 1428.72 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.998/2 )



iter 2, train loss 952.30 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.999/3 )



iter 3, train loss 714.01 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.996/4 )



iter 4, train loss 571.41 train acc: 0.263


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/5 )



iter 5, train loss 475.92 train acc: 0.265


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.992/6 )



iter 6, train loss 408.63 train acc: 0.259


val acc 0.263 val loss 23.1 ( 657/2495. over: 1.02/7 )



iter 7, train loss 357.09 train acc: 0.264


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.997/8 )



iter 8, train loss 317.26 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.996/9 )



iter 9, train loss 285.79 train acc: 0.261


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.01/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.1 l2:0.005


iter 0, train loss 6367.15 train acc: 0.235


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.12/1 )



iter 1, train loss 1263.27 train acc: 0.263


val acc 0.253 val loss 23.2 ( 632/2495. over: 0.965/2 )



iter 2, train loss 846.20 train acc: 0.260


val acc 0.263 val loss 20.3 ( 657/2495. over: 1.01/3 )



iter 3, train loss 636.61 train acc: 0.260


val acc 0.263 val loss 20.6 ( 657/2495. over: 1.01/4 )



iter 4, train loss 1713.25 train acc: 0.218


val acc 0.263 val loss 20.6 ( 655/2495. over: 1.21/5 )



iter 5, train loss 422.75 train acc: 0.264


val acc 0.263 val loss 20.3 ( 657/2495. over: 0.997/6 )



iter 6, train loss 362.05 train acc: 0.256


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.03/7 )



iter 7, train loss 315.85 train acc: 0.263


val acc 0.263 val loss 20.3 ( 657/2495. over: 1.0/8 )



iter 8, train loss 281.79 train acc: 0.262


val acc 0.264 val loss 20.5 ( 659/2495. over: 1.01/9 )



iter 9, train loss 1178.66 train acc: 0.234


val acc 0.260 val loss 21.0 ( 649/2495. over: 1.11/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.1 l2:5e-05


iter 0, train loss 8956.16 train acc: 0.197


val acc 0.219 val loss 24.8 ( 547/2495. over: 1.11/1 )



iter 1, train loss 1261.29 train acc: 0.269


val acc 0.226 val loss 21.7 ( 565/2495. over: 0.843/2 )



iter 2, train loss 821.99 train acc: 0.284


val acc 0.255 val loss 21.8 ( 636/2495. over: 0.897/3 )



iter 3, train loss 677.76 train acc: 0.243


val acc 0.262 val loss 29.2 ( 654/2495. over: 1.08/4 )



iter 4, train loss 2734.67 train acc: 0.222


val acc 0.255 val loss 21.4 ( 636/2495. over: 1.15/5 )



iter 5, train loss 415.07 train acc: 0.275


val acc 0.262 val loss 21.2 ( 653/2495. over: 0.952/6 )



iter 6, train loss 351.05 train acc: 0.281


val acc 0.246 val loss 21.3 ( 615/2495. over: 0.878/7 )



iter 7, train loss 315.31 train acc: 0.261


val acc 0.237 val loss 27.6 ( 591/2495. over: 0.908/8 )



iter 8, train loss 1620.62 train acc: 0.223


val acc 0.220 val loss 22.8 ( 548/2495. over: 0.985/9 )



iter 9, train loss 251.00 train acc: 0.267


val acc 0.198 val loss 21.1 ( 493/2495. over: 0.739/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.3 l2:0.5


iter 0, train loss 3320.19 train acc: 0.257


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.02/1 )



iter 1, train loss 1428.88 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/2 )



iter 2, train loss 952.14 train acc: 0.262


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/3 )



iter 3, train loss 714.14 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.996/4 )



iter 4, train loss 572.05 train acc: 0.258


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.02/5 )



iter 5, train loss 476.01 train acc: 0.267


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.984/6 )



iter 6, train loss 408.24 train acc: 0.262


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/7 )



iter 7, train loss 356.61 train acc: 0.270


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.975/8 )



iter 8, train loss 317.27 train acc: 0.267


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.986/9 )



iter 9, train loss 285.70 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.3 l2:0.005


iter 0, train loss 6587.94 train acc: 0.231


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.14/1 )



iter 1, train loss 1264.25 train acc: 0.260


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.01/2 )



iter 2, train loss 844.64 train acc: 0.264


val acc 0.259 val loss 21.0 ( 647/2495. over: 0.983/3 )



iter 3, train loss 639.52 train acc: 0.254


val acc 0.240 val loss 24.2 ( 598/2495. over: 0.945/4 )



iter 4, train loss 3211.76 train acc: 0.218


val acc 0.263 val loss 21.3 ( 655/2495. over: 1.2/5 )



iter 5, train loss 423.33 train acc: 0.263


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.0/6 )



iter 6, train loss 361.83 train acc: 0.264


val acc 0.263 val loss 20.3 ( 657/2495. over: 0.996/7 )



iter 7, train loss 316.06 train acc: 0.263


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.0/8 )



iter 8, train loss 281.98 train acc: 0.263


val acc 0.189 val loss 20.7 ( 471/2495. over: 0.718/9 )



iter 9, train loss 1387.45 train acc: 0.223


val acc 0.263 val loss 20.5 ( 657/2495. over: 1.18/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.3 l2:5e-05


iter 0, train loss 13332.01 train acc: 0.176


val acc 0.198 val loss 24.7 ( 494/2495. over: 1.13/1 )



iter 1, train loss 1364.10 train acc: 0.232


val acc 0.131 val loss 23.5 ( 327/2495. over: 0.565/2 )



iter 2, train loss 863.83 train acc: 0.238


val acc 0.233 val loss 21.2 ( 582/2495. over: 0.982/3 )



iter 3, train loss 936.32 train acc: 0.222


val acc 0.144 val loss 6.81e+02 ( 359/2495. over: 0.647/4 )



iter 4, train loss 4520.51 train acc: 0.191


val acc 0.203 val loss 23.4 ( 507/2495. over: 1.07/5 )



iter 5, train loss 448.41 train acc: 0.231


val acc 0.259 val loss 21.2 ( 646/2495. over: 1.12/6 )



iter 6, train loss 365.34 train acc: 0.250


val acc 0.258 val loss 21.7 ( 644/2495. over: 1.03/7 )



iter 7, train loss 320.11 train acc: 0.250


val acc 0.175 val loss 22.9 ( 437/2495. over: 0.701/8 )



iter 8, train loss 3041.54 train acc: 0.190


val acc 0.264 val loss 28.6 ( 658/2495. over: 1.39/9 )



iter 9, train loss 295.16 train acc: 0.211


val acc 0.169 val loss 21.6 ( 421/2495. over: 0.801/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.5 l2:0.5


iter 0, train loss 3385.39 train acc: 0.252


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.04/1 )



iter 1, train loss 1428.51 train acc: 0.262


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.01/2 )



iter 2, train loss 951.85 train acc: 0.265


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.992/3 )



iter 3, train loss 713.72 train acc: 0.267


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.985/4 )



iter 4, train loss 571.46 train acc: 0.261


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.01/5 )



iter 5, train loss 476.07 train acc: 0.265


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.993/6 )



iter 6, train loss 408.21 train acc: 0.258


val acc 0.263 val loss 23.1 ( 657/2495. over: 1.02/7 )



iter 7, train loss 357.23 train acc: 0.265


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.993/8 )



iter 8, train loss 317.70 train acc: 0.262


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.01/9 )



iter 9, train loss 285.70 train acc: 0.260


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.01/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.5 l2:0.005


iter 0, train loss 7207.67 train acc: 0.222


val acc 0.263 val loss 20.5 ( 657/2495. over: 1.18/1 )



iter 1, train loss 1269.51 train acc: 0.257


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.02/2 )



iter 2, train loss 844.94 train acc: 0.260


val acc 0.263 val loss 20.5 ( 657/2495. over: 1.01/3 )



iter 3, train loss 5004.26 train acc: 0.240


val acc 0.255 val loss 6.38e+02 ( 635/2495. over: 1.06/4 )



iter 4, train loss 1029.51 train acc: 0.238


val acc 0.263 val loss 20.5 ( 657/2495. over: 1.11/5 )



iter 5, train loss 423.92 train acc: 0.259


val acc 0.264 val loss 20.4 ( 658/2495. over: 1.02/6 )



iter 6, train loss 362.74 train acc: 0.257


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.02/7 )



iter 7, train loss 316.52 train acc: 0.264


val acc 0.263 val loss 20.3 ( 657/2495. over: 0.998/8 )



iter 8, train loss 1135.86 train acc: 0.245


val acc 0.191 val loss 2.13e+03 ( 477/2495. over: 0.779/9 )



iter 9, train loss 1314.01 train acc: 0.233


val acc 0.244 val loss 22.2 ( 609/2495. over: 1.05/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.5 l2:5e-05


iter 0, train loss 22235.45 train acc: 0.154


val acc 0.254 val loss 33.7 ( 634/2495. over: 1.65/1 )



iter 1, train loss 1541.29 train acc: 0.203


val acc 0.162 val loss 22.6 ( 405/2495. over: 0.799/2 )



iter 2, train loss 933.18 train acc: 0.213


val acc 0.254 val loss 21.7 ( 634/2495. over: 1.19/3 )



iter 3, train loss 1071.10 train acc: 0.196


val acc 0.082 val loss 1.45e+03 ( 205/2495. over: 0.419/4 )



iter 4, train loss 7684.93 train acc: 0.159


val acc 0.257 val loss 25.9 ( 641/2495. over: 1.61/5 )



iter 5, train loss 544.30 train acc: 0.194


val acc 0.260 val loss 22.2 ( 649/2495. over: 1.34/6 )



iter 6, train loss 395.77 train acc: 0.214


val acc 0.259 val loss 21.4 ( 647/2495. over: 1.21/7 )



iter 7, train loss 331.82 train acc: 0.221


val acc 0.256 val loss 21.4 ( 639/2495. over: 1.16/8 )



iter 8, train loss 4560.50 train acc: 0.177


val acc 0.241 val loss 92.5 ( 602/2495. over: 1.36/9 )



iter 9, train loss 581.39 train acc: 0.175


val acc 0.262 val loss 23.0 ( 653/2495. over: 1.5/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.7 l2:0.5


iter 0, train loss 3262.26 train acc: 0.252


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.05/1 )



iter 1, train loss 1427.24 train acc: 0.268


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.983/2 )



iter 2, train loss 951.96 train acc: 0.264


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.996/3 )



iter 3, train loss 714.98 train acc: 0.262


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.0/4 )



iter 4, train loss 570.74 train acc: 0.267


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.988/5 )



iter 5, train loss 475.97 train acc: 0.266


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.988/6 )



iter 6, train loss 408.19 train acc: 0.261


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.01/7 )



iter 7, train loss 357.35 train acc: 0.259


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.02/8 )



iter 8, train loss 317.52 train acc: 0.262


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.01/9 )



iter 9, train loss 286.01 train acc: 0.262


val acc 0.263 val loss 23.1 ( 657/2495. over: 1.0/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.7 l2:0.005


iter 0, train loss 11804.25 train acc: 0.190


val acc 0.192 val loss 20.5 ( 479/2495. over: 1.01/1 )



iter 1, train loss 1270.62 train acc: 0.260


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.01/2 )



iter 2, train loss 845.96 train acc: 0.264


val acc 0.263 val loss 20.4 ( 657/2495. over: 0.999/3 )



iter 3, train loss 636.95 train acc: 0.260


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.01/4 )



iter 4, train loss 8135.86 train acc: 0.190


val acc 0.264 val loss 23.5 ( 658/2495. over: 1.39/5 )



iter 5, train loss 455.62 train acc: 0.245


val acc 0.205 val loss 20.9 ( 511/2495. over: 0.834/6 )



iter 6, train loss 371.81 train acc: 0.246


val acc 0.263 val loss 20.5 ( 657/2495. over: 1.07/7 )



iter 7, train loss 320.80 train acc: 0.252


val acc 0.263 val loss 20.4 ( 657/2495. over: 1.04/8 )



iter 8, train loss 282.09 train acc: 0.265


val acc 0.264 val loss 20.4 ( 658/2495. over: 0.997/9 )



iter 9, train loss 3379.00 train acc: 0.226


val acc 0.262 val loss 2.25e+02 ( 654/2495. over: 1.16/9 )
BEST val acc 0.264
####### lr:0.01 drop:0.7 l2:5e-05


iter 0, train loss 46773.45 train acc: 0.150


val acc 0.257 val loss 41.4 ( 642/2495. over: 1.71/1 )



iter 1, train loss 3074.51 train acc: 0.170


val acc 0.258 val loss 20.7 ( 644/2495. over: 1.52/2 )



iter 2, train loss 1111.15 train acc: 0.180


val acc 0.142 val loss 22.6 ( 354/2495. over: 0.788/3 )



iter 3, train loss 10942.88 train acc: 0.161


val acc 0.159 val loss 2.89e+02 ( 396/2495. over: 0.988/4 )



iter 4, train loss 5561.65 train acc: 0.161


val acc 0.195 val loss 29.0 ( 487/2495. over: 1.21/5 )



iter 5, train loss 709.84 train acc: 0.173


val acc 0.189 val loss 32.5 ( 472/2495. over: 1.09/6 )



iter 6, train loss 736.05 train acc: 0.165


val acc 0.143 val loss 79.6 ( 357/2495. over: 0.869/7 )



iter 7, train loss 7038.87 train acc: 0.161


val acc 0.263 val loss 1.34e+02 ( 657/2495. over: 1.64/8 )



iter 8, train loss 1105.14 train acc: 0.160


val acc 0.140 val loss 24.1 ( 350/2495. over: 0.875/9 )



iter 9, train loss 407.75 train acc: 0.171


val acc 0.149 val loss 37.2 ( 373/2495. over: 0.873/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.1 l2:0.5


iter 0, train loss 2903.61 train acc: 0.256


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.03/1 )



iter 1, train loss 1428.24 train acc: 0.263


val acc 0.263 val loss 23.0 ( 657/2495. over: 1.0/2 )



iter 2, train loss 952.29 train acc: 0.265


val acc 0.263 val loss 23.0 ( 657/2495. over: 0.993/3 )



iter 3, train loss 714.44 train acc: 0.265


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.992/4 )



iter 4, train loss 571.12 train acc: 0.265


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.993/5 )



iter 5, train loss 475.99 train acc: 0.267


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.985/6 )



iter 6, train loss 407.74 train acc: 0.266


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.989/7 )



iter 7, train loss 356.83 train acc: 0.268


val acc 0.263 val loss 22.9 ( 657/2495. over: 0.982/8 )



iter 8, train loss 317.49 train acc: 0.262


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.01/9 )



iter 9, train loss 285.74 train acc: 0.261


val acc 0.263 val loss 22.9 ( 657/2495. over: 1.01/9 )
BEST val acc 0.264
####### lr:0.001 drop:0.1 l2:0.005


iter 0, train loss 2693.13 train acc: 0.300


val acc 0.189 val loss 22.5 ( 472/2495. over: 0.63/1 )



iter 1, train loss 300.76 train acc: 0.345


KeyboardInterrupt: 

In [102]:
# hyper-params
PATH = 'best_param.pth'
n_epochs = 100
patience = 10
hidden_dim = 300
n_heads = 16
train_batch_size = 16
test_val_batch_size = 256

model = Classifier(len(graph_builder.vocab), hidden_dim, n_heads, dataset.nclass, drop=0.7).to(torch.device('cuda:0'))

# DEPOIS TESTAR COM FOCAL-LOSS https://github.com/mbsariyildiz/focal-loss.pytorch/blob/master/focalloss.py

#loss_func = FocalLoss().to(torch.device('cuda:0'))
#loss_func = nn.NLLLoss().to(torch.device('cuda:0'))

loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))
loss_eval_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))

#optimizer = optim.Adam( model.parameters(), lr=0.001, weight_decay=5e-3)

optimizer = optim.RMSprop( model.parameters(), lr=0.0001, weight_decay=5e-3 )
#optimizer = optim.RMSprop( model.parameters(), lr=0.0001 )

model.train()
torch.cuda.synchronize()
epoch_losses = []

In [103]:
#data_loader_test = DataLoader(list(zip(Gs_test, fold.y_test)), batch_size=test_val_batch_size,
#                              shuffle=True, collate_fn=collate)
data_loader_val  = DataLoader(list(zip(Gs_val,  fold.y_val )), batch_size=test_val_batch_size,
                              shuffle=True, collate_fn=collate)

In [104]:
best_score = None
n_iters = 0
qtd_noised = 3
noised = False

for epoch in range(n_epochs):
    data_loader = DataLoader(list(zip(Gs_train, fold.y_train)), batch_size=train_batch_size,
                             shuffle=True, collate_fn=collate)
    epoch_loss = 0
    with tqdm_notebook(total=len(data_loader.dataset), smoothing=0.) as pbar:
        t0 = time()
        total = 0
        correct = 0
        model.train()
        for i, (bg, label) in enumerate(data_loader):
            if noised and i % qtd_noised == 0:
                np.random.shuffle(label)
            outputs = model(bg)
            probs_Y = torch.softmax(outputs, 1)
            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
            loss = loss_func(outputs, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            
            qtd_docs = label.size(0)
            qtd_correct = (sampled_Y == label).sum().item()
            total += qtd_docs
            correct += qtd_correct
            
            del loss, outputs, bg, probs_Y, sampled_Y
            pbar.update( len(label) )
            print('iter {}, train loss {:.2f} train acc: {:.3f}'.format(epoch, epoch_loss / (epoch + 1), (correct/total)), end='\r')
        score_train = correct/total
    with tqdm_notebook(total=len(data_loader_val.dataset), smoothing=0.) as pbar:
        with torch.no_grad():
            total = 0
            correct = 0
            epoch_loss = 0
            model.eval()
            for bg, label in data_loader_val:
                outputs = model(bg)
                probs_Y = torch.softmax(outputs, 1)
                sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
                #print(probs_Y.shape, sampled_Y.shape, label.shape)
                loss = loss_eval_func(outputs, label)
                epoch_loss += loss.item()
                
                qtd_docs = label.size(0)
                qtd_correct = (sampled_Y == label).sum().item()
                total += qtd_docs
                correct += qtd_correct
                
                del probs_Y, outputs, bg, sampled_Y
                pbar.update( label.size(0) )
                score_val = correct/total
                
                print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
                #break
            #break
            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
            score = correct/total
            if best_score is None or score > best_score:
                torch.save(model, PATH)
                best_score = score
                n_iters = 0
            else:
                n_iters += 1
                if n_iters >= patience:
                    print()
                    print('BEST val acc {:.3f}'.format(best_score), end='\r')
                    break
            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
        epoch_loss /= (epoch + 1)
        epoch_losses.append(epoch_loss)
    print()

iter 0, train loss 2641.27 train acc: 0.372


val acc 0.193 val loss 24.8 ( 481/2495. over: 0.518/0 )

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


val acc 0.193 val loss 24.8 ( 481/2495. over: 0.518/0 )



iter 1, train loss 1013.45 train acc: 0.488


val acc 0.192 val loss 24.6 ( 478/2495. over: 0.393/1 )



iter 2, train loss 624.64 train acc: 0.518


val acc 0.189 val loss 27.3 ( 471/2495. over: 0.364/2 )



iter 3, train loss 72.79 train acc: 0.561


KeyboardInterrupt: 

In [80]:
model = torch.load(PATH)
with torch.no_grad():
    total = 0
    correct = 0
    epoch_loss = 0
    model.eval()
    for bg, label in data_loader_val:
        outputs = model(bg)
        probs_Y = torch.softmax(outputs, 1)
        sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
        #print(probs_Y.shape, sampled_Y.shape, label.shape)
        loss = loss_eval_func(outputs, label)
        epoch_loss += loss.item()

        qtd_docs = label.size(0)
        qtd_correct = (sampled_Y == label).sum().item()
        total += qtd_docs
        correct += qtd_correct

        del probs_Y, outputs, bg, sampled_Y
        pbar.update( label.size(0) )
        score_val = correct/total

        print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
    #break
            

val acc 0.203 val loss 22.8 ( 507/2495. over: 0.274 )

In [81]:
print( len(Gs_train), '+', len(Gs_val), end=' = ' )
Gs_train_val = Gs_train + Gs_val
print( len(Gs_train_val) )

print( len(fold.y_train), '+', len(fold.y_val), end=' = ' )
y_train_val = fold.y_train + fold.y_val
print( len(y_train_val) )

19907 + 2495 = 22402
19907 + 2495 = 22402


In [82]:
def collate2(samples):
    Gs_Fs, labels = map(list, zip(*samples))
    graphs = []
    for g, f in Gs_Fs:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g)
        g_dgl.ndata['f'] = torch.FloatTensor(f.A).to(torch.device('cuda:0'))
        g_dgl.to(torch.device('cuda:0'))
        graphs.append(g_dgl)
    batched_graph = dgl.batch(graphs)
    batched_graph.to(torch.device('cuda:0'))
    labels = torch.tensor(labels).to(torch.device('cuda:0'))
    return batched_graph, labels

In [83]:
data_loader_train_val_t  = DataLoader(list(zip(Gs_train_val, y_train_val)), batch_size=test_val_batch_size,
                              shuffle=False, collate_fn=collate2)
X_train_val_all = []
model.eval()
with torch.no_grad():
    with tqdm_notebook(total=len(data_loader_train_val_t.dataset), smoothing=0.) as pbar:
        for G, label in data_loader_train_val_t:
            X_train_val_t = model.transform( G ).cpu().numpy()
            X_train_val_all.append( X_train_val_t )
            pbar.update( len(label) )





In [84]:
X_train_val_all2 = np.concatenate( X_train_val_all )
X_train_val_all2

array([[-0.2178266 , -0.09567951,  0.43155062, ...,  0.01292455,
        -0.22324216, -0.58197975],
       [-0.3348459 ,  0.01925122,  0.4095029 , ...,  0.29597798,
        -0.09173325, -0.61389023],
       [-0.2497353 ,  0.0744639 ,  0.2100017 , ...,  0.19944745,
        -0.12531002, -0.115013  ],
       ...,
       [-0.17952038, -0.10148983,  0.7563992 , ...,  0.02897497,
        -0.03622607, -0.26277977],
       [ 0.180725  , -0.06271119,  0.24564497, ...,  0.00429483,
         0.05862079, -0.2594659 ],
       [-0.21408391,  0.18337913,  0.23021354, ...,  0.3431807 ,
         0.30042037, -0.45386586]], dtype=float32)

In [85]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

param = {'loss': 'squared_hinge', 'C': 1, 'verbose': 0,
         'intercept_scaling': 1, 'fit_intercept': True,
         'max_iter': 1000, 'penalty': 'l2', 'multi_class': 'ovr',
         'random_state': None, 'dual': False,'tol': 0.001,
         'class_weight': None}
estimator = LinearSVC(**param)
tunning = [{'C': 2.0 ** np.arange(-5, 9, 2)}]

gs = GridSearchCV(estimator, tunning,
                n_jobs=64, refit=False,
                cv=5, iid=True,
                verbose=2, scoring='f1_micro')

gs.fit( X_train_val_all2, y_train_val )

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=64)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  16 out of  35 | elapsed: 22.1min remaining: 26.2min
[Parallel(n_jobs=64)]: Done  35 out of  35 | elapsed: 28.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LinearSVC(C=1, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.001, verbose=0),
             iid=True, n_jobs=64,
             param_grid=[{'C': array([3.125e-02, 1.250e-01, 5.000e-01, 2.000e+00, 8.000e+00, 3.200e+01,
       1.280e+02])}],
             pre_dispatch='2*n_jobs', refit=False, return_train_score=False,
             scoring='f1_micro', verbose=2)

In [86]:
Gs_test  = graph_builder.transform(fold.X_test)
data_loader_test = DataLoader(list(zip(Gs_test, fold.y_test)), batch_size=test_val_batch_size,
                              shuffle=False, collate_fn=collate)
X_test_all = []
model.eval()
with torch.no_grad():
    with tqdm_notebook(total=len(data_loader_test.dataset), smoothing=0.) as pbar:
        for G, label in data_loader_test:
            X_test_t = model.transform( G ).cpu().numpy()
            X_test_all.append( X_test_t )
            pbar.update( len(label) )
X_test_all = np.concatenate( X_test_all )
X_test_all

100%|██████████| 2495/2495 [00:00<00:00, 3820.70it/s]
100%|██████████| 2495/2495 [00:07<00:00, 335.46it/s]
100%|██████████| 2495/2495 [00:01<00:00, 1676.05it/s]





array([[ 0.23167111,  0.21653058,  0.06046243, ..., -0.00840951,
         0.20930912, -0.13022415],
       [-0.21017554,  0.19031881,  0.38463926, ...,  0.29666758,
         0.2398574 , -0.41712025],
       [ 0.07892672, -0.12812844,  0.3588904 , ...,  0.20220874,
        -0.03211524, -0.2635097 ],
       ...,
       [-0.31367537, -0.36216024,  0.43182194, ...,  0.6409572 ,
        -0.09814946, -0.6874743 ],
       [ 0.01328409,  0.22750883,  0.10006183, ...,  0.164902  ,
         0.16365732, -0.52859443],
       [-0.39270324,  0.19427915, -0.05538194, ...,  0.2295712 ,
         0.31881106, -0.378664  ]], dtype=float32)

In [87]:
lsvm = LinearSVC( **gs.best_params_ )
lsvm.fit( X_train_val_all2, y_train_val )



LinearSVC(C=0.03125, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [88]:
y_pred = lsvm.predict( X_test_all )

In [89]:
sum(y_pred == fold.y_test)/len(y_pred)

0.6256513026052104

In [70]:
sum(y_pred == fold.y_test)/len(y_pred)

0.49218436873747495

In [None]:
sorted(Counter(list(map(int,list(sampled_Y)))).items())

In [None]:
sorted(Counter(list(map(int,list(label)))).items())

In [None]:
a = [1,2,3,4]
print(a)
print(a)