In [1]:
from utils import Dataset, Graphsize
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from time import time
import numpy as np

In [2]:
dataset = Dataset('/home/mangaravite/Documents/datasets/topics/acm/')

In [3]:
fold = next(dataset.get_fold_instances(10))
fold._fields

('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val')

In [5]:
%%time
max_feats = 250000
graph_builder = Graphsize(w=5, verbose=True, max_feat=max_feats)
Gs_train = graph_builder.fit_transform(fold.X_train)
Gs_val   = graph_builder.transform(fold.X_val)
#Gs_test  = graph_builder.transform(fold.X_test)

100%|██████████| 19907/19907 [00:13<00:00, 1449.35it/s]
100%|██████████| 19907/19907 [00:05<00:00, 3861.63it/s]
100%|██████████| 19907/19907 [01:06<00:00, 299.39it/s]
100%|██████████| 19907/19907 [00:12<00:00, 1577.99it/s]
100%|██████████| 2495/2495 [00:00<00:00, 3950.38it/s]
100%|██████████| 2495/2495 [00:07<00:00, 325.67it/s]
100%|██████████| 2495/2495 [00:01<00:00, 1630.88it/s]

CPU times: user 1min 47s, sys: 932 ms, total: 1min 48s
Wall time: 1min 47s





In [6]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

import torch.optim as optim
from torch.utils.data import DataLoader

In [7]:
class Classifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_heads, n_classes, drop=0.5, k=2):
        super(Classifier, self).__init__()

        self.layers = nn.ModuleList([
            #GraphConv(in_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0')),
            #GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0'))])
            GATConv(in_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=drop).to(torch.device('cuda:0')),
            GATConv(n_heads*hidden_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=drop).to(torch.device('cuda:0'))
        ])
        self.lin = nn.Linear(n_heads*hidden_dim, 1).to(torch.device('cuda:0'))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device('cuda:0'))
        
        self.norm = nn.BatchNorm1d( n_heads*hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.conv = nn.Linear(n_heads*hidden_dim, k*hidden_dim).to(torch.device('cuda:0'))
        
        self.classify = nn.Linear(k*hidden_dim, n_classes).to(torch.device('cuda:0'))

    def forward(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        hg = self.pooling(G, h)
        #G.ndata['h'] = h
        #w = self.lin( h )
        #G.ndata['w'] = w
        #hg = dgl.mean_nodes(G, 'h', weight='w')
        hg = self.norm( hg )
        hg = self.drop( hg )
        hg = self.conv( hg )
        pred = self.classify( hg )
        return pred
    def transform(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        hg = self.pooling(G, h)
        hg = self.norm( hg )
        hg = self.drop( hg )
        hg = self.conv(hg)
        return hg
    def predict(self, G):
        hg = self.transform( G )
        pred = self.classify( hg )
        probs_Y = torch.softmax(pred, 1)
        sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
        return sampled_Y
        

In [8]:
def collate(samples):
    Gs_Fs, labels = map(list, zip(*samples))
    graphs = []
    for g, f in Gs_Fs:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g)
        g_dgl.ndata['f'] = torch.FloatTensor(f.A).to(torch.device('cuda:0'))
        g_dgl.to(torch.device('cuda:0'))
        graphs.append(g_dgl)
    batched_graph = dgl.batch(graphs)
    batched_graph.to(torch.device('cuda:0'))
    labels = torch.tensor(labels).to(torch.device('cuda:0'))
    return batched_graph, labels

In [None]:
class FocalLoss(nn.Module):
    # https://github.com/mbsariyildiz/focal-loss.pytorch
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)): self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = logpt.exp()

        if self.alpha is not None:
            if self.alpha.type() != input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0, target.data.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [None]:
data_loader_val  = DataLoader(list(zip(Gs_val,  fold.y_val )), batch_size=test_val_batch_size,
                              shuffle=True, collate_fn=collate)
best_score = None
for (noised,qtd_noised) in [ (False, 0), (True, 3), (True, 5) ]:
    for lr in [ .01, .001, .0001 ]:
        for drop in [ .1, .3, .5, .7 ]:
            for l2 in [ 5e-1, 5e-3, 5e-5 ]:
                # hyper-params
                PATH = 'best_param.pth'
                n_epochs = 100
                patience = 10
                hidden_dim = 300
                n_heads = 16
                train_batch_size = 16
                test_val_batch_size = 256

                model = Classifier(len(graph_builder.vocab), hidden_dim, n_heads, dataset.nclass, drop=drop).to(torch.device('cuda:0'))

                loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))
                loss_eval_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))

                optimizer = optim.Adam( model.parameters(), lr=lr, weight_decay=l2)

                model.train()
                torch.cuda.synchronize()
                epoch_losses = []
                n_iters = 0
                
                for epoch in range(n_epochs):
                    print("lr:{.3} drop:{.3} l2:{.3}".format(lr, drop, l2))
                    data_loader = DataLoader(list(zip(Gs_train, fold.y_train)), batch_size=train_batch_size,
                                             shuffle=True, collate_fn=collate)
                    epoch_loss = 0
                    with tqdm_notebook(total=len(data_loader.dataset), smoothing=0.) as pbar:
                        t0 = time()
                        total = 0
                        correct = 0
                        model.train()
                        for i, (bg, label) in enumerate(data_loader):
                            if noised and i % qtd_noised == 0:
                                np.random.shuffle(label)
                            outputs = model(bg)
                            probs_Y = torch.softmax(outputs, 1)
                            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
                            loss = loss_func(outputs, label)
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()
                            epoch_loss += loss.detach().item()

                            qtd_docs = label.size(0)
                            qtd_correct = (sampled_Y == label).sum().item()
                            total += qtd_docs
                            correct += qtd_correct

                            del loss, outputs, bg, probs_Y, sampled_Y
                            pbar.update( len(label) )
                            print('iter {}, train loss {:.2f} train acc: {:.3f}'.format(epoch, epoch_loss / (epoch + 1), (correct/total)), end='\r')
                        score_train = correct/total
                    with tqdm_notebook(total=len(data_loader_val.dataset), smoothing=0.) as pbar:
                        with torch.no_grad():
                            total = 0
                            correct = 0
                            epoch_loss = 0
                            model.eval()
                            for bg, label in data_loader_val:
                                outputs = model(bg)
                                probs_Y = torch.softmax(outputs, 1)
                                sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
                                #print(probs_Y.shape, sampled_Y.shape, label.shape)
                                loss = loss_eval_func(outputs, label)
                                epoch_loss += loss.item()

                                qtd_docs = label.size(0)
                                qtd_correct = (sampled_Y == label).sum().item()
                                total += qtd_docs
                                correct += qtd_correct

                                del probs_Y, outputs, bg, sampled_Y
                                pbar.update( label.size(0) )
                                score_val = correct/total

                                print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
                                #break
                            #break
                            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
                            score = correct/total
                            if best_score is None or score > best_score:
                                torch.save(model, PATH)
                                best_score = score
                                n_iters = 0
                            else:
                                n_iters += 1
                                if n_iters >= patience:
                                    print()
                                    print('BEST val acc {:.3f}'.format(best_score), end='\r')
                                    break
                            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
                        epoch_loss /= (epoch + 1)
                        epoch_losses.append(epoch_loss)
                    print()

In [102]:
# hyper-params
PATH = 'best_param.pth'
n_epochs = 100
patience = 10
hidden_dim = 300
n_heads = 16
train_batch_size = 16
test_val_batch_size = 256

model = Classifier(len(graph_builder.vocab), hidden_dim, n_heads, dataset.nclass, drop=0.7).to(torch.device('cuda:0'))

# DEPOIS TESTAR COM FOCAL-LOSS https://github.com/mbsariyildiz/focal-loss.pytorch/blob/master/focalloss.py

#loss_func = FocalLoss().to(torch.device('cuda:0'))
#loss_func = nn.NLLLoss().to(torch.device('cuda:0'))

loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))
loss_eval_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))

#optimizer = optim.Adam( model.parameters(), lr=0.001, weight_decay=5e-3)

optimizer = optim.RMSprop( model.parameters(), lr=0.0001, weight_decay=5e-3 )
#optimizer = optim.RMSprop( model.parameters(), lr=0.0001 )

model.train()
torch.cuda.synchronize()
epoch_losses = []

In [103]:
#data_loader_test = DataLoader(list(zip(Gs_test, fold.y_test)), batch_size=test_val_batch_size,
#                              shuffle=True, collate_fn=collate)
data_loader_val  = DataLoader(list(zip(Gs_val,  fold.y_val )), batch_size=test_val_batch_size,
                              shuffle=True, collate_fn=collate)

In [None]:
best_score = None
n_iters = 0
qtd_noised = 3
noised = False

for epoch in range(n_epochs):
    data_loader = DataLoader(list(zip(Gs_train, fold.y_train)), batch_size=train_batch_size,
                             shuffle=True, collate_fn=collate)
    epoch_loss = 0
    with tqdm_notebook(total=len(data_loader.dataset), smoothing=0.) as pbar:
        t0 = time()
        total = 0
        correct = 0
        model.train()
        for i, (bg, label) in enumerate(data_loader):
            if noised and i % qtd_noised == 0:
                np.random.shuffle(label)
            outputs = model(bg)
            probs_Y = torch.softmax(outputs, 1)
            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
            loss = loss_func(outputs, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            
            qtd_docs = label.size(0)
            qtd_correct = (sampled_Y == label).sum().item()
            total += qtd_docs
            correct += qtd_correct
            
            del loss, outputs, bg, probs_Y, sampled_Y
            pbar.update( len(label) )
            print('iter {}, train loss {:.2f} train acc: {:.3f}'.format(epoch, epoch_loss / (epoch + 1), (correct/total)), end='\r')
        score_train = correct/total
    with tqdm_notebook(total=len(data_loader_val.dataset), smoothing=0.) as pbar:
        with torch.no_grad():
            total = 0
            correct = 0
            epoch_loss = 0
            model.eval()
            for bg, label in data_loader_val:
                outputs = model(bg)
                probs_Y = torch.softmax(outputs, 1)
                sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
                #print(probs_Y.shape, sampled_Y.shape, label.shape)
                loss = loss_eval_func(outputs, label)
                epoch_loss += loss.item()
                
                qtd_docs = label.size(0)
                qtd_correct = (sampled_Y == label).sum().item()
                total += qtd_docs
                correct += qtd_correct
                
                del probs_Y, outputs, bg, sampled_Y
                pbar.update( label.size(0) )
                score_val = correct/total
                
                print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
                #break
            #break
            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
            score = correct/total
            if best_score is None or score > best_score:
                torch.save(model, PATH)
                best_score = score
                n_iters = 0
            else:
                n_iters += 1
                if n_iters >= patience:
                    print()
                    print('BEST val acc {:.3f}'.format(best_score), end='\r')
                    break
            print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3}/{} )'.format(score_val, epoch_loss, correct, total, score_val/score_train, n_iters), end='\r')
        epoch_loss /= (epoch + 1)
        epoch_losses.append(epoch_loss)
    print()

iter 0, train loss 2641.27 train acc: 0.372


val acc 0.193 val loss 24.8 ( 481/2495. over: 0.518/0 )

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


val acc 0.193 val loss 24.8 ( 481/2495. over: 0.518/0 )



iter 1, train loss 1013.45 train acc: 0.488


val acc 0.192 val loss 24.6 ( 478/2495. over: 0.393/1 )



iter 2, train loss 321.02 train acc: 0.521

In [80]:
model = torch.load(PATH)
with torch.no_grad():
    total = 0
    correct = 0
    epoch_loss = 0
    model.eval()
    for bg, label in data_loader_val:
        outputs = model(bg)
        probs_Y = torch.softmax(outputs, 1)
        sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
        #print(probs_Y.shape, sampled_Y.shape, label.shape)
        loss = loss_eval_func(outputs, label)
        epoch_loss += loss.item()

        qtd_docs = label.size(0)
        qtd_correct = (sampled_Y == label).sum().item()
        total += qtd_docs
        correct += qtd_correct

        del probs_Y, outputs, bg, sampled_Y
        pbar.update( label.size(0) )
        score_val = correct/total

        print('val acc {:.3f} val loss {:.3} ( {}/{}. over: {:.3} )'.format(score_val, epoch_loss, correct, total, score_val/score_train), end='\r')
    #break
            

val acc 0.203 val loss 22.8 ( 507/2495. over: 0.274 )

In [81]:
print( len(Gs_train), '+', len(Gs_val), end=' = ' )
Gs_train_val = Gs_train + Gs_val
print( len(Gs_train_val) )

print( len(fold.y_train), '+', len(fold.y_val), end=' = ' )
y_train_val = fold.y_train + fold.y_val
print( len(y_train_val) )

19907 + 2495 = 22402
19907 + 2495 = 22402


In [82]:
def collate2(samples):
    Gs_Fs, labels = map(list, zip(*samples))
    graphs = []
    for g, f in Gs_Fs:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g)
        g_dgl.ndata['f'] = torch.FloatTensor(f.A).to(torch.device('cuda:0'))
        g_dgl.to(torch.device('cuda:0'))
        graphs.append(g_dgl)
    batched_graph = dgl.batch(graphs)
    batched_graph.to(torch.device('cuda:0'))
    labels = torch.tensor(labels).to(torch.device('cuda:0'))
    return batched_graph, labels

In [83]:
data_loader_train_val_t  = DataLoader(list(zip(Gs_train_val, y_train_val)), batch_size=test_val_batch_size,
                              shuffle=False, collate_fn=collate2)
X_train_val_all = []
model.eval()
with torch.no_grad():
    with tqdm_notebook(total=len(data_loader_train_val_t.dataset), smoothing=0.) as pbar:
        for G, label in data_loader_train_val_t:
            X_train_val_t = model.transform( G ).cpu().numpy()
            X_train_val_all.append( X_train_val_t )
            pbar.update( len(label) )





In [84]:
X_train_val_all2 = np.concatenate( X_train_val_all )
X_train_val_all2

array([[-0.2178266 , -0.09567951,  0.43155062, ...,  0.01292455,
        -0.22324216, -0.58197975],
       [-0.3348459 ,  0.01925122,  0.4095029 , ...,  0.29597798,
        -0.09173325, -0.61389023],
       [-0.2497353 ,  0.0744639 ,  0.2100017 , ...,  0.19944745,
        -0.12531002, -0.115013  ],
       ...,
       [-0.17952038, -0.10148983,  0.7563992 , ...,  0.02897497,
        -0.03622607, -0.26277977],
       [ 0.180725  , -0.06271119,  0.24564497, ...,  0.00429483,
         0.05862079, -0.2594659 ],
       [-0.21408391,  0.18337913,  0.23021354, ...,  0.3431807 ,
         0.30042037, -0.45386586]], dtype=float32)

In [85]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

param = {'loss': 'squared_hinge', 'C': 1, 'verbose': 0,
         'intercept_scaling': 1, 'fit_intercept': True,
         'max_iter': 1000, 'penalty': 'l2', 'multi_class': 'ovr',
         'random_state': None, 'dual': False,'tol': 0.001,
         'class_weight': None}
estimator = LinearSVC(**param)
tunning = [{'C': 2.0 ** np.arange(-5, 9, 2)}]

gs = GridSearchCV(estimator, tunning,
                n_jobs=64, refit=False,
                cv=5, iid=True,
                verbose=2, scoring='f1_micro')

gs.fit( X_train_val_all2, y_train_val )

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=64)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=64)]: Done  16 out of  35 | elapsed: 22.1min remaining: 26.2min
[Parallel(n_jobs=64)]: Done  35 out of  35 | elapsed: 28.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LinearSVC(C=1, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.001, verbose=0),
             iid=True, n_jobs=64,
             param_grid=[{'C': array([3.125e-02, 1.250e-01, 5.000e-01, 2.000e+00, 8.000e+00, 3.200e+01,
       1.280e+02])}],
             pre_dispatch='2*n_jobs', refit=False, return_train_score=False,
             scoring='f1_micro', verbose=2)

In [86]:
Gs_test  = graph_builder.transform(fold.X_test)
data_loader_test = DataLoader(list(zip(Gs_test, fold.y_test)), batch_size=test_val_batch_size,
                              shuffle=False, collate_fn=collate)
X_test_all = []
model.eval()
with torch.no_grad():
    with tqdm_notebook(total=len(data_loader_test.dataset), smoothing=0.) as pbar:
        for G, label in data_loader_test:
            X_test_t = model.transform( G ).cpu().numpy()
            X_test_all.append( X_test_t )
            pbar.update( len(label) )
X_test_all = np.concatenate( X_test_all )
X_test_all

100%|██████████| 2495/2495 [00:00<00:00, 3820.70it/s]
100%|██████████| 2495/2495 [00:07<00:00, 335.46it/s]
100%|██████████| 2495/2495 [00:01<00:00, 1676.05it/s]





array([[ 0.23167111,  0.21653058,  0.06046243, ..., -0.00840951,
         0.20930912, -0.13022415],
       [-0.21017554,  0.19031881,  0.38463926, ...,  0.29666758,
         0.2398574 , -0.41712025],
       [ 0.07892672, -0.12812844,  0.3588904 , ...,  0.20220874,
        -0.03211524, -0.2635097 ],
       ...,
       [-0.31367537, -0.36216024,  0.43182194, ...,  0.6409572 ,
        -0.09814946, -0.6874743 ],
       [ 0.01328409,  0.22750883,  0.10006183, ...,  0.164902  ,
         0.16365732, -0.52859443],
       [-0.39270324,  0.19427915, -0.05538194, ...,  0.2295712 ,
         0.31881106, -0.378664  ]], dtype=float32)

In [87]:
lsvm = LinearSVC( **gs.best_params_ )
lsvm.fit( X_train_val_all2, y_train_val )



LinearSVC(C=0.03125, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [88]:
y_pred = lsvm.predict( X_test_all )

In [89]:
sum(y_pred == fold.y_test)/len(y_pred)

0.6256513026052104

In [70]:
sum(y_pred == fold.y_test)/len(y_pred)

0.49218436873747495

In [None]:
sorted(Counter(list(map(int,list(sampled_Y)))).items())

In [None]:
sorted(Counter(list(map(int,list(label)))).items())

In [None]:
a = [1,2,3,4]
print(a)
print(a)