In [1]:
import warnings
warnings.filterwarnings('ignore')

from TGA.utils import Dataset, GraphsizePretrained
#from tqdm import tqdm
from tqdm.notebook import tqdm
from time import time
import numpy as np

Using backend: pytorch


In [2]:
%%time
graph_builder = GraphsizePretrained(w=2, verbose=True,
                   pretrained_vec='/home/Documentos/Universidade/LBD/pretrained_vectors/glove/glove.6B.300d.txt')

400000it [00:27, 14304.67it/s]


CPU times: user 27.1 s, sys: 808 ms, total: 27.9 s
Wall time: 28.2 s


In [3]:
dataset = Dataset('/home/Documentos/datasets/classification/datasets/acm/')
fold = next(dataset.get_fold_instances(10, with_val=False))
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test'), 22402)

In [4]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F

import networkx as nx
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

from sklearn.preprocessing import LabelEncoder

from itertools import repeat

import torch.optim as optim
from torch.utils.data import DataLoader

In [5]:
%%time
graph_builder.fit(fold.X_train, fold.y_train)

100%|██████████| 22402/22402 [00:06<00:00, 3571.63it/s]


CPU times: user 8.09 s, sys: 64 ms, total: 8.16 s
Wall time: 8.3 s


GraphsizePretrained(encoding=None,
                    pretrained_vec='/home/Documentos/Universidade/LBD/pretrained_vectors/glove/glove.6B.300d.txt',
                    verbose=None)

In [6]:
len(graph_builder.g.edges), len(graph_builder.g)

(133170, 36302)

In [7]:
list(map(lambda x: (x,graph_builder.g.degree()[x]), graph_builder.label_ids))

[(0, 3058),
 (1, 8738),
 (2, 10723),
 (3, 16623),
 (4, 1199),
 (5, 6160),
 (6, 5394),
 (7, 15351),
 (8, 13961),
 (9, 3823),
 (10, 11860)]

In [8]:
class GenericGAT(nn.Module):
    def __init__(self, in_dim, hidden_dim,
                 drop=.5, n_heads=8, attn_drop=.5,
                 activation=F.leaky_relu, n_convs=2,
                 first_hidden='emb', encoders={'term','label'},
                 device='cpu:0'):
        super(GenericGAT, self).__init__()
        self.device = torch.device(device)
        self.first_hidden = first_hidden
        
        self.encoders = nn.ModuleDict({
            k: nn.Linear(in_dim, hidden_dim).to(self.device) for k in encoders
        })
        #self.norm = nn.BatchNorm1d(hidden_dim).to(self.device)
        
        self.layers = nn.ModuleList([
            GATConv(hidden_dim, hidden_dim, residual=True, num_heads=n_heads, activation=activation,
                    feat_drop=drop, attn_drop=attn_drop).to(self.device) for _ in range(n_convs)
        ])
        self.down_proj = [
            nn.Linear(n_heads*hidden_dim, hidden_dim).to(self.device) for _ in range(n_convs)
        ]
        self.norm_projs = [
            nn.BatchNorm1d(hidden_dim).to(self.device) for _ in range(n_convs)
        ]
        
    def forward(self, G, **kwargs):
        with G.local_scope():
            h = G.ndata[self.first_hidden].float()
            for (k, mask) in kwargs.items():
                if k in self.encoders:
                    if mask is not None:
                        h[ mask ] = self.encoders[k]( h[ mask ] )
                    else:
                        h = self.encoders[k]( h )

            for l, conv in enumerate(self.layers):
                h = conv(G, h)
                h = h.view(h.shape[0], -1)
                h = self.down_proj[l]( h )
                h = self.norm_projs[l]( h )
        return h
        

In [10]:

class ClassifierGAT(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, n_heads=16, drop=.5, attn_drop=.5, device='cuda:0'):
        super(ClassifierGAT, self).__init__()

        self.encoder = nn.Linear(in_dim, hidden_dim).to(torch.device(device))
        
        self.layers = nn.ModuleList([
            GATConv(hidden_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=attn_drop).to(torch.device(device)),
            GATConv(n_heads*hidden_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=attn_drop).to(torch.device(device))
        ])
        
        self.lin = nn.Linear(n_heads*hidden_dim + hidden_dim, 1).to(torch.device(device))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device(device))
        
        self.norm = nn.BatchNorm1d( n_heads*hidden_dim + hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.classify = nn.Linear( n_heads*hidden_dim + hidden_dim, n_classes).to(torch.device(device))

    def forward(self, G):
        h = G.ndata['emb'].float()
        he = self.encoder(h)
        h = he
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        
        # CONCAT he E hg
        hg = torch.cat((h,he), 1)
        hg = self.norm( hg )
        hg = self.drop( hg )
        hg = self.pooling(G, hg)
        
        pred = self.classify( hg )
        return pred

In [11]:
def collate(param):
    X, y = zip(*param)
    Gs_nx = graph_builder.transform(X)
    
    Gs_dgl_list = []
    for g in Gs_nx:
        g_dgl = dgl.DGLGraph()
        if len(g) > 0:
            g_dgl.from_networkx(g, node_attrs=['emb', 'idx'] )
        Gs_dgl_list.append( g_dgl )
    
    Gs_dgl = dgl.batch(Gs_dgl_list)
    
    big_graph_dgl = dgl.DGLGraph()
    big_graph_dgl.from_networkx(graph_builder.g, node_attrs=['emb', 'label', 'idx'] )
    
    #subgraph = graph_builder.g.subgraph(idx_terms)
    #big_graph_dgl.from_networkx(subgraph, node_attrs=['emb', 'label', 'idx'] )
    
    return big_graph_dgl, Gs_dgl, torch.tensor(y)

In [12]:
class TGA(torch.nn.Module):
    def __init__(self, input_l, hidden_l, nclass, n_heads=1,
                drop=0.5, attn_drop=0.5, loss=None, n_convs=1,activation=None,
                 device='cuda:0'):
        
        super(TGA, self).__init__()
        
        #self.gat_global = GenericGAT( input_l, hidden_l, n_heads=n_heads,
        #         drop=drop, attn_drop=attn_drop, n_convs=n_convs,
        #         activation=activation, device='cuda:0' ).to(device)
        
        
        self.gat_local = GenericGAT( input_l, hidden_l, n_heads=n_heads,
                 drop=drop, attn_drop=attn_drop, n_convs=n_convs, encoders={'terms'},
                 activation=activation, device='cuda:0' ).to(device)
        
        #self.norm_label = nn.BatchNorm1d(hidden_l).to(device)
        #self.norm_docs = nn.BatchNorm1d(hidden_l).to(device)

        self.gate = nn.Linear( hidden_l, 1 ).to(device)
        self.feat = nn.Linear( hidden_l, hidden_l ).to(device)
        self.gap  = GlobalAttentionPooling(self.gate, feat_nn=self.feat).to(device)
        
        #self.nclass  = nclass
        #self.fc1     = nn.Linear( hidden_l, hidden_l ).to(device)
        #self.fc2     = nn.Linear(  hidden_l//2, self.nclass ).to(device)
        #self.softmax = nn.Softmax(dim=1)
        
        self.fc_local = nn.Sequential(
          nn.Linear( hidden_l, hidden_l ),
          nn.ReLU(),
          nn.Linear( hidden_l, nclass )
        )
        
        """self.fc_global = nn.Sequential(
          nn.Linear( hidden_l, hidden_l ),
          nn.Sigmoid(),
          nn.Linear( hidden_l, hidden_l )
        )"""
        
        self.loss = loss

    def forward(self, G, gs, y, label_idx=None):
        if label_idx is None:
            label_idx = G.ndata['label'].nonzero().flatten()
            
        terms_idx = range(len(label_idx),len(graph_builder.g))
        
        #h_global  = self.gat_global(G, label=label_idx, term=terms_idx)

        #h_labels  = h_global[label_idx]
        #h_labels  = self.norm_label(h_labels)
        #h_labels  = self.fc_global(h_labels)

        #gs.ndata['emb'] = h_global[gs.ndata['idx'].reshape(-1)]
        h_local         = self.gat_local(gs, terms=None)
        h_docs          = self.gap( gs, h_local )
        #h_docs          = self.norm_docs(h_docs)
        pred_docs       = self.fc_local(h_docs)
        
        #h_docs_pred = self.fc1(h_docs)
        #h_docs_pred = self.fc2(h_docs_pred)
        #h_docs_pred = nn.softmax(h_docs_pred, 1)
        
        return h_docs, pred_docs#, h_labels

In [12]:
hidden_l = 300
input_l = 300
n_heads = 16
drop=0.5
batch_size=64
attn_drop=0.5
device=torch.device('cuda:0')

In [13]:
tga = TGA(input_l, hidden_l, nclass=graph_builder.n_class,
          activation=None,
          n_heads=n_heads, drop=drop, attn_drop=attn_drop, n_convs=2).to(device)
tga

TGA(
  (gat_local): GenericGAT(
    (encoders): ModuleDict(
      (terms): Linear(in_features=300, out_features=300, bias=True)
    )
    (layers): ModuleList(
      (0): GATConv(
        (fc): Linear(in_features=300, out_features=4800, bias=False)
        (feat_drop): Dropout(p=0.5, inplace=False)
        (attn_drop): Dropout(p=0.5, inplace=False)
        (leaky_relu): LeakyReLU(negative_slope=0.2)
        (res_fc): Identity()
      )
      (1): GATConv(
        (fc): Linear(in_features=300, out_features=4800, bias=False)
        (feat_drop): Dropout(p=0.5, inplace=False)
        (attn_drop): Dropout(p=0.5, inplace=False)
        (leaky_relu): LeakyReLU(negative_slope=0.2)
        (res_fc): Identity()
      )
    )
  )
  (gate): Linear(in_features=300, out_features=1, bias=True)
  (feat): Linear(in_features=300, out_features=300, bias=True)
  (gap): GlobalAttentionPooling(
    (gate_nn): Linear(in_features=300, out_features=1, bias=True)
    (feat_nn): Linear(in_features=300, out_feat

In [14]:
from TGA.lossweight import cross_entropy

In [15]:
from TGA.lossweight import cross_entropy
class NpairLoss(nn.Module):
    """the multi-class n-pair loss"""
    def __init__(self, l2_reg=0.02):
        super(NpairLoss, self).__init__()
        self.l2_reg = l2_reg

    def forward(self, anchor, target, positive=None):
        batch_size = anchor.size(0)
        target = target.view(target.size(0), 1)

        target = (target == torch.transpose(target, 0, 1)).float()
        target = target / torch.sum(target, dim=1, keepdim=True).float()

        if positive is not None:
            logit = torch.matmul(anchor, torch.transpose(positive, 0, 1))
            l2_loss = torch.sum(anchor**2) / batch_size + torch.sum(positive**2) / batch_size
        else:
            logit = torch.matmul(anchor, torch.transpose(anchor, 0, 1))
            l2_loss = torch.sum(anchor**2) / batch_size
        
        loss_ce = cross_entropy(logit, target)

        loss = loss_ce + self.l2_reg*l2_loss*0.25
        return loss

In [16]:

optimizer = optim.AdamW( tga.parameters(), lr=0.001, weight_decay=5e-3)

loss_func_npl = NpairLoss(l2_reg=5e-4)
loss_func_cel = nn.CrossEntropyLoss()

#RMSprop

In [None]:
from multiprocessing import set_start_method
try:
    set_start_method('spawn')
except RuntimeError:
    pass

In [17]:

best = None
nepochs = 1
for e in tqdm(range(nepochs), total=nepochs):
    epoch_loss = 0
    data_loader = DataLoader(list(zip(fold.X_train, fold.y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate, num_workers=4)
    total_loss = 0.
    with tqdm(total=len(fold.y_train), smoothing=0.) as pbar:
        total = 1
        correct = 0
        tga.train()
        for i, (G, gs, y) in enumerate(data_loader):
            G = G.to( device )
            gs = gs.to( device )
            y = y.to( device )
            
            #h_docs, pred_docs, h_labels = tga( G, gs, y )
            h_docs, pred_docs = tga( G, gs, y )
            
            pred_docs = F.softmax(pred_docs)
            
            loss1 = loss_func_cel(pred_docs, y)
            #loss2 = loss_func_npl( h_docs, y, positive=h_labels[y] )
            
            loss = loss1# + loss1
            
            loss.backward()

            optimizer.step()
            
            y_pred = pred_docs.argmax(axis=1)
            
            total_loss += loss.item()
            correct    += (y_pred == y).sum()
            total      += len(y)
            
            to_print  = f'Acc: {(1.*correct/total).item():.3}'
            #print( loss , end='\r')
            pbar.update( len(y) )
            pbar.set_description_str(f'iter {e} Loss: {total_loss/(i+1):.4} ({to_print})')
            
            #break
            if best is None or best > (total_loss/(i+1)):
                #hiddens_labels = h_labels
                hiddens_docs = h_docs
                best = total_loss/(i+1)
            #del loss, h_labels, G, gs, loss1, loss2, pred_docs, h_docs
            del loss, G, gs, loss1, pred_docs, h_docs
            #break
    del data_loader

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=22402.0), HTML(value='')))





In [18]:
h_docs_softmax.argmax(axis=1)

NameError: name 'h_docs_softmax' is not defined

In [None]:
nn.Softmax()(torch.matmul(hiddens_docs, hiddens_labels.T)).argmax(axis=1)

In [19]:
y_pred, y

(tensor([10, 10], device='cuda:0'), tensor([3, 7], device='cuda:0'))

In [None]:
loss1, loss2

In [None]:
pred_docs.argmax(axis=1), y

In [None]:
conv = GATConv(300,300,4).to(torch.device('cuda:0'))

In [None]:
bla = conv(gs, gs.ndata['emb'].float())

In [None]:
gs.ndata['emb']

In [None]:
bla.shape

In [None]:
bla.sum(axis=1).shape