In [1]:
import warnings
warnings.filterwarnings('ignore')

from utils import Dataset, GraphsizePretrained
from tqdm import tqdm
from tqdm.notebook import tqdm
from time import time
import numpy as np

Using backend: pytorch


In [2]:
%%time
graph_builder = GraphsizePretrained(w=1, verbose=True,
                   pretrained_vec='/home/mangaravite/Documentos/pretrained_vectors/glove.6B.300d.txt')

400000it [00:22, 17778.67it/s]


CPU times: user 22.2 s, sys: 596 ms, total: 22.8 s
Wall time: 22.7 s


In [3]:
dataset = Dataset('/home/mangaravite/Documentos/datasets/classification/datasets/acm/')
fold = next(dataset.get_fold_instances(10, with_val=False))
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test'), 22402)

In [4]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F

import networkx as nx
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

from sklearn.preprocessing import LabelEncoder

from itertools import repeat

import torch.optim as optim
from torch.utils.data import DataLoader

In [5]:
%%time
graph_builder.fit(fold.X_train, fold.y_train)

100%|██████████| 22402/22402 [00:05<00:00, 4111.92it/s]


CPU times: user 6.76 s, sys: 87.7 ms, total: 6.85 s
Wall time: 6.83 s


GraphsizePretrained(pretrained_vec='/home/mangaravite/Documentos/pretrained_vectors/glove.6B.300d.txt',
                    verbose=None, w=1)

In [6]:
len(graph_builder.g.edges), len(graph_builder.g)

(133170, 36302)

In [7]:
list(map(lambda x: (x,graph_builder.g.degree()[x]), graph_builder.label_ids))

[(0, 3058),
 (1, 8738),
 (2, 10723),
 (3, 16623),
 (4, 1199),
 (5, 6160),
 (6, 5394),
 (7, 15351),
 (8, 13961),
 (9, 3823),
 (10, 11860)]

In [8]:
class GenericGAT(nn.Module):
    def __init__(self, in_dim, hidden_dim,
                 drop=.5, n_heads=8, attn_drop=.5,
                 activation=F.leaky_relu, n_convs=2,
                 first_hidden='emb', encoders={'term','label'},
                 device='cpu:0'):
        super(GenericGAT, self).__init__()
        self.device = torch.device(device)
        self.first_hidden = first_hidden
        
        self.encoders = nn.ModuleDict({
            k: nn.Linear(in_dim, hidden_dim).to(self.device) for k in encoders
        })
        
        self.layers = nn.ModuleList([
            GATConv(hidden_dim, hidden_dim, residual=True, num_heads=n_heads, activation=activation,
                    feat_drop=drop, attn_drop=attn_drop).to(self.device) for _ in range(n_convs)
        ])
        self.down_proj = [
            nn.Linear(n_heads*hidden_dim, hidden_dim).to(self.device) for _ in range(n_convs)
        ]
        
    def forward(self, G, **kwargs):
        with G.local_scope():
            h = G.ndata[self.first_hidden].float()
            for (k, mask) in kwargs.items():
                if k in self.encoders:
                    if mask is not None:
                        h[ mask ] = self.encoders[k]( h[ mask ] )
                    else:
                        h = self.encoders[k]( h )

            for l, conv in enumerate(self.layers):
                h = conv(G, h)
                h = h.view(h.shape[0], -1)
                h = self.down_proj[l]( h )
                #h = F.relu( h )
        return h
        

In [9]:
g = nx.Graph()
g.add_node(1, weight=1)
g.nodes[1]['weight']

1

In [10]:
def collate(param):
    X, y = zip(*param)
    Gs_nx = graph_builder.transform(X)
    
    Gs_dgl_list = []
    for g in Gs_nx:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g, node_attrs=['emb', 'idx'] )
        Gs_dgl_list.append( g_dgl )
    
    Gs_dgl = dgl.batch(Gs_dgl_list)
    
    big_graph_dgl = dgl.DGLGraph()
    big_graph_dgl.from_networkx(graph_builder.g, node_attrs=['emb', 'label', 'idx'] )
    
    #subgraph = graph_builder.g.subgraph(idx_terms)
    #big_graph_dgl.from_networkx(subgraph, node_attrs=['emb', 'label', 'idx'] )
    
    return big_graph_dgl, Gs_dgl, torch.tensor(y)

In [11]:
class CosineDistanceLoss(torch.nn.Module):
    def __init__(self, reduction='mean', alpha=-1.):
        super(CosineDistanceLoss, self).__init__()

        self.loss = nn.CosineEmbeddingLoss(reduction=reduction)
        self.alpha = alpha

    def forward(self, X, Y=None, y_idxs=None):
        nsmpl, ndims = X.shape
        A = []
        B = []
        target = []
        if Y is not None and y_idxs is not None:
            for (x,y_idx) in zip(X, y_idxs):
                for i,y in enumerate(Y):
                    A.append( x )
                    B.append( y )
                    target.append( 1 if i == y_idx.item() else -1 )
            
        else:
            for i in range(nsmpl):
                for j in range(i+1, nsmpl):
                    A.append( X[i] )
                    B.append( self.alpha*X[j] )
                    target.append( 1 )

        A=torch.cat(A).reshape( len(target), ndims ).to(X.device)
        B=torch.cat(B).reshape( len(target), ndims ).to(X.device)
        target=torch.Tensor(target).to(X.device)
            
        
        return self.loss(A, B, target=target)


In [12]:
class TGA(torch.nn.Module):
    def __init__(self, input_l, hidden_l, n_heads=1,
                drop=0.5, attn_drop=0.5,
                 device='cuda:0'):
        
        super(TGA, self).__init__()
        self.gat = GenericGAT( input_l, hidden_l, n_heads=n_heads,
                 drop=drop, attn_drop=attn_drop,
                 activation=None, device='cuda:0' ).to(device)
        self.norm = nn.BatchNorm1d(hidden_l).to(device)

        self.gate = nn.Linear( hidden_l, 1 ).to(device)
        self.feat = nn.Linear( hidden_l, hidden_l ).to(device)
        self.gap = GlobalAttentionPooling(self.gate, feat_nn=self.feat).to(device)

    def forward(self, G, gs, y, label_idx=None):
        if label_idx is None:
            label_idx = G.ndata['label'].nonzero().flatten()

        h_global = self.gat(G, label=label_idx, term=range(len(label_idx),len(graph_builder.g)))
        h_global = self.norm(h_global)

        labels_hiddens = h_global[label_idx]

        h_local = self.gap( gs, h_global[gs.ndata['idx'].reshape(-1)] )
        
        return h_local, h_global

In [13]:
hidden_l = 300
input_l = 300
n_heads = 2
drop=0.5
batch_size=32
attn_drop=0.5
device=torch.device('cuda:0')

In [18]:
tga = TGA(input_l, hidden_l, n_heads=n_heads, drop=drop, attn_drop=attn_drop).to(device)
tga

TGA(
  (gat): GenericGAT(
    (encoders): ModuleDict(
      (label): Linear(in_features=300, out_features=300, bias=True)
      (term): Linear(in_features=300, out_features=300, bias=True)
    )
    (layers): ModuleList(
      (0): GATConv(
        (fc): Linear(in_features=300, out_features=600, bias=False)
        (feat_drop): Dropout(p=0.5, inplace=False)
        (attn_drop): Dropout(p=0.5, inplace=False)
        (leaky_relu): LeakyReLU(negative_slope=0.2)
        (res_fc): Identity()
      )
      (1): GATConv(
        (fc): Linear(in_features=300, out_features=600, bias=False)
        (feat_drop): Dropout(p=0.5, inplace=False)
        (attn_drop): Dropout(p=0.5, inplace=False)
        (leaky_relu): LeakyReLU(negative_slope=0.2)
        (res_fc): Identity()
      )
    )
  )
  (norm): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gate): Linear(in_features=300, out_features=1, bias=True)
  (feat): Linear(in_features=300, out_features=300, bias=Tr

In [19]:
loss_func_global = CosineDistanceLoss(reduction='mean', alpha=-0.5).to(device)
loss_func_local  = CosineDistanceLoss(reduction='mean').to(device)

optimizer = optim.AdamW( tga.parameters(), lr=5e-3, weight_decay=5e-3)


#RMSprop

In [None]:

best = None
nepochs = 10
lr = 2.*5e-2
best_loss = None
best_loss_local = None
best_loss_global = None
for e in tqdm(range(nepochs), total=nepochs):
    epoch_loss = 0
    data_loader = DataLoader(list(zip(fold.X_train, fold.y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate, num_workers=2)
    with tqdm(total=len(fold.y_train)) as pbar:
        total = 1
        correct = 1
        tga.train()
        for G, gs, y in data_loader:
            G = G.to( device )
            gs = gs.to( device )
            y = y.to( device )
            
            h_local, h_global = tga( G, gs, y )
            
            label_idx = G.ndata['label'].nonzero().flatten()
            labels_hiddens = h_global[label_idx]
            
            loss_global = loss_func_global(labels_hiddens)
            
            loss_local = loss_func_local(h_local, labels_hiddens, y)
            
            
            loss = loss_global + loss_local
            loss.backward()

            optimizer.step()
    
            if best_loss is None or loss.item() < best_loss:
                best_loss = loss.item()
                best_loss_local = loss_local.item()
                best_loss_global = loss_global.item()
                best_h_global = h_global.cpu()
            
            
            to_print = f'local loss: {loss_local:.3}({best_loss_local:.3}), '
            to_print += f'global loss: {loss_global-1.:.3}({best_loss_global-1.:.3}), '
            to_print += f'loss: {loss-1.:.3}({best_loss-1.:.3})             '
            
            print( to_print , end='\r')
            pbar.update( len(y) )
            pbar.set_description_str(f'iter {e} Best Loss: {best_loss-1.:.3}')
            
            del loss, loss_local, loss_global, labels_hiddens, label_idx, G, gs, y, h_local, h_global

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=22402.0), HTML(value='')))

local loss: 0.12(0.103), global loss: -0.0413(-0.0486), loss: 0.079(0.0541)                 

In [17]:
best_h_global

NameError: name 'h_global' is not defined

In [None]:
gs.ndata['idx']

In [None]:
gat.train()
best = None
label_idx = big_graph_dgl.ndata['label'].nonzero().flatten()
nepochs = 200
lr = 2.*5e-2
for e in tqdm(range(nepochs), total=nepochs):
    if e % 50 == 1:
        gat = best_model
        optimizer = optim.AdamW( gat.parameters(), lr=lr/2, weight_decay=5e-3)
    h = gat(big_graph_dgl, label=label_idx, term=range(len(label_idx),len(graph_builder.g)))
    h = norm(h)
    labels_hiddens = h[label_idx]
    
    loss = loss_func(labels_hiddens)
    loss.backward()
    
    optimizer.step()
    
    if best is None or loss.item() < best:
        best = loss.item()
        best_model = gat
        h_best = h.detach().cpu()
    
    print(f"epoch: {e}, loss: {loss.item()-1:.6} min_loss: {best-1:.6}")
    del labels_hiddens, loss, h
    if (best-1.) < -0.8:
        break

In [None]:
h_best[label_idx] # Embedding dos labels (Distâncias maximizadas)

In [None]:
                        # embedding dos termos novos
h_best[len(label_idx):] # o mapeamento é (o inverso de) node_mapper[big_graph.ndata['idx']]

In [None]:
h_best[100:105]

In [None]:
big_graph_dgl.ndata['emb'][100:105]

In [None]:
zdfgsrvsdtvhsfhvg

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import scatter

In [None]:
# Variância média DAS hidden dimensions dos labels &
#     -> O quanto as hidden estão variando entre as labels
#        Quanto maior esse valor, mais "diferente" são as representações dos LABELS
# Variância média ENTRE A MÉDIA DAS hidden dimensions dos labels
#     -> O quanto as hiddens estão variando entre si
#        Quanto maior esse valor, mais "diferente" são as representações das DIMENSÕES
#        Ou seja, VAR(AVG(h1), AVG(h2), ..., AVG(hn)) é alto
h_best[label_idx].std(axis=0).mean(), h_best[label_idx].mean(axis=0).std()

In [None]:
nbins = 30

In [None]:
plt.hist( h_best[label_idx].mean(axis=0).detach().cpu().numpy(), bins=nbins )
plt.title( 'Labels\n[VAR(h1), VAR(h2), ..., VAR(hn)]' )

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(verbose=10, metric='cosine')
X = tsne.fit_transform(h_best.detach().cpu().numpy())

In [None]:
dataset.dname

In [None]:
x,y = X.T
plt.figure(figsize=(10,8))
plt.scatter(x[len(label_idx):],y[len(label_idx):], linewidths= 0.0, s=25, alpha=0.025)
plt.scatter(x[label_idx.cpu().numpy()],y[label_idx.cpu().numpy()], marker='x')
plt.xlim( (x.min()-5, x.max()+5) )
plt.ylim( (y.min()-5, y.max()+5) )
plt.title(f'{dataset.dname} - heads={n_heads}')
plt.savefig(f'{dataset.dname}-heads={n_heads}_diff.png')

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(verbose=10)
X2 = tsne.fit_transform(big_graph_dgl.ndata['emb'][len(label_idx):].detach().cpu().numpy())

In [None]:
x,y = X2.T
plt.scatter(x, y, linewidths= 0.0, alpha=0.01)
plt.xlim( (x.min()-5, x.max()+5) )
plt.ylim( (y.min()-5, y.max()+5) )

In [None]:
x[label_idx.cpu().numpy()],y[label_idx.cpu().numpy()]

In [None]:
plt.hist( h_best[len(label_idx):].std(axis=0).detach().cpu().numpy(), bins=nbins )
plt.title( 'Terms\n[VAR(h1), VAR(h2), ..., VAR(hn)]' )

In [None]:
plt.hist( h_best[label_idx].mean(axis=0).detach().cpu().numpy(), bins=nbins )
plt.title( 'Labels\n[AVG(h1), AVG(h2), ..., AVG(hn)]' )

In [None]:
plt.hist( h_best[len(label_idx):].mean(axis=0).detach().cpu().numpy(), bins=nbins )
plt.title( 'Terms\n[AVG(h1), AVG(h2), ..., AVG(hn)]' )

In [None]:
label_idx = big_graph_dgl.ndata['label'].nonzero().flatten()

In [None]:
h = gat(big_graph_dgl)
h.shape

In [None]:
labels_hiddens = h[label_idx]
labels_hiddens

In [None]:
A = []
B = []
y = []
for i in range(labels_hiddens.size()[0]):
    for j in range(labels_hiddens.size()[0]):
        if i != j:
            A.append( labels_hiddens[i] )
            B.append( -1.*labels_hiddens[j] )
            y.append( 1 )
            
B=torch.cat(B).reshape( len(y), 300 )
A=torch.cat(A).reshape( len(y), 300 )
y=torch.Tensor(y)

In [None]:
loss_func(B, A, target=y), loss_func(A, B, target=y)

In [None]:
gat.train()
torch.cuda.synchronize()

In [None]:
class TGA(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_class,
                  n_heads=8, drop=.5, attn_drop=.5,
                  device='cuda:0'):
        super(TGA, self).__init__()
        self.n_class = n_class
        self.device = torch.device(device)
        self.gat_global = GenericGAT(in_dim, hidden_dim, 
                                     encoders={'label'}, 
                                     n_heads=n_heads, drop=drop,
                                     attn_drop=attn_drop, device=self.device)
        
        self.gat_local  = GenericGAT(hidden_dim, hidden_dim, 
                                     encoders={'term'}, 
                                     n_heads=n_heads, drop=drop,
                                     first_hidden='emb',
                                     attn_drop=attn_drop, device=self.device)

        self.lin = nn.Linear( hidden_dim, 1).to(self.device)
        # Depois tentar alguma ativação (ReLU, por exemplo, pode "desativar" alguns termos no softmax)
        self.pooling = GlobalAttentionPooling( self.lin ).to(self.device)

        # Fully Connected
        self.fc1 = nn.Linear( hidden_dim, hidden_dim//2).to(self.device)
        self.fc2 = nn.Linear( hidden_dim//2, hidden_dim//4).to(self.device)
        self.fc3 = nn.Linear( hidden_dim//4, self.n_class).to(self.device)
    def forward(self, G, gs):
        #h_global           = self.gat_global( G, label=G.ndata['label'].nonzero().flatten() )
        #gs.ndata['weight'] = h_global[ gs.ndata['idx'] ] # Tentar concatenando
        h_local            = self.gat_local(gs, term=None)
        #h_local            = torch.cat((h_local, h_global[ gs.ndata['idx'] ]), 1)
        h_local            = self.pooling( gs, h_local )
        h_local            = self.fc1( h_local )
        h_local            = self.fc2( h_local )
        h_local            = self.fc3( h_local )
        return h_local
# torch.Size([3652, 300]) torch.Size([3652, 300]) torch.Size([128, 300])
        

In [None]:
in_dim=300
hidden_dim=2
n_heads=8
drop=0.3
attn_drop=0.5
batch_size=128
device='cuda:0'

In [None]:
model = TGA( in_dim, hidden_dim, graph_builder.n_class,
            n_heads=n_heads, drop=drop, attn_drop=attn_drop )
model

In [None]:
loss_func = nn.CrossEntropyLoss()

optimizer = optim.Adam( model.parameters(), lr=1e-3, weight_decay=1e-3)
#optimizer = optim.AdamW( model.parameters(), lr=1e-2, weight_decay=1e-3)

#optimizer = optim.RMSprop( model.parameters(), lr=1e-2, weight_decay=1e-4)
#optimizer = optim.RMSprop( model.parameters(), lr=0.0001 )

model.train()
torch.cuda.synchronize()

In [None]:
g = nx.Graph()
g.add_nodes_from( [ (0, {'idx': 0}), (1, {'idx': 1}), (2, {'idx': 2}) ] )

In [None]:
list(nx.get_node_attributes(g,'idx').values())

In [None]:
outputs

In [None]:
labels_hiddens = torch.eye( 11 )
labels_hiddens