In [1]:
import warnings
warnings.filterwarnings('ignore')

from utils import Dataset, GraphsizePretrained
from tqdm import tqdm
from tqdm.notebook import tqdm
from time import time
import numpy as np

Using backend: pytorch


In [2]:
dataset = Dataset('/home/mangaravite/Documentos/datasets/classification/datasets/yelp_2015/')
fold = next(dataset.get_fold_instances(10, with_val=False))
fold._fields, len(fold.X_train)

(('X_train', 'y_train', 'X_test', 'y_test'), 630000)

In [3]:
%%time
graph_builder = GraphsizePretrained(w=2, verbose=True,
                   pretrained_vec='/home/mangaravite/Documentos/pretrained_vectors/glove.6B.300d.txt')

400000it [00:27, 14498.65it/s]


CPU times: user 23.4 s, sys: 818 ms, total: 24.3 s
Wall time: 27.9 s


In [4]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
import networkx as nx
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

from sklearn.preprocessing import LabelEncoder

from itertools import repeat

import torch.optim as optim
from torch.utils.data import DataLoader

In [None]:
%%time
graph_builder.fit(fold.X_train, fold.y_train)

100%|██████████| 630000/630000 [04:18<00:00, 2436.92it/s]


In [None]:
len(graph_builder.g.edges), len(graph_builder.g)

In [None]:
list(map(lambda x: (x,graph_builder.g.degree()[x]), graph_builder.label_ids))

In [None]:
class GenericGAT(nn.Module):
    def __init__(self, in_dim, hidden_dim,
                 drop=.5, n_heads=8, attn_drop=.5,
                 activation=F.leaky_relu, n_convs=2,
                 first_hidden='emb', encoders={'term','label'},
                 device='cpu:0'):
        super(GenericGAT, self).__init__()
        self.device = torch.device(device)
        self.first_hidden = first_hidden
        
        self.encoders = nn.ModuleDict({
            k: nn.Linear(in_dim, hidden_dim).to(self.device) for k in encoders
        })
        
        self.layers = nn.ModuleList([
            GATConv(hidden_dim, hidden_dim, residual=True, num_heads=n_heads, activation=activation,
                    feat_drop=drop, attn_drop=attn_drop).to(self.device) for _ in range(n_convs)
        ])
        self.down_proj = [
            nn.Linear(n_heads*hidden_dim, hidden_dim).to(self.device) for _ in range(n_convs)
        ]
        
    def forward(self, G, **kwargs):
        with G.local_scope():
            h = G.ndata[self.first_hidden].float()
            for (k, mask) in kwargs.items():
                if k in self.encoders:
                    if mask is not None:
                        h[ mask ] = self.encoders[k]( h[ mask ] )
                    else:
                        h = self.encoders[k]( h )

            for l, conv in enumerate(self.layers):
                h = conv(G, h)
                h = h.view(h.shape[0], -1)
                h = self.down_proj[l]( h )
        
        return h
        

In [9]:
hidden_l = 300
input_l = 300
n_heads = 1
drop=0.1
attn_drop=0.1

In [10]:
gat = GenericGAT( input_l, hidden_l, n_heads=n_heads,
                 drop=drop, attn_drop=attn_drop,
                 activation=None, device='cuda:0' ).to(torch.device('cuda:0'))
norm = nn.BatchNorm1d(hidden_l).to(torch.device('cuda:0'))

In [11]:
big_graph_dgl = dgl.DGLGraph()
big_graph_dgl.from_networkx(graph_builder.g, node_attrs=['emb', 'label', 'idx'] )
big_graph_dgl = big_graph_dgl.to(torch.device('cuda:0'))

In [12]:
loss_func = nn.CosineEmbeddingLoss(reduction='mean').to(torch.device('cuda:0'))
optimizer = optim.Adam( gat.parameters(), lr=1e-3, weight_decay=1e-3)

In [13]:
gat.train()
best = None
label_idx = big_graph_dgl.ndata['label'].nonzero().flatten()
nepochs = 100
for e in tqdm(range(nepochs), total=nepochs):
    h = gat(big_graph_dgl, label=label_idx, term=range(max(label_idx),len(graph_builder.g)))
    h = norm(h)
    labels_hiddens = h[label_idx]
    A = []
    B = []
    y = []
    for i in range(labels_hiddens.size()[0]):
        for j in range(labels_hiddens.size()[0]):
            if i != j:
                A.append( labels_hiddens[i] )
                B.append( -1.*labels_hiddens[j] )
                y.append( 1 )

    B=torch.cat(B).reshape( len(y), hidden_l ).to(torch.device('cuda:0'))
    A=torch.cat(A).reshape( len(y), hidden_l ).to(torch.device('cuda:0'))
    y=torch.Tensor(y).to(torch.device('cuda:0'))
    loss = loss_func(A, B, target=y)
    loss.backward()
    optimizer.step()
    if best is None or loss.item() < best:
        best = loss.item()
        h_best = h
    print(f"epoch: {e}, loss: {loss.item():.6} min_loss: {best:.6}")
    del labels_hiddens, loss, A, B, y

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

epoch: 0, loss: 1.80502 min_loss: 1.80502
epoch: 1, loss: 1.48892 min_loss: 1.48892
epoch: 2, loss: 1.38716 min_loss: 1.38716
epoch: 3, loss: 1.43224 min_loss: 1.38716
epoch: 4, loss: 1.36546 min_loss: 1.36546
epoch: 5, loss: 1.33608 min_loss: 1.33608
epoch: 6, loss: 1.29218 min_loss: 1.29218
epoch: 7, loss: 1.34255 min_loss: 1.29218
epoch: 8, loss: 1.49798 min_loss: 1.29218
epoch: 9, loss: 1.55063 min_loss: 1.29218
epoch: 10, loss: 1.47628 min_loss: 1.29218
epoch: 11, loss: 1.26797 min_loss: 1.26797
epoch: 12, loss: 1.22883 min_loss: 1.22883
epoch: 13, loss: 1.5424 min_loss: 1.22883
epoch: 14, loss: 1.73784 min_loss: 1.22883
epoch: 15, loss: 1.83025 min_loss: 1.22883
epoch: 16, loss: 1.86778 min_loss: 1.22883
epoch: 17, loss: 1.88311 min_loss: 1.22883
epoch: 18, loss: 1.88669 min_loss: 1.22883
epoch: 19, loss: 1.88263 min_loss: 1.22883
epoch: 20, loss: 1.88381 min_loss: 1.22883
epoch: 21, loss: 1.8828 min_loss: 1.22883
epoch: 22, loss: 1.85857 min_loss: 1.22883
epoch: 23, loss: 1.8307

In [14]:
                        # embedding dos termos novos
h_best[max(label_idx):] # o mapeamento é (o inverso de) node_mapper[big_graph.ndata['idx']]

tensor([[ 0.5354, -0.5028, -0.5080,  ..., -0.4964,  0.4754, -0.4008],
        [ 0.7030, -1.2909, -1.6529,  ..., -1.6897,  1.6250, -1.7665],
        [-0.7637,  0.8068,  0.9326,  ...,  1.0443, -0.9287,  0.9200],
        ...,
        [ 0.7127, -1.3228, -1.6368,  ..., -1.6751,  1.6088, -1.7635],
        [ 0.3317, -0.2485, -0.9960,  ..., -1.3316,  0.9007, -0.9528],
        [ 0.7035, -1.2930, -1.6516,  ..., -1.6952,  1.6156, -1.7668]],
       device='cuda:0', grad_fn=<SliceBackward>)

In [15]:
h_best[label_idx]

tensor([[ 1.1972, -1.4628, -1.6480,  ..., -1.8050,  1.4865, -1.6164],
        [-0.2668,  0.2272,  0.3561,  ...,  0.4399, -0.4981,  0.4651],
        [-1.0959,  0.7677,  0.9481,  ..., -0.3196, -1.4137,  1.3107],
        ...,
        [-0.1302, -0.9735, -0.3024,  ..., -0.3871,  0.1087, -0.0579],
        [ 1.1995, -1.3963, -1.7410,  ..., -1.7015,  1.5383, -1.6510],
        [ 0.5354, -0.5028, -0.5080,  ..., -0.4964,  0.4754, -0.4008]],
       device='cuda:0', grad_fn=<IndexBackward>)

In [16]:
h_best[100:105]

tensor([[-0.7596,  0.7932,  0.9402,  ...,  1.0531, -0.9367,  0.9215],
        [-0.7945,  0.8056,  0.9314,  ...,  1.0430, -0.9239,  0.9240],
        [-0.7648,  0.8128,  0.9231,  ...,  1.0358, -0.9195,  0.9217],
        [-0.7666,  0.8024,  0.9314,  ...,  1.0409, -0.9189,  0.9247],
        [-0.7519,  0.8076,  0.9358,  ...,  1.0398, -0.9330,  0.9264]],
       device='cuda:0', grad_fn=<SliceBackward>)

In [17]:
big_graph_dgl.ndata['emb'][100:105]

tensor([[-0.0614,  0.8113, -0.3445,  ..., -0.5139,  0.1250,  0.2937],
        [ 0.2751,  0.5761,  0.0962,  ..., -0.3611, -0.0418, -0.3528],
        [-0.4215,  0.6595,  0.2708,  ..., -0.5453,  0.2746, -0.1018],
        [ 0.1521,  0.5283,  0.4525,  ..., -0.4160,  0.2137, -0.1511],
        [ 0.3702, -0.2849,  0.7021,  ..., -0.6174, -0.0713, -0.2659]],
       device='cuda:0', dtype=torch.float64)

In [18]:
zdfgsrvsdtvhsfhvg

NameError: name 'zdfgsrvsdtvhsfhvg' is not defined

In [None]:
from matplotlib.pyplot import scatter

In [None]:
x,y = h_best[label_idx].detach().numpy().T
scatter(x,y)

In [None]:

x,y = h_best[max(label_idx):].detach().numpy().T
scatter(x,y)

In [None]:
label_idx = big_graph_dgl.ndata['label'].nonzero().flatten()

In [None]:
h = gat(big_graph_dgl)
h.shape

In [None]:
labels_hiddens = h[label_idx]
labels_hiddens

In [None]:
A = []
B = []
y = []
for i in range(labels_hiddens.size()[0]):
    for j in range(labels_hiddens.size()[0]):
        if i != j:
            A.append( labels_hiddens[i] )
            B.append( -1.*labels_hiddens[j] )
            y.append( 1 )
            
B=torch.cat(B).reshape( len(y), 300 )
A=torch.cat(A).reshape( len(y), 300 )
y=torch.Tensor(y)

In [None]:
loss_func(B, A, target=y), loss_func(A, B, target=y)

In [None]:
gat.train()
torch.cuda.synchronize()

In [None]:
class TGA(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_class,
                  n_heads=8, drop=.5, attn_drop=.5,
                  device='cuda:0'):
        super(TGA, self).__init__()
        self.n_class = n_class
        self.device = torch.device(device)
        self.gat_global = GenericGAT(in_dim, hidden_dim, 
                                     encoders={'label'}, 
                                     n_heads=n_heads, drop=drop,
                                     attn_drop=attn_drop, device=self.device)
        
        self.gat_local  = GenericGAT(hidden_dim, hidden_dim, 
                                     encoders={'term'}, 
                                     n_heads=n_heads, drop=drop,
                                     first_hidden='emb',
                                     attn_drop=attn_drop, device=self.device)

        self.lin = nn.Linear( hidden_dim, 1).to(self.device)
        # Depois tentar alguma ativação (ReLU, por exemplo, pode "desativar" alguns termos no softmax)
        self.pooling = GlobalAttentionPooling( self.lin ).to(self.device)

        # Fully Connected
        self.fc1 = nn.Linear( hidden_dim, hidden_dim//2).to(self.device)
        self.fc2 = nn.Linear( hidden_dim//2, hidden_dim//4).to(self.device)
        self.fc3 = nn.Linear( hidden_dim//4, self.n_class).to(self.device)
    def forward(self, G, gs):
        #h_global           = self.gat_global( G, label=G.ndata['label'].nonzero().flatten() )
        #gs.ndata['weight'] = h_global[ gs.ndata['idx'] ] # Tentar concatenando
        h_local            = self.gat_local(gs, term=None)
        #h_local            = torch.cat((h_local, h_global[ gs.ndata['idx'] ]), 1)
        h_local            = self.pooling( gs, h_local )
        h_local            = self.fc1( h_local )
        h_local            = self.fc2( h_local )
        h_local            = self.fc3( h_local )
        return h_local
# torch.Size([3652, 300]) torch.Size([3652, 300]) torch.Size([128, 300])
        

In [None]:
in_dim=300
hidden_dim=2
n_heads=8
drop=0.3
attn_drop=0.5
batch_size=128
device='cuda:0'

In [None]:
model = TGA( in_dim, hidden_dim, graph_builder.n_class,
            n_heads=n_heads, drop=drop, attn_drop=attn_drop )
model

In [None]:
def collate(param):
    X, y = zip(*param)
    Gs_nx = graph_builder.transform(X)
    
    Gs_dgl_list = []
    idx_terms = { l for l in graph_builder.label_ids }
    for g in Gs_nx:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g, node_attrs=['emb', 'idx'] )
        Gs_dgl_list.append( g_dgl )
        
        idx_terms = idx_terms.union( set(nx.get_node_attributes(g,'idx').values()) )
    
    Gs_dgl = dgl.batch(Gs_dgl_list)
    
    big_graph_dgl = dgl.DGLGraph()
    big_graph_dgl.from_networkx(graph_builder.g, node_attrs=['emb', 'label', 'idx'] )
    
    #subgraph = graph_builder.g.subgraph(idx_terms)
    #big_graph_dgl.from_networkx(subgraph, node_attrs=['emb', 'label', 'idx'] )
    
    return big_graph_dgl, Gs_dgl, torch.tensor(y)

In [None]:
loss_func = nn.CrossEntropyLoss()

optimizer = optim.Adam( model.parameters(), lr=1e-3, weight_decay=1e-3)
#optimizer = optim.AdamW( model.parameters(), lr=1e-2, weight_decay=1e-3)

#optimizer = optim.RMSprop( model.parameters(), lr=1e-2, weight_decay=1e-4)
#optimizer = optim.RMSprop( model.parameters(), lr=0.0001 )

model.train()
torch.cuda.synchronize()

In [None]:
n_epochs = 10

for epoch in range(n_epochs):
    epoch_loss = 0
    data_loader = DataLoader(list(zip(fold.X_train, fold.y_train)), batch_size=batch_size,
                             shuffle=True, collate_fn=collate, num_workers=2)
    with tqdm(total=len(fold.y_train)) as pbar:
        total = 1
        correct = 1
        model.train()
        for G, gs, y in data_loader:
            G = G.to( torch.device('cuda:0') )
            gs = gs.to( torch.device('cuda:0') )
            y = y.to( torch.device('cuda:0') )
            
            optimizer.zero_grad()
            
            outputs = model( G, gs )
            probs_Y = torch.softmax(outputs, 1)
            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
            
            total += y.size(0)
            correct += (sampled_Y == y).sum().item()
            
            del probs_Y, sampled_Y, G, gs
            
            # NN backprop phase
            loss = loss_func(outputs, y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            
            pbar.update( len(y) )
            pbar.set_description_str(f'iter {epoch} Acc train: {correct/total:.3}')

In [None]:
g = nx.Graph()
g.add_nodes_from( [ (0, {'idx': 0}), (1, {'idx': 1}), (2, {'idx': 2}) ] )

In [None]:
list(nx.get_node_attributes(g,'idx').values())

In [None]:
outputs

In [None]:
labels_hiddens = torch.eye( 11 )
labels_hiddens