In [1]:
from utils import Dataset, GraphsizePretrained
from tqdm import tqdm
# NOTE(review): this second import rebinds the name `tqdm`, so the plain
# `tqdm` imported on the previous line is shadowed from here on.
from tqdm.notebook import tqdm
from time import time
import numpy as np
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

Using backend: pytorch


In [2]:
# Load the corpus used by the rest of the notebook.
# NOTE(review): the variable is called `webkb`, but the path points at the
# `20ng` directory -- confirm which corpus is actually intended.
dataset = webkb = Dataset('/home/mangaravite/Documentos/datasets/classification/datasets/20ng/')

In [3]:
# Only the first item yielded by `get_fold_instances(10)` is used below.
fold_iter = dataset.get_fold_instances(10)
fold = next(fold_iter)
# Show which splits the fold tuple exposes.
fold._fields

('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val')

In [4]:
%%time
graph_builder = GraphsizePretrained(w=3, verbose=True, pretrained_vec='/home/mangaravite/Documentos/pretrained_vectors/glove.6B.300d.txt')
Gs_train = graph_builder.fit_transform(fold.X_train)
Gs_val   = graph_builder.transform(fold.X_val)

400000it [00:30, 13018.87it/s]
100%|██████████| 15062/15062 [00:12<00:00, 1199.79it/s]
100%|██████████| 15062/15062 [00:19<00:00, 753.81it/s]
100%|██████████| 1892/1892 [00:01<00:00, 1201.56it/s]
100%|██████████| 1892/1892 [00:02<00:00, 653.82it/s]

CPU times: user 1min, sys: 1.57 s, total: 1min 2s
Wall time: 1min 8s





In [5]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

from sklearn.preprocessing import LabelEncoder

import torch.optim as optim
from torch.utils.data import DataLoader

In [6]:
# Encode the class labels: the encoder is fitted on the training labels and
# then applied unchanged to the validation labels.
le = LabelEncoder()
y_train, y_val = le.fit_transform(fold.y_train), le.transform(fold.y_val)

In [7]:
# Random initial vector for every class, one entry per original label in
# `le.classes_`. A dedicated, seeded generator is used so that the vectors are
# identical after every "Restart & Run All" instead of depending on the state
# of numpy's global RNG.
LABEL_SEED = 42
_label_rng = np.random.RandomState(LABEL_SEED)
label_vectors = { k: _label_rng.uniform( size=graph_builder.ndim ) for k in le.classes_ }

# Shallow copy of the builder's embedding dictionary, so later edits to the
# mapping itself do not touch the builder's own dictionary.
terms_vectors = graph_builder.embeddings_dict.copy()

In [8]:
class ClassifierGAT(nn.Module):
    """Two-layer graph-attention encoder returning one hidden vector per node.

    Node features are read from ``G.ndata['emb']``, projected to ``hidden_dim``
    by a linear encoder and passed through two residual ``GATConv`` layers.
    After each attention layer the per-head outputs are flattened and projected
    back down to ``hidden_dim``.

    Parameters
    ----------
    in_dim : int
        Size of the input node features.
    hidden_dim : int
        Size of the hidden (and output) node representation.
    n_classes : int
        Accepted for interface compatibility; currently unused because the
        module has no classification head yet.
    n_heads : int
        Number of attention heads in each ``GATConv`` layer.
    drop : float
        Feature dropout passed to ``GATConv`` as ``feat_drop``.
    attn_drop : float
        Attention dropout passed to ``GATConv`` as ``attn_drop``.
    device : str
        Device on which the sub-modules are created.
    """

    def __init__(self, in_dim, hidden_dim, n_classes, n_heads=8, drop=.5, attn_drop=.5, device='cuda:0'):
        super().__init__()
        target = torch.device(device)
        n_layers = 2

        self.encoder = nn.Linear(in_dim, hidden_dim).to(target)

        self.layers = nn.ModuleList([
            GATConv(hidden_dim, hidden_dim, residual=True, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=attn_drop)
            for _ in range(n_layers)
        ]).to(target)

        # Must be an nn.ModuleList (not a plain list): otherwise these layers are
        # invisible to model.parameters(), the optimizer, state_dict() and .to().
        self.down_proj = nn.ModuleList([
            nn.Linear(n_heads * hidden_dim, hidden_dim)
            for _ in range(n_layers)
        ]).to(target)

    def forward(self, G):
        """Return the hidden representation of every node of ``G``.

        ``G`` must expose the input node features as ``G.ndata['emb']``.
        The result has one row per node and ``hidden_dim`` columns.
        """
        # Convert to float32 on the same device as the weights; this also
        # accepts features that already live on the GPU.
        h = torch.as_tensor(G.ndata['emb'], dtype=torch.float32,
                            device=self.encoder.weight.device)
        he = self.encoder(h)
        h = he
        for conv, proj in zip(self.layers, self.down_proj):
            h = conv(G, h)
            # Merge the attention heads into one feature axis before projecting.
            h = h.flatten(1)
            h = proj(h)
        # TODO: concatenate `he` and `h` (not implemented yet).

        return h


In [9]:
def collate(samples):
    """Merge a mini-batch of (document graph, label) pairs into one DGL graph.

    The merged graph contains three kinds of nodes, all carrying an ``idx`` and
    an ``emb`` attribute:

    * ``('L', y)``  -- one node per encoded class in ``y_train``;
    * ``(i, w)``    -- node ``w`` of the i-th document graph of the batch,
      connected by that document's own edges;
    * ``('gt', w)`` -- one batch-wide node per distinct ``w``, connected to the
      label node of every document of the batch that contains ``w``.

    Reads the notebook globals ``y_train``, ``le``, ``label_vectors`` and
    ``terms_vectors``.

    Returns
    -------
    tuple
        ``(g_dgl, labels, node_idx, pairs)`` where ``g_dgl`` is the merged
        graph, ``labels`` is the batch label tensor on ``cuda:0``, ``node_idx``
        maps node keys to integer ids and ``pairs`` is a set of ``(id, key)``
        tuples (wrap it in ``dict()`` to get the inverse mapping).
    """
    Gs_Fs, labels = map(list, zip(*samples))
    device = torch.device('cuda:0')
    big_graph = nx.Graph()
    node_idx = { }

    # Label nodes come first so that they always get the lowest ids.
    classes = sorted(set(y_train))
    for y in classes:
        node_idx.setdefault( ('L', y), len(node_idx) )
    # `label_vectors` is keyed by the original labels in `le.classes_`, while
    # `y` is the encoded integer, so translate back before the lookup.
    big_graph.add_nodes_from(
        (node_idx[('L', y)], {'idx': node_idx[('L', y)], 'emb': label_vectors[le.classes_[y]]})
        for y in classes
    )

    # Pair every graph with the label of the *same* sample. Zipping with the
    # global `y_train` would only be right for the first unshuffled batch.
    runner = list(zip(Gs_Fs, labels))
    for i, (g, y) in enumerate(tqdm(runner, total=len(runner))):
        label_node = node_idx[('L', y)]

        # Document-local copies of the nodes, keyed by (position in batch, node).
        for w, att in g.nodes(data=True):
            nid = node_idx.setdefault( (i, w), len(node_idx) )
            big_graph.add_node(nid, idx=nid, emb=att['emb'])

        # Every edge endpoint is a node of `g`, so it already has an id.
        big_graph.add_edges_from( (node_idx[(i, s)], node_idx[(i, t)]) for (s, t) in g.edges )

        # Batch-wide term nodes, created once and linked to the document's label.
        for w, att in g.nodes(data=True):
            gt = node_idx.setdefault( ('gt', w), len(node_idx) )
            if gt not in big_graph:
                # Look the vector up by the term itself (the original used the
                # literal key 'emb'); presumably `terms_vectors` is keyed by
                # term -- fall back to the node's own vector when it is missing.
                big_graph.add_node(gt, idx=gt, emb=terms_vectors.get(w, att['emb']))
            big_graph.add_edge(label_node, gt)

    g_dgl = dgl.DGLGraph()
    g_dgl.from_networkx(big_graph, node_attrs=['idx','emb'] )
    # NOTE(review): depending on the DGL version `to` either moves the graph in
    # place or returns the moved graph; keep whichever one we get.
    moved = g_dgl.to(device)
    if moved is not None:
        g_dgl = moved

    labels = torch.tensor(labels).to(device)

    return g_dgl, labels, node_idx, { (v,k) for (k,v) in node_idx.items() }

In [10]:
# Hyper-parameters used by the model and data-loader cells:
# hidden size of the node representation, mini-batch size, attention heads.
hidden_dim, batch_size, n_heads = 300, 128, 4

In [11]:
# Build the network from the builder's feature size and the dataset's class
# count, then make sure the whole module sits on the first GPU.
model = ClassifierGAT(
    in_dim=graph_builder.ndim,
    hidden_dim=hidden_dim,
    n_classes=dataset.nclass,
    n_heads=n_heads,
    drop=.5,
    attn_drop=.3,
)
model = model.to(torch.device('cuda:0'))

In [12]:
# Put the model in training mode before creating the optimisation objects.
model.train()

loss_func = nn.CrossEntropyLoss()

# Adam with L2 weight decay is the active optimizer. Alternatives that were
# left commented out here: AdamW(lr=1e-2, weight_decay=1e-3),
# RMSprop(lr=1e-2, weight_decay=1e-4) and RMSprop(lr=0.0001).
optimizer = optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-3)

# Wait for pending GPU work to finish before moving on.
torch.cuda.synchronize()

In [None]:
# Training-loop skeleton. `best_score` and `n_iters` are initialised here but
# are not updated anywhere in this cell.
best_score = None
n_iters = 0
n_epochs = 3

for epoch in range(n_epochs):
    # The loader is rebuilt at the start of every epoch; `collate` merges each
    # batch of (graph, label) pairs into a single batched graph.
    data_loader = DataLoader(list(zip(Gs_train, y_train)), batch_size=batch_size,
                             shuffle=False, collate_fn=collate)
    epoch_loss = 0
    with tqdm(total=len(data_loader.dataset), smoothing=0.) as pbar:
        t0 = time()
        total = 0
        correct = 0
        model.train()
        # Each batch is the 4-tuple returned by `collate`:
        # (batched graph, label tensor, node-key -> id dict, set of (id, node-key) pairs).
        for i, (bg, label, node_idx, idx_node) in enumerate(data_loader):
            outputs = model(bg)
            pbar.update( len(label) )
            pbar.set_description_str(f'iter {epoch}')
            # NOTE(review): the loop stops after the first batch, and no loss,
            # backward pass or optimizer step is computed -- `epoch_loss`,
            # `total`, `correct` and `t0` are never used.
            break
        

HBox(children=(FloatProgress(value=0.0, max=15062.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))




In [None]:
# Inspect the 'emb' node features of `bg`, the batched graph left over from the
# last iteration of the training loop above.
bg.ndata['emb']