In [1]:
import numpy as np
from tqdm import tqdm
import torch
from torch_geometric.data import Data

In [2]:
from tqdm import trange


with open('txt_graphs/indochina-2004.graph-txt', 'r') as f:
    line = f.readline()
    line_tot = int(line.split()[0])
    print("{} lines".format(line_tot))
    out_nodes = [0] * line_tot
    in_nodes = [0] * line_tot
    out_degree = [0] * line_tot
    for i in trange(line_tot):
        line = f.readline()
        if line[0] == '\n':
            in_ = np.array([i])
            out_ = np.array([i])
        else:
            in_ = np.fromstring(line, dtype=int, sep=' ')
            out_ = np.ones(len(in_), dtype=int) * i
        out_nodes[i] = out_
        in_nodes[i] = in_
        out_degree[i] = len(out_)


out_nodes = np.hstack(out_nodes)
in_nodes = np.hstack(in_nodes)

  0%|          | 6548/7414866 [00:00<01:53, 65479.91it/s]

7414866 lines


100%|██████████| 7414866/7414866 [01:26<00:00, 85300.80it/s] 


In [4]:
max(out_degree)

6985

In [5]:
unique_elements, in_degree = np.unique(in_nodes, return_counts=True)

In [6]:
sorted(in_degree)[-10:]

[101479,
 101500,
 101948,
 102027,
 150208,
 154369,
 176790,
 178007,
 179280,
 256426]

In [7]:
in_degree[:10]

array([1, 2, 5, 3, 6, 1, 1, 4, 2, 1])

In [10]:
np.array(out_degree).shape

(7414866,)

In [11]:
np.array(in_degree).shape

(7414677,)

In [12]:
in_nodes.shape

(195418438,)

In [13]:
out_nodes.shape

(195418438,)

In [15]:
unique_elements[-5:]

array([7414861, 7414862, 7414863, 7414864, 7414865])

In [17]:
temp = np.setdiff1d(np.arange(7414866), unique_elements)

In [18]:
temp.shape

(189,)

In [19]:
temp[:5]

array([ 51252, 308657, 643079, 784345, 894429])

In [23]:
a = np.zeros(len(out_degree))
a[unique_elements] = in_degree

In [24]:
a.shape

(7414866,)

In [25]:
edge_index = torch.tensor(np.vstack((out_nodes, in_nodes)), dtype=torch.long)
x = torch.tensor(np.vstack((out_degree, a)).T, dtype=torch.float)
data = Data(x=x, edge_index=edge_index)

In [26]:
data.num_nodes

7414866

In [27]:
data.num_edges

195418438

In [None]:
data.contains_isolated_nodes()

In [None]:
data.contains_self_loops()

In [None]:
data.is_directed()

In [28]:
def train(loader, data, model, optimizer, device='cuda'):
    model.train()
    total_loss = 0
    for subset in loader:
        optimizer.zero_grad()
        loss = model.loss(data.edge_index, subset.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [15]:
from torch.utils.data import DataLoader
from torch_geometric.nn import Node2Vec

loader = DataLoader(torch.arange(data.num_nodes), batch_size=128, shuffle=True)
device = 'cuda'
model = Node2Vec(data.num_nodes, embedding_dim=64, walk_length=20,
                 context_size=10, walks_per_node=10)
model, data = model.to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in trange(1, 20):
    loss = train(loader, data, model, optimizer)
    print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))

  5%|▌         | 1/19 [00:47<14:08, 47.15s/it]

Epoch: 01, Loss: 1.6546


 11%|█         | 2/19 [01:28<12:53, 45.48s/it]

Epoch: 02, Loss: 0.8097


 16%|█▌        | 3/19 [02:10<11:48, 44.28s/it]

Epoch: 03, Loss: 0.7218


 21%|██        | 4/19 [02:51<10:52, 43.49s/it]

Epoch: 04, Loss: 0.6957


 26%|██▋       | 5/19 [03:33<10:00, 42.92s/it]

Epoch: 05, Loss: 0.6849


 32%|███▏      | 6/19 [04:14<09:12, 42.48s/it]

Epoch: 06, Loss: 0.6808


 37%|███▋      | 7/19 [04:56<08:26, 42.18s/it]

Epoch: 07, Loss: 0.6785


 42%|████▏     | 8/19 [05:37<07:41, 41.97s/it]

Epoch: 08, Loss: 0.6765


 47%|████▋     | 9/19 [06:19<06:58, 41.83s/it]

Epoch: 09, Loss: 0.6718


 53%|█████▎    | 10/19 [07:00<06:15, 41.73s/it]

Epoch: 10, Loss: 0.6656


 58%|█████▊    | 11/19 [07:42<05:33, 41.66s/it]

Epoch: 11, Loss: 0.6651


 63%|██████▎   | 12/19 [08:23<04:51, 41.61s/it]

Epoch: 12, Loss: 0.6655


 68%|██████▊   | 13/19 [09:05<04:09, 41.58s/it]

Epoch: 13, Loss: 0.6662


 74%|███████▎  | 14/19 [09:46<03:27, 41.55s/it]

Epoch: 14, Loss: 0.6671


 79%|███████▉  | 15/19 [10:28<02:46, 41.54s/it]

Epoch: 15, Loss: 0.6677


 84%|████████▍ | 16/19 [11:09<02:04, 41.52s/it]

Epoch: 16, Loss: 0.6677


 89%|████████▉ | 17/19 [11:51<01:23, 41.52s/it]

Epoch: 17, Loss: 0.6668


 95%|█████████▍| 18/19 [12:32<00:41, 41.50s/it]

Epoch: 18, Loss: 0.6650


100%|██████████| 19/19 [13:14<00:00, 41.50s/it]

Epoch: 19, Loss: 0.6620





In [16]:
sub = torch.tensor([0, 1, 2])
sub = sub.to(device)
model.forward(sub)

tensor([[ 2.0825e-01,  1.3607e-01,  4.9600e-02, -6.0241e-01, -2.4672e-03,
          8.2338e-02,  1.7288e-01,  5.9137e-02, -4.6599e-02,  4.9159e-01,
         -5.9120e-01, -3.9939e-01,  4.5430e-02,  3.9232e-01,  3.9860e-01,
         -8.2821e-02,  4.7175e-01,  2.8038e-01, -6.8810e-02, -2.1342e-01,
          3.5228e-02,  1.9292e-01, -1.5499e-01, -3.8864e-01, -3.8246e-01,
          6.5634e-02,  6.4747e-01,  2.6628e-01,  3.4164e-01, -2.5770e-01,
          2.2271e-01, -3.8287e-01,  3.1442e-01, -3.3332e-01,  2.1094e-01,
         -2.5800e-01,  1.1145e-03,  3.3403e-01, -2.1289e-01, -2.0785e-01,
          4.7928e-01,  2.2601e-01,  1.7835e-01, -7.7802e-01,  1.4940e-01,
         -2.7403e-01, -1.6797e-01,  5.2571e-01,  3.8486e-01,  2.6403e-01,
          4.7272e-02, -6.3045e-01, -6.5102e-02,  4.0672e-01,  1.7066e-01,
          3.5464e-01,  1.7278e-01,  8.0207e-02,  4.3153e-01, -1.8736e-01,
         -4.3730e-01, -1.4869e-01, -3.3755e-01, -1.3141e-01],
        [-2.5264e-01, -2.4382e-01,  1.6596e-01, -2

In [17]:
data.x.shape

torch.Size([325557, 2])

In [18]:
import faiss

n = data.x.shape[0]
sub = torch.tensor([i for i in range(n)])
sub = sub.to(device)
xb = model.forward(sub).cpu().detach().numpy()
nb, d = xb.shape
index = faiss.IndexFlatL2(d)   # build the index
print(index.is_trained)
index.add(xb)                  # add vectors to the index
print(index.ntotal)

True
325557


In [19]:
k = 4                          # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check
print(I)
print(D)

[[     0     81    189    269]
 [     1     57    274    255]
 [     2      7    187    204]
 [     3     24    187    114]
 [     4 295695 297677 290495]]
[[0.        3.123289  3.244448  3.2812066]
 [0.        2.9738853 2.9919443 3.007856 ]
 [0.        3.922614  4.097231  4.4069276]
 [0.        3.733567  3.7458014 3.8158584]
 [0.        3.7408037 3.8452568 3.8463328]]


In [20]:
help(Node2Vec)

Help on class Node2Vec in module torch_geometric.nn.models.node2vec:

class Node2Vec(torch.nn.modules.module.Module)
 |  The Node2Vec model from the
 |  `"node2vec: Scalable Feature Learning for Networks"
 |  <https://arxiv.org/abs/1607.00653>`_ paper where random walks of
 |  length :obj:`walk_length` are sampled in a given graph, and node embeddings
 |  are learned via negative sampling optimization.
 |  
 |  Args:
 |      num_nodes (int): The number of nodes.
 |      embedding_dim (int): The size of each embedding vector.
 |      walk_length (int): The walk length.
 |      context_size (int): The actual context size which is considered for
 |          positive samples. This parameter increases the effective sampling
 |          rate by reusing samples across different source nodes.
 |      walks_per_node (int, optional): The number of walks to sample for each
 |          node. (default: :obj:`1`)
 |      p (float, optional): Likelihood of immediately revisiting a node in the
 |     

In [23]:
model2 = Node2Vec(data.num_nodes, embedding_dim=32, walk_length=30,
                 context_size=20, walks_per_node=10, p=1, q=2)
model2 = model2.to(device)

for epoch in trange(1, 30):
    loss = train(loader, data, model2)
    print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))



  0%|          | 0/29 [00:00<?, ?it/s][A[A

  3%|▎         | 1/29 [00:46<21:43, 46.55s/it][A[A

Epoch: 01, Loss: 3.6646




  7%|▋         | 2/29 [01:33<20:57, 46.56s/it][A[A

Epoch: 02, Loss: 3.6648




 10%|█         | 3/29 [02:19<20:10, 46.57s/it][A[A

Epoch: 03, Loss: 3.6646




 14%|█▍        | 4/29 [03:06<19:24, 46.57s/it][A[A

Epoch: 04, Loss: 3.6646




 17%|█▋        | 5/29 [03:52<18:38, 46.60s/it][A[A

Epoch: 05, Loss: 3.6652




 21%|██        | 6/29 [04:39<17:51, 46.57s/it][A[A

Epoch: 06, Loss: 3.6653




 24%|██▍       | 7/29 [05:25<17:04, 46.55s/it][A[A

Epoch: 07, Loss: 3.6641




 28%|██▊       | 8/29 [06:12<16:17, 46.54s/it][A[A

Epoch: 08, Loss: 3.6645




 31%|███       | 9/29 [06:58<15:30, 46.53s/it][A[A

Epoch: 09, Loss: 3.6650




 34%|███▍      | 10/29 [07:45<14:44, 46.56s/it][A[A

Epoch: 10, Loss: 3.6640




 38%|███▊      | 11/29 [08:32<13:58, 46.59s/it][A[A

Epoch: 11, Loss: 3.6649




 41%|████▏     | 12/29 [09:18<13:12, 46.60s/it][A[A

Epoch: 12, Loss: 3.6638




 45%|████▍     | 13/29 [10:05<12:25, 46.62s/it][A[A

Epoch: 13, Loss: 3.6649




 48%|████▊     | 14/29 [10:52<11:39, 46.63s/it][A[A

Epoch: 14, Loss: 3.6649




 52%|█████▏    | 15/29 [11:38<10:53, 46.65s/it][A[A

Epoch: 15, Loss: 3.6638




 55%|█████▌    | 16/29 [12:25<10:06, 46.66s/it][A[A

Epoch: 16, Loss: 3.6645




 59%|█████▊    | 17/29 [13:12<09:19, 46.65s/it][A[A

Epoch: 17, Loss: 3.6650




 62%|██████▏   | 18/29 [13:58<08:32, 46.63s/it][A[A

Epoch: 18, Loss: 3.6639




 66%|██████▌   | 19/29 [14:45<07:46, 46.64s/it][A[A

Epoch: 19, Loss: 3.6648




 69%|██████▉   | 20/29 [15:31<06:59, 46.59s/it][A[A

Epoch: 20, Loss: 3.6643




 72%|███████▏  | 21/29 [16:18<06:12, 46.56s/it][A[A

Epoch: 21, Loss: 3.6645




 76%|███████▌  | 22/29 [17:04<05:25, 46.54s/it][A[A

Epoch: 22, Loss: 3.6649




 79%|███████▉  | 23/29 [17:51<04:39, 46.52s/it][A[A

Epoch: 23, Loss: 3.6642




 83%|████████▎ | 24/29 [18:37<03:52, 46.51s/it][A[A

Epoch: 24, Loss: 3.6644




 86%|████████▌ | 25/29 [19:24<03:06, 46.51s/it][A[A

Epoch: 25, Loss: 3.6643




 90%|████████▉ | 26/29 [20:10<02:19, 46.50s/it][A[A

Epoch: 26, Loss: 3.6648




 93%|█████████▎| 27/29 [20:57<01:33, 46.51s/it][A[A

Epoch: 27, Loss: 3.6647




 97%|█████████▋| 28/29 [21:43<00:46, 46.52s/it][A[A

Epoch: 28, Loss: 3.6648




100%|██████████| 29/29 [22:30<00:00, 46.53s/it][A[A

Epoch: 29, Loss: 3.6648


In [31]:
model3 = Node2Vec(data.num_nodes, embedding_dim=32, walk_length=30,
                 context_size=20, walks_per_node=10, p=1, q=2,
                 num_negative_samples=40)
optimizer = torch.optim.Adam(model3.parameters(), lr=0.005)

model3 = model3.to(device)

for epoch in trange(1, 30):
    loss = train(loader, data, model3, optimizer)
    print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))







  0%|          | 0/29 [00:00<?, ?it/s][A[A[A[A[A[A





  3%|▎         | 1/29 [00:45<21:09, 45.34s/it][A[A[A[A[A[A

Epoch: 01, Loss: 1.3618








  7%|▋         | 2/29 [01:30<20:24, 45.35s/it][A[A[A[A[A[A

Epoch: 02, Loss: 0.8059








 10%|█         | 3/29 [02:16<19:39, 45.35s/it][A[A[A[A[A[A

Epoch: 03, Loss: 0.7236








 14%|█▍        | 4/29 [03:01<18:54, 45.36s/it][A[A[A[A[A[A

Epoch: 04, Loss: 0.6831








 17%|█▋        | 5/29 [03:46<18:08, 45.35s/it][A[A[A[A[A[A

Epoch: 05, Loss: 0.6679








 21%|██        | 6/29 [04:32<17:23, 45.36s/it][A[A[A[A[A[A

Epoch: 06, Loss: 0.6596








 24%|██▍       | 7/29 [05:17<16:37, 45.34s/it][A[A[A[A[A[A

Epoch: 07, Loss: 0.6554








 28%|██▊       | 8/29 [06:02<15:52, 45.34s/it][A[A[A[A[A[A

Epoch: 08, Loss: 0.6533








 31%|███       | 9/29 [06:48<15:06, 45.33s/it][A[A[A[A[A[A

Epoch: 09, Loss: 0.6518








 34%|███▍      | 10/29 [07:33<14:21, 45.34s/it][A[A[A[A[A[A

Epoch: 10, Loss: 0.6506








 38%|███▊      | 11/29 [08:18<13:36, 45.34s/it][A[A[A[A[A[A

Epoch: 11, Loss: 0.6490








 41%|████▏     | 12/29 [09:04<12:50, 45.35s/it][A[A[A[A[A[A

Epoch: 12, Loss: 0.6477








 45%|████▍     | 13/29 [09:49<12:05, 45.33s/it][A[A[A[A[A[A

Epoch: 13, Loss: 0.6459








 48%|████▊     | 14/29 [10:34<11:19, 45.33s/it][A[A[A[A[A[A

Epoch: 14, Loss: 0.6437








 52%|█████▏    | 15/29 [11:20<10:34, 45.34s/it][A[A[A[A[A[A

Epoch: 15, Loss: 0.6415








 55%|█████▌    | 16/29 [12:05<09:49, 45.33s/it][A[A[A[A[A[A

Epoch: 16, Loss: 0.6402








 59%|█████▊    | 17/29 [12:50<09:03, 45.33s/it][A[A[A[A[A[A

Epoch: 17, Loss: 0.6399








 62%|██████▏   | 18/29 [13:36<08:18, 45.35s/it][A[A[A[A[A[A

Epoch: 18, Loss: 0.6355








 66%|██████▌   | 19/29 [14:21<07:33, 45.35s/it][A[A[A[A[A[A

Epoch: 19, Loss: 0.6310








 69%|██████▉   | 20/29 [15:06<06:48, 45.36s/it][A[A[A[A[A[A

Epoch: 20, Loss: 0.6292








 72%|███████▏  | 21/29 [15:52<06:03, 45.39s/it][A[A[A[A[A[A

Epoch: 21, Loss: 0.6287








 76%|███████▌  | 22/29 [16:37<05:17, 45.42s/it][A[A[A[A[A[A

Epoch: 22, Loss: 0.6284








 79%|███████▉  | 23/29 [17:23<04:32, 45.44s/it][A[A[A[A[A[A

Epoch: 23, Loss: 0.6284








 83%|████████▎ | 24/29 [18:08<03:47, 45.45s/it][A[A[A[A[A[A

Epoch: 24, Loss: 0.6285








 86%|████████▌ | 25/29 [18:54<03:01, 45.45s/it][A[A[A[A[A[A

Epoch: 25, Loss: 0.6287








 90%|████████▉ | 26/29 [19:39<02:16, 45.44s/it][A[A[A[A[A[A

Epoch: 26, Loss: 0.6289








 93%|█████████▎| 27/29 [20:25<01:30, 45.42s/it][A[A[A[A[A[A

Epoch: 27, Loss: 0.6290








 97%|█████████▋| 28/29 [21:10<00:45, 45.43s/it][A[A[A[A[A[A

Epoch: 28, Loss: 0.6288








100%|██████████| 29/29 [21:55<00:00, 45.43s/it][A[A[A[A[A[A

Epoch: 29, Loss: 0.6286


In [54]:
def train_search(data, model, device='cuda'):
    n = data.x.shape[0]
    sub = torch.tensor([i for i in range(n)])
    sub = sub.to(device)
    # get embedding for the training nodes
    xb = model.forward(sub).cpu().detach().numpy()
    # train similarity search model
    nb, d = xb.shape
    index = faiss.IndexFlatL2(d)   # build the index
    print("Index trained: {}".format(index.is_trained))
    index.add(xb)                  # add vectors to the index
    print("Index total: {}".format(index.ntotal))
    return xb, index

In [56]:
xb, ind = train_search(data, model3)
print(xb.shape)
k = 4
D, I = ind.search(xb[:5], k) # sanity check

Index trained: True
Index total: 325557
(325557, 32)


In [51]:
I

array([[     0,     81,    189,    269],
       [     1,     57,    274,    255],
       [     2,      7,    187,    204],
       [     3,     24,    187,    114],
       [     4, 295695, 297677, 290495]])

In [57]:
model4 = Node2Vec(data.num_nodes, embedding_dim=64, walk_length=30,
                 context_size=20, walks_per_node=10, p=1, q=2,
                 num_negative_samples=50)
optimizer = torch.optim.Adam(model4.parameters(), lr=0.001)
model4 = model4.to(device)

for epoch in trange(1, 50):
    loss = train(loader, data, model4, optimizer)
    print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))







  0%|          | 0/49 [00:00<?, ?it/s][A[A[A[A[A[A





  2%|▏         | 1/49 [00:56<45:23, 56.74s/it][A[A[A[A[A[A

Epoch: 01, Loss: 3.4679








  4%|▍         | 2/49 [01:53<44:27, 56.76s/it][A[A[A[A[A[A

Epoch: 02, Loss: 2.1328








  6%|▌         | 3/49 [02:50<43:31, 56.78s/it][A[A[A[A[A[A

Epoch: 03, Loss: 1.5820








  8%|▊         | 4/49 [03:47<42:35, 56.78s/it][A[A[A[A[A[A

Epoch: 04, Loss: 1.2647








 10%|█         | 5/49 [04:43<41:39, 56.80s/it][A[A[A[A[A[A

Epoch: 05, Loss: 1.0685








 12%|█▏        | 6/49 [05:40<40:41, 56.77s/it][A[A[A[A[A[A

Epoch: 06, Loss: 0.9428








 14%|█▍        | 7/49 [06:37<39:44, 56.78s/it][A[A[A[A[A[A

Epoch: 07, Loss: 0.8602








 16%|█▋        | 8/49 [07:34<38:48, 56.78s/it][A[A[A[A[A[A

Epoch: 08, Loss: 0.7989








 18%|█▊        | 9/49 [08:31<37:51, 56.80s/it][A[A[A[A[A[A

Epoch: 09, Loss: 0.7487








 20%|██        | 10/49 [09:27<36:53, 56.77s/it][A[A[A[A[A[A

Epoch: 10, Loss: 0.7118








 22%|██▏       | 11/49 [10:24<35:57, 56.79s/it][A[A[A[A[A[A

Epoch: 11, Loss: 0.6853








 24%|██▍       | 12/49 [11:21<35:00, 56.78s/it][A[A[A[A[A[A

Epoch: 12, Loss: 0.6665








 27%|██▋       | 13/49 [12:18<34:04, 56.79s/it][A[A[A[A[A[A

Epoch: 13, Loss: 0.6529








 29%|██▊       | 14/49 [13:15<33:07, 56.80s/it][A[A[A[A[A[A

Epoch: 14, Loss: 0.6428








 31%|███       | 15/49 [14:11<32:10, 56.79s/it][A[A[A[A[A[A

Epoch: 15, Loss: 0.6352








 33%|███▎      | 16/49 [15:08<31:14, 56.79s/it][A[A[A[A[A[A

Epoch: 16, Loss: 0.6292








 35%|███▍      | 17/49 [16:05<30:17, 56.78s/it][A[A[A[A[A[A

Epoch: 17, Loss: 0.6246








 37%|███▋      | 18/49 [17:02<29:20, 56.79s/it][A[A[A[A[A[A

Epoch: 18, Loss: 0.6208








 39%|███▉      | 19/49 [17:59<28:24, 56.81s/it][A[A[A[A[A[A

Epoch: 19, Loss: 0.6177








 41%|████      | 20/49 [18:55<27:27, 56.80s/it][A[A[A[A[A[A

Epoch: 20, Loss: 0.6151








 43%|████▎     | 21/49 [19:52<26:30, 56.82s/it][A[A[A[A[A[A

Epoch: 21, Loss: 0.6130








 45%|████▍     | 22/49 [20:49<25:33, 56.80s/it][A[A[A[A[A[A

Epoch: 22, Loss: 0.6112








 47%|████▋     | 23/49 [21:46<24:36, 56.80s/it][A[A[A[A[A[A

Epoch: 23, Loss: 0.6098








 49%|████▉     | 24/49 [22:43<23:40, 56.81s/it][A[A[A[A[A[A

Epoch: 24, Loss: 0.6085








 51%|█████     | 25/49 [23:39<22:43, 56.80s/it][A[A[A[A[A[A

Epoch: 25, Loss: 0.6073








 53%|█████▎    | 26/49 [24:36<21:45, 56.77s/it][A[A[A[A[A[A

Epoch: 26, Loss: 0.6067








 55%|█████▌    | 27/49 [25:33<20:49, 56.78s/it][A[A[A[A[A[A

Epoch: 27, Loss: 0.6062








 57%|█████▋    | 28/49 [26:30<19:52, 56.77s/it][A[A[A[A[A[A

Epoch: 28, Loss: 0.6062








 59%|█████▉    | 29/49 [27:26<18:55, 56.76s/it][A[A[A[A[A[A

Epoch: 29, Loss: 0.6059








 61%|██████    | 30/49 [28:23<17:58, 56.77s/it][A[A[A[A[A[A

Epoch: 30, Loss: 0.6047








 63%|██████▎   | 31/49 [29:20<17:01, 56.77s/it][A[A[A[A[A[A

Epoch: 31, Loss: 0.6024








 65%|██████▌   | 32/49 [30:17<16:05, 56.81s/it][A[A[A[A[A[A

Epoch: 32, Loss: 0.5998








 67%|██████▋   | 33/49 [31:14<15:09, 56.81s/it][A[A[A[A[A[A

Epoch: 33, Loss: 0.5976








 69%|██████▉   | 34/49 [32:10<14:12, 56.80s/it][A[A[A[A[A[A

Epoch: 34, Loss: 0.5958








 71%|███████▏  | 35/49 [33:07<13:15, 56.80s/it][A[A[A[A[A[A

Epoch: 35, Loss: 0.5945








 73%|███████▎  | 36/49 [34:04<12:18, 56.80s/it][A[A[A[A[A[A

Epoch: 36, Loss: 0.5932








 76%|███████▌  | 37/49 [35:01<11:21, 56.80s/it][A[A[A[A[A[A

Epoch: 37, Loss: 0.5923








 78%|███████▊  | 38/49 [35:58<10:24, 56.79s/it][A[A[A[A[A[A

Epoch: 38, Loss: 0.5914








 80%|███████▉  | 39/49 [36:54<09:28, 56.82s/it][A[A[A[A[A[A

Epoch: 39, Loss: 0.5906








 82%|████████▏ | 40/49 [37:51<08:31, 56.80s/it][A[A[A[A[A[A

Epoch: 40, Loss: 0.5898








 84%|████████▎ | 41/49 [38:48<07:34, 56.79s/it][A[A[A[A[A[A

Epoch: 41, Loss: 0.5892








 86%|████████▌ | 42/49 [39:45<06:37, 56.80s/it][A[A[A[A[A[A

Epoch: 42, Loss: 0.5886








 88%|████████▊ | 43/49 [40:42<05:40, 56.80s/it][A[A[A[A[A[A

Epoch: 43, Loss: 0.5879








 90%|████████▉ | 44/49 [41:38<04:44, 56.81s/it][A[A[A[A[A[A

Epoch: 44, Loss: 0.5873








 92%|█████████▏| 45/49 [42:35<03:47, 56.80s/it][A[A[A[A[A[A

Epoch: 45, Loss: 0.5867








 94%|█████████▍| 46/49 [43:32<02:50, 56.80s/it][A[A[A[A[A[A

Epoch: 46, Loss: 0.5861








 96%|█████████▌| 47/49 [44:29<01:53, 56.82s/it][A[A[A[A[A[A

Epoch: 47, Loss: 0.5854








 98%|█████████▊| 48/49 [45:26<00:56, 56.81s/it][A[A[A[A[A[A

Epoch: 48, Loss: 0.5846








100%|██████████| 49/49 [46:22<00:00, 56.82s/it][A[A[A[A[A[A

Epoch: 49, Loss: 0.5841


In [58]:
xb, ind = train_search(data, model3)
k = 4
D, I = ind.search(xb[:5], k) # sanity check

Index trained: True
Index total: 325557


In [59]:
I

array([[  0,   4,  75, 272],
       [  1, 121, 180,  56],
       [  2, 243, 175,  50],
       [  3, 298,  36, 272],
       [  4,   9, 208,  65]])

In [60]:
D, I = ind.search(xb[-5:], k)

In [61]:
I

array([[325552, 325551, 325550,  20317],
       [325553, 182061, 315451, 325554],
       [325554, 315451, 325553, 182061],
       [325555, 316067, 289204, 289442],
       [325556, 289145, 289345, 289213]])

In [69]:
D, I = ind.search(xb[-1].reshape(1, -1), k)

In [73]:
I.shape

(1, 4)

In [74]:
np.save('embeddings.npy', xb)

In [76]:
temp = np.load('embeddings.npy')

In [78]:
temp.shape

(325557, 32)

In [62]:
import linecache
# >>> linecache.getline('/etc/passwd', 4)

In [79]:
import linecache


def check(nodes, k, emb, ind, file):
    """ 
    emb is a 2-d numpy array of embeddings
    """
    if len(nodes) == 1:
        D, I = ind.search(emb[nodes].reshape(1, -1), k)
    else:
        D, I = ind.search(emb[nodes], k)
    for row in I:
        print("{} nearest neighbour of node {}".format(k, row[0]))
        for node in row[1:]:
            print("  " + linecache.getline(file, node + 1))