In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import random

%pip install node2vec
from node2vec import Node2Vec
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
chosen_event = 'wildfire'

In [3]:
path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
g = nx.read_graphml(path)

In [4]:
# let us first get access to the users of the chosen event
users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                users.append({node : data})

In [5]:
# select first a sample of 100 users
random.seed(55)
users_sample = random.sample(users, 100)

In [6]:
# extract the nodes ids in order to get a subgraph containing our sample of nodes
node_sample_ids = []
for i in range(len(users_sample)):
    for key, _ in users_sample[i].items():
        node_sample_ids.append(key)

# define the subgraph with the sample of 100 nodes
sample_graph = g.subgraph(node_sample_ids)

In [7]:
# define a Node2Vec model, and extract the walks from this model
node2vec = Node2Vec(sample_graph)
walks = node2vec.walks

# now, apply a Word2Vec model to the walks that we obtained, and set the vector size to 25 to obtain embeddings of size 25
modelw2v = Word2Vec(walks, vector_size=25)

Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 43947.02it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 2212.77it/s]


In [9]:
# obtain the embeddings our sample of 100 nodes
node_embeddings = {node: modelw2v.wv[node] for node in sample_graph.nodes()}

In [10]:
list_of_embeddings = []
for key, value in node_embeddings.items():
    list_of_embeddings.append({key : value})

In [36]:
cos_sim = [[0 for _ in range(100)] for _ in range(100)]

for i, emb_i_dict in enumerate(list_of_embeddings):
    for j, emb_j_dict in enumerate(list_of_embeddings):
        emb_i = next(iter(emb_i_dict.values())) 
        emb_j = next(iter(emb_j_dict.values()))

        cosine_sim = cosine_similarity([emb_i], [emb_j])[0][0]
        cos_sim[i][j] = cosine_sim

In [38]:
arr_cos_sim = np.array(cos_sim)
np.fill_diagonal(arr_cos_sim, -np.inf)

# 10 most similar nodes
v = []

for _ in range(10):
    max_index = np.argmax(arr_cos_sim)
    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim).shape)
    arr_cos_sim[max_row_index][max_col_index] = -np.inf
    arr_cos_sim[max_col_index][max_row_index] = -np.inf
    v.append((max_row_index, max_col_index))

In [39]:
v

[(48, 81),
 (6, 48),
 (5, 80),
 (42, 68),
 (33, 61),
 (7, 57),
 (74, 80),
 (3, 46),
 (0, 67),
 (18, 44)]