In [2]:
# Graph library used to build and query the example graphs
import networkx as nx
import random
# Seed Python's built-in RNG so repeated runs start from the same state
random.seed(0)
import numpy as np
# Seed NumPy's global RNG; next_node() draws from it via np.random.choice
np.random.seed(0)

In [3]:
# Small undirected random graph: 10 nodes, edge probability 0.3, fixed seed
G = nx.erdos_renyi_graph(10, 0.3, seed = 1, directed = False)
# Bare expression so the notebook displays the graph object's repr
G

<networkx.classes.graph.Graph at 0x1e930c94700>

### Defining the `next_node` function with the list of our parameters:

In [4]:
def next_node(previous, current, p, q, graph=None):
    """Sample the next node of a biased (node2vec-style) random walk.

    Every neighbour of ``current`` receives an unnormalised weight (alpha):

    * ``1/p`` if the neighbour is ``previous`` (stepping straight back),
    * ``1``   if the neighbour shares an edge with ``previous``,
    * ``1/q`` otherwise.

    The weights are normalised into probabilities and one neighbour is drawn
    with ``np.random.choice``, i.e. from NumPy's global random state.

    Parameters
    ----------
    previous : node or None
        Node visited just before ``current``. ``None`` means there is no
        previous node (first step); every neighbour then gets weight ``1/q``,
        so the draw is uniform.
    current : node
        Node the walk is currently on.
    p : number
        Return parameter; must be non-zero (and positive for the weights to
        form valid probabilities).
    q : number
        In-out parameter; same constraints as ``p``.
    graph : optional
        Object exposing ``neighbors(node)`` and ``has_edge(u, v)``. When
        omitted, the module-level ``G`` is used, as in the original version.

    Returns
    -------
    The selected neighbour, as produced by ``np.random.choice``.

    Raises
    ------
    ValueError
        If ``current`` has no neighbours, so the walk cannot continue.
    """
    if graph is None:
        # Looked up at call time so the function follows whatever graph the
        # notebook currently has bound to G.
        graph = G

    neighbors = list(graph.neighbors(current))
    if not neighbors:
        raise ValueError(
            f"node {current!r} has no neighbors; the random walk cannot continue"
        )

    alphas = []
    for neighbor in neighbors:
        if neighbor == previous:
            alphas.append(1 / p)
        elif previous is not None and graph.has_edge(neighbor, previous):
            alphas.append(1)
        else:
            alphas.append(1 / q)

    # Normalise once; the total is loop-invariant.
    total = sum(alphas)
    probs = [alpha / total for alpha in alphas]

    # size=1 followed by [0] is kept so the draw consumes the random stream
    # exactly as before.
    chosen = np.random.choice(neighbors, size=1, p=probs)[0]

    return chosen

Before this function can be tested, we need the code that generates the entire random walk. <br>

The next node is chosen by `next_node()`, which requires the additional parameters `p` and `q` as well as the previous and current nodes. <br>

These nodes can easily be obtained by looking at the last two elements added to the `walk` variable. We also return strings instead of integers for compatibility reasons.

In [5]:
# updated version of the random_walk() method
def random_walk(start, length, p, q):
    """Generate a biased random walk of ``length`` steps beginning at ``start``.

    Each step asks ``next_node`` for the successor of the last visited node,
    passing along the node visited before it (``None`` on the first step) and
    the bias parameters ``p`` and ``q``.

    Returns the visited nodes, ``start`` included, converted to strings, so
    the list has ``length + 1`` entries.
    """
    walk = [start]

    for _ in range(length):
        # The second-to-last entry only exists once a step has been taken.
        previous = walk[-2] if len(walk) >= 2 else None
        walk.append(next_node(previous, walk[-1], p, q))

    return list(map(str, walk))

In [6]:
# Unbiased walk: with p = q = 1 every neighbour gets the same weight
random_walk(0, 8, p = 1, q = 1)

['0', '4', '7', '6', '4', '5', '4', '5', '6']

Now, let's bias the walks toward staying close to the previous node with `q = 10` (neighbors that are not connected to the previous node only get a weight of `1/q = 0.1`):

In [7]:
# q = 10: neighbours not adjacent to the previous node get weight 1/q = 0.1
random_walk(0, 8, p = 1, q = 10)

['0', '9', '1', '9', '1', '9', '1', '0', '1']

This time, the random walk explores more nodes in the graph. You can see that it rarely goes back to the previous node, because the return weight is only `1/p = 0.1` with `p = 10`:

In [8]:
# p = 10: stepping back to the previous node gets weight 1/p = 0.1
random_walk(0, 8, p = 10, q = 1)

['0', '1', '9', '4', '7', '8', '7', '4', '6']

In [9]:
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [10]:
# Load Zachary's Karate Club graph; this rebinds G, which next_node() reads
G = nx.karate_club_graph()

# Encode each node's 'club' attribute as a binary label:
# 1 for 'Officer', 0 for any other value
labels = [int(G.nodes[node]['club'] == 'Officer') for node in G.nodes]

Now we generate a list of random walks by calling our `random_walk()` function 80 times for each node in the graph (walks of length 10 with `p = 3` and `q = 2`).

In [11]:
# 80 walks per node, each taking 10 steps with p = 3 and q = 2.
# Iteration order (nodes outer, repetitions inner) matches a nested loop,
# so the random draws happen in the same sequence.
walks = [
    random_walk(node, 10, 3, 2)
    for node in G.nodes
    for _ in range(80)
]

### Creating an instance of Word2Vec (a skip gram model) with a hierarchical `softmax` function:

In [14]:
# Build the model on the generated walks: each walk is a list of node-id
# strings, so every node plays the role of a "word".
# NOTE(review): gensim documents that a fixed `seed` is only fully
# reproducible with a single worker thread; with workers = 2 the embeddings
# (and the accuracy printed at the end) may differ between runs -- confirm.
node2vec = Word2Vec(walks,
                    hs = 1, # hierarchical softmax
                    sg = 1, # skip-gram,
                    vector_size = 100,
                    window = 10,
                    workers = 2,
                    min_count = 1,
                    seed = 0)

The skip-gram model is now trained for 30 epochs on the sequences we generated:

In [15]:
# Run 30 training epochs over the same walks; total_examples reuses the
# corpus size stored on the model.
# NOTE(review): the constructor was already given `walks`, which in gensim
# appears to trigger an initial training pass, so these 30 epochs would be
# in addition to that -- confirm against the gensim documentation.
node2vec.train(walks, total_examples = node2vec.corpus_count, epochs = 30, report_delay = 1)

(185807, 897600)

We now create masks to train and test the classifier:

In [16]:
# Training nodes: the even node ids from 2 to 24 (12 nodes)
train_mask = list(range(2, 25, 2))
train_mask_str = list(map(str, train_mask))

# Test nodes: every remaining id among 0-33 (22 nodes)
test_mask = [node_id for node_id in range(34) if node_id not in train_mask]
test_mask_str = list(map(str, test_mask))

# Convert to an array so the labels can be indexed with the mask lists
labels = np.array(labels)

The random forest classifier is trained on the training data:

In [17]:
# Features are the learned embeddings of the training nodes (looked up by
# their string keys); targets are the matching entries of `labels`.
X_train = node2vec.wv[train_mask_str]
y_train = labels[train_mask]

clf = RandomForestClassifier(random_state = 0)
clf.fit(X_train, y_train)

We now evaluate it in terms of accuracy for the test data:

In [18]:
# Predict the club label of every held-out node from its embedding
y_pred = clf.predict(node2vec.wv[test_mask_str])
# accuracy_score is documented as (y_true, y_pred); the arguments were passed
# in the opposite order. Accuracy is symmetric, so the value is unchanged.
acc = accuracy_score(labels[test_mask], y_pred)
print(f"Node2Vec accuracy = {acc * 100:.2f}%")

Node2Vec accuracy = 90.91%
