In [10]:
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE

from graph_emb import DeepWalk
from graph_emb.classify import read_node_label, Classifier

In [11]:
def evaluate_embeddings(embeddings, label_path='../data/wiki/wiki_labels.txt',
                        tr_frac=0.8):
    """Train and evaluate a logistic-regression classifier on node embeddings.

    Args:
        embeddings: Node embeddings, handed unchanged to ``Classifier``.
            Presumably a mapping from node id to vector -- confirm against
            ``graph_emb.classify.Classifier``.
        label_path: Path of the node-label file passed to ``read_node_label``.
            Defaults to the wiki label file this function used to hard-code.
        tr_frac: Training fraction forwarded to
            ``Classifier.split_train_evaluate``; also printed as a percentage.

    Returns:
        None. Whatever ``split_train_evaluate`` returns is discarded, exactly
        as before.
    """
    X, Y = read_node_label(label_path)
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)


def plot_embeddings(embeddings, label_path='../data/wiki/wiki_labels.txt',
                    random_state=None):
    """Scatter-plot a 2-D t-SNE projection of the labelled node embeddings.

    Args:
        embeddings: Object indexable by the node ids returned by
            ``read_node_label``; each lookup yields that node's vector.
        label_path: Path of the node-label file passed to ``read_node_label``.
            Defaults to the wiki label file this function used to hard-code.
        random_state: Seed forwarded to ``TSNE``. The default ``None`` keeps
            the previous unseeded behaviour; pass an int for a reproducible
            layout.

    Returns:
        None. The figure is drawn with ``plt.show()``.

    Raises:
        KeyError: If a labelled node has no entry in ``embeddings`` (when
            ``embeddings`` is a dict).
    """
    X, Y = read_node_label(label_path)

    # Stack the vectors in label-file order so row i corresponds to X[i].
    emb_matrix = np.array([embeddings[node] for node in X])
    tsne = TSNE(n_components=2, random_state=random_state)
    node_pos = tsne.fit_transform(emb_matrix)

    # Group row indices by each node's first label; one scatter series
    # (and therefore one colour / legend entry) per label.
    rows_by_label = {}
    for row in range(len(X)):
        rows_by_label.setdefault(Y[row][0], []).append(row)

    for label, rows in rows_by_label.items():
        plt.scatter(node_pos[rows, 0], node_pos[rows, 1], label=label)
    plt.legend()
    plt.show()

In [12]:
# Read the constructed graph.
# nx.read_gpickle was removed in NetworkX 3.0; a .gpickle file is an ordinary
# pickle, so load it with the standard library instead.
# NOTE(review): unpickling can execute arbitrary code -- only load graph
# files that come from a trusted source.
with open("./our_data/graph.gpickle", "rb") as graph_file:
    G = pickle.load(graph_file)

In [15]:
# DeepWalk settings, in order: walk (sequence) length, num_walks, and the
# number of parallel workers.
# NOTE(review): num_walks is presumably the number of walks started per
# node -- confirm against graph_emb.DeepWalk.
model = DeepWalk(
    G,
    walk_length=10,
    num_walks=80,
    workers=1,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.7s finished


In [16]:
# Learn the embedding vectors, then fetch them from the trained model.
# The result is iterated with .items() later in the notebook.
model.train(
    window_size=5,
    iter=3,
)
embeddings = model.get_embeddings()

Learning embedding vectors...
Learning embedding vectors done!


In [17]:
# Split the embeddings into two parallel sequences that share one ordering:
# train_X_id[i] is the node id whose vector is row i of train_X.
# (The earlier version appended the vector to train_X_id as well, so the
# node ids were never recorded.)
train_X_id = [node_id for node_id, _ in embeddings.items()]
train_X = np.array([vector for _, vector in embeddings.items()])

In [20]:
# Sanity check: number of rows in the embedding matrix, followed by the
# matrix itself.
print("train_X len = ", train_X.shape[0])
print(train_X)

train_X len =  3996
[[-0.1507671   0.09657414  0.1083119  ... -0.857874    0.03231663
   0.42315993]
 [-0.28789544  0.23474503 -0.13886695 ... -0.52492607  0.23320611
   0.3344284 ]
 [-0.4388554   0.344467   -0.362183   ... -0.24821089  0.22775418
   0.35269183]
 ...
 [ 0.1724544   0.23932064  0.24112993 ... -0.3224575   0.3790833
   0.8264161 ]
 [ 0.11183061  0.42790264  0.55801135 ... -0.2731316   0.25836962
   0.75040585]
 [ 0.19572607  0.32381853  0.82817644 ... -0.3551053   0.24832176
   0.70051306]]


In [21]:
# Cluster the embedding vectors with DBSCAN. The values below are
# scikit-learn's defaults, written out so the hyper-parameters are visible.
clustering = DBSCAN(eps=0.5, min_samples=5).fit(train_X)
# Label-based helpers, currently disabled:
# evaluate_embeddings(embeddings)
# plot_embeddings(embeddings)

In [22]:
# Show the class of the object returned by DBSCAN().fit(...).
print(type(clustering))

<class 'sklearn.cluster._dbscan.DBSCAN'>
