In [1]:
__author__ = 'porky-chu'

import theano
import numpy as np
from theano import tensor as TT


class NodeVectorModel(object):
    def __init__(self, n_from, n_to, de, seed=1692, init_params=None):
        """
        n_from :: number of from embeddings in the vocabulary
        n_to :: number of to embeddings in the vocabulary
        de :: dimension of the word embeddings
        """
        np.random.seed(seed)
        # parameters of the model
        if init_params is not None:
            with open('data/case_embeddings.pkl', 'rb') as f:
                temp = cPickle.load(f)
            self.Win = theano.shared(temp.Win.get_value().astype(theano.config.floatX))
            self.Wout = theano.shared(temp.Wout.get_value().astype(theano.config.floatX))
        else:
            self.Win = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (n_from, de)).astype(theano.config.floatX))
            self.Wout = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (n_to, de)).astype(theano.config.floatX))

        # adagrad
        self.cumulative_gradients_in = theano.shared(0.1 * np.ones((n_from, de)).astype(theano.config.floatX))
        self.cumulative_gradients_out = theano.shared(0.1 * np.ones((n_to, de)).astype(theano.config.floatX))

        idxs = TT.imatrix()
        x_in = self.Win[idxs[:, 0], :]
        x_out = self.Wout[idxs[:, 1], :]

        norms_in= TT.sqrt(TT.sum(x_in ** 2, axis=1))
        norms_out = TT.sqrt(TT.sum(x_out ** 2, axis=1))
        norms = norms_in * norms_out

        y = TT.vector('y')  # label
        y_predictions = TT.sum(x_in * x_out, axis=1) / norms

        # cost and gradients and learning rate
        loss = TT.mean(TT.sqr(y_predictions - y))
        gradients = TT.grad(loss, [x_in, x_out])

        updates = [
            (self.cumulative_gradients_in, TT.inc_subtensor(self.cumulative_gradients_in[idxs[:, 0]], gradients[0] ** 2)),
            (self.cumulative_gradients_out, TT.inc_subtensor(self.cumulative_gradients_out[idxs[:, 1]], gradients[1] ** 2)),
            (self.Win, TT.inc_subtensor(self.Win[idxs[:, 0]], - (0.5 / TT.sqrt(self.cumulative_gradients_in[idxs[:, 0]])) * gradients[0])),
            (self.Wout, TT.inc_subtensor(self.Wout[idxs[:, 1]], - (0.5 / TT.sqrt(self.cumulative_gradients_out[idxs[:, 1]])) * gradients[1])),
        ]

        # theano functions
        self.calculate_loss = theano.function(inputs=[idxs, y], outputs=loss)
        self.classify = theano.function(inputs=[idxs], outputs=y_predictions)
        self.train = theano.function(
            inputs=[idxs, y],
            outputs=loss,
            updates=updates,
            name='training_fn'
        )

    def __getstate__(self):
        return self.Win, self.Wout

    def __setstate__(self, state):
        Win, Wout = state
        self.Win = Win
        self.Wout = Wout

    def save_to_file(self, output_path):
        with open(output_path, 'wb') as output_file:
            #cPickle.dump(self, output_file, cPickle.HIGHEST_PROTOCOL)
            print("Save")

In [10]:
__author__ = 'allentran'

import json
import os
import multiprocessing

import numpy as np

def _update_min_dict(candidate_node, depth, min_set):
    if candidate_node in min_set:
        if min_set[candidate_node] <= depth:
            return
        else:
            min_set[candidate_node] = depth
    else:
        min_set[candidate_node] = depth

def _get_connected_nodes(node_idx, adjancency_list, max_degree, current_depth=1):
    connected_dict = {}
    single_degree_nodes = [other_idx for other_idx in adjancency_list[node_idx] if adjancency_list[node_idx][other_idx] == 1]
    for other_idx in single_degree_nodes:
        _update_min_dict(other_idx, current_depth, connected_dict)

    if current_depth <= max_degree:
        for other_node_idx in single_degree_nodes:
            if other_node_idx in adjancency_list:
                new_connected_nodes = _get_connected_nodes(other_node_idx, adjancency_list, max_degree, current_depth + 1)
                if new_connected_nodes is not None:
                    for other_idx, depth in new_connected_nodes.iteritems():
                        _update_min_dict(other_idx, depth, connected_dict)
        return connected_dict

class Graph(object):

    def __init__(self, graph_path):

        self.from_nodes_mapping = {}
        self.to_nodes_mapping = {}

        self.edge_dict = {}

        self._load_graph(graph_path=graph_path)
        self._create_mappings()

    def save_mappings(self, output_dir):
        print("wow")
        return
        with open(os.path.join(output_dir, 'from.map'), 'w') as from_map_file:
            json.dump(self.from_nodes_mapping, from_map_file)
        with open(os.path.join(output_dir, 'to.map'), 'w') as to_map_file:
            json.dump(self.to_nodes_mapping, to_map_file)

    def get_mappings(self):
        return self.from_nodes_mapping, self.to_nodes_mapping

    def _create_mappings(self):
        for key in self.edge_dict:
            self.from_nodes_mapping[key] = len(self.from_nodes_mapping)
        for to_nodes in self.edge_dict.values():
            for to_node in to_nodes:
                if to_node not in self.to_nodes_mapping:
                    self.to_nodes_mapping[to_node] = len(self.to_nodes_mapping)

    def _add_edge(self, from_idx, to_idx, degree=1):
        if from_idx not in self.edge_dict:
            self.edge_dict[from_idx] = dict()
        if to_idx in self.edge_dict[from_idx]:
            if degree >= self.edge_dict[from_idx][to_idx]:
                return
        self.edge_dict[from_idx][to_idx] = degree

    def _load_graph(self, graph_path):

        with open(graph_path, 'r') as graph_file:
            for line in graph_file:
                parsed_line = line.strip().split(' ')
                if len(parsed_line) in [2, 3]:
                    from_idx = int(parsed_line[0])
                    to_idx = int(parsed_line[1])
                    if len(parsed_line) == 3:
                        degree = int(parsed_line[2])
                        self._add_edge(from_idx, to_idx, degree)
                    else:
                        self._add_edge(from_idx, to_idx)

    def extend_graph(self, max_degree, penalty=2):

        def _zip_args_for_parallel_fn():
            for key in self.from_nodes_mapping.keys():
                yield (key, self.edge_dict, max_degree)

        from_to_idxs = []
        degrees = []

        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        connected_nodes_list = pool.map(_get_connected_nodes, _zip_args_for_parallel_fn())
        pool.close()
        pool.join()

        for node_idx, connected_nodes in zip(self.from_nodes_mapping.keys(), connected_nodes_list):
            for other_node, degree in connected_nodes.iteritems():
                from_to_idxs.append([self.from_nodes_mapping[node_idx], self.to_nodes_mapping[other_node]])
                degrees.append(float(1)/(degree ** penalty))

        return np.array(from_to_idxs).astype(np.int32), np.array(degrees).astype(np.float32)


In [3]:
__author__ = 'porky-chu'

import random
import os
import logging

import numpy as np

#from node_vectors import NodeVectorModel
#import parser


class Graph2Vec(object):
    def __init__(self, vector_dimensions, output_dir='data'):

        self.output_dir = output_dir

        self.model = None
        self.from_nodes = None
        self.to_nodes = None
        self.dimensions = vector_dimensions
        self.from_to_idxs = None
        self.inverse_degrees = None

    def parse_graph(self, graph_path, data_dir='data', load_edges=False, extend_paths=2):
        graph = Graph(graph_path)
        self.from_nodes, self.to_nodes = graph.get_mappings()
        graph.save_mappings(self.output_dir)

        if load_edges:
            self.inverse_degrees = np.memmap(
                os.path.join(data_dir, 'inverse_degrees.mat'),
                mode='r',
                dtype='float32'
            )
            self.from_to_idxs = np.memmap(
                os.path.join(data_dir, 'from_to.mat'),
                mode='r',
                dtype='int32'
            )
            self.from_to_idxs = np.reshape(self.from_to_idxs, newshape=(self.inverse_degrees.shape[0], 2))
        else:
            from_to_idxs, inverse_degrees = graph.extend_graph(max_degree=extend_paths)
            self.from_to_idxs = np.memmap(
                os.path.join(data_dir, 'from_to.mat'),
                mode='r+',
                shape=from_to_idxs.shape,
                dtype='int32'
            )
            self.from_to_idxs[:] = from_to_idxs[:]
            self.inverse_degrees = np.memmap(
                os.path.join(data_dir, 'inverse_degrees.mat'),
                mode='r+',
                shape=inverse_degrees.shape,
                dtype='float32'
            )
            self.inverse_degrees[:] = inverse_degrees[:]


    def fit(self, max_epochs=100, batch_size=1000, seed=1692, params=None):

        self.model = NodeVectorModel(
            n_from=len(self.from_nodes),
            n_to=len(self.to_nodes),
            de=self.dimensions,
            init_params=params,
        )

        random.seed(seed)
        shuffled_idxes = np.arange(self.from_to_idxs.shape[0])
        for epoch_idx in xrange(max_epochs):

            random.shuffle(shuffled_idxes)

            cost = []
            for obs_idx in xrange(0, len(self.inverse_degrees), batch_size):
                cost.append(self.model.train(self.from_to_idxs[shuffled_idxes[obs_idx:obs_idx + batch_size]],
                                          self.inverse_degrees[shuffled_idxes[obs_idx:obs_idx + batch_size]]))

            cost = np.mean(cost)
            logging.info('After %s epochs, cost=%s' % (epoch_idx, cost ** 0.5))




In [4]:
!pip install graph2vec
!pip install pickle

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m
[31mERROR: No matching distribution found for pickle[0m


In [5]:
import graph2vec 

help(graph2vec)

Help on package graph2vec:

NAME
    graph2vec

PACKAGE CONTENTS
    node_vectors
    parser
    trainer

AUTHOR
    porky-chu

FILE
    /usr/local/lib/python3.7/dist-packages/graph2vec/__init__.py




In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:

graph2vec =Graph2Vec(vector_dimensions=128)
graph2vec.parse_graph('/content/drive/MyDrive/Research/Sinhala NLP/Graph2Vec/data/edge.data', extend_paths=2)
graph2vec.fit(batch_size=1000, max_epochs=1000)
node2vec.model.save_to_file("/content/drive/MyDrive/Research/Sinhala NLP/Graph2Vec/data/case_embeddings.pkl")

wow


TypeError: ignored