In [5]:
# Get the adjacency coordinates and turn them into a matrix.
# Get the indices for adjacency (using id_map_inverse) and create x, tx, y, ty, allx, ally.
# Make sure both test and train data come with their id_map (loss of data in dynamic training).
#### 
"""
Loads input data from gcn/data directory

ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
    (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
    object;
ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

All objects above must be saved using python pickle module.

:param dataset_str: Dataset name
:return: All data input files loaded (as well the training/test data).
"""
import numpy as np
import json
import pickle as pkl
import networkx as nx
from collections import defaultdict
from scipy.sparse.csr import csr_matrix
# --- Configuration -----------------------------------------------------------
# Dataset name; also used as the input directory and in the output file names.
name = 'Amazon'
path = './' + name
# Threshold used when filtering nodes per graph further down in this cell.
max_nodes = 15

# --- Raw inputs --------------------------------------------------------------
# The JSON files are read inside `with` blocks so the handles are closed as soon
# as parsing is done (previously they were left open for the kernel's lifetime).
with open(path + '/Amazon_user_id_map.txt', 'r') as id_file:
    id_map = json.load(id_file)

# Reverse lookup built in memory instead of reading a separate inverse-map file.
# If two keys of id_map share a value, only the last one survives the inversion.
id_map_inverse = {value: key for key, value in id_map.items()}

with open(path + '/group_user_tight_adj.json', 'r') as graph_neighbor_file:
    graph_neighbor_dic = json.load(graph_neighbor_file)

# Comma-separated numeric tables; everything except the features is cast to int.
features = np.loadtxt(path + '/Amazon_node_attributes.txt', delimiter=',')
adj_mat = np.loadtxt(path + '/Amazon_A.txt', delimiter=',').astype(int)
labels = np.loadtxt(path + '/Amazon_node_labels.txt', delimiter=',').astype(int)
graph_indicator = np.loadtxt(path + '/Amazon_graph_indicator.txt', delimiter=',').astype(int)
graph_labels = np.loadtxt(path + '/Amazon_graph_labels.txt', delimiter=',').astype(int)

# Rows of adj_mat as plain tuples.
data_tuple = list(map(tuple, adj_mat))
# Largest graph id present in graph_indicator.
graph_num = graph_indicator.max()

# One bucket of kept node ids per graph id (bucket index == graph_indicator value).
group_list = [[] for _ in range(graph_num + 1)]
graph_idx = 0             # running 1-based id handed out to every kept node
id_map_inverse_new = {}   # new node id -> entry of id_map_inverse for the original row
features_list = []        # feature row of every kept node, in keep order
labels_list = []          # raw label of every kept node
labels2d_list = []        # two-slot one-hot version of the label

for row, feature_vec in enumerate(features):
    bucket = group_list[graph_indicator[row]]
    # A node is accepted while its bucket holds at most max_nodes entries,
    # so a bucket can end up with max_nodes + 1 nodes.
    if len(bucket) > max_nodes:
        continue
    graph_idx += 1
    bucket.append(graph_idx)
    # id_map_inverse is keyed by the 1-based row number.
    id_map_inverse_new[graph_idx] = id_map_inverse[int(row + 1)]
    features_list.append(feature_vec)
    node_label = labels[row]
    labels_list.append(node_label)
    one_hot = [0, 0]
    one_hot[node_label] = 1
    labels2d_list.append(one_hot)
# Re-key the neighbour dictionary with the new node ids (shifted to 0-based).
# Nodes that were filtered out are dropped, both as keys and as neighbours, and
# a node whose neighbours were all dropped gets no entry at all.
id_map_new = {user: new_id for new_id, user in id_map_inverse_new.items()}
graph_dic = {}
for user, neighbours in graph_neighbor_dic.items():
    if user not in id_map_new:
        continue
    kept = [id_map_new[str(nb)] - 1 for nb in neighbours if str(nb) in id_map_new]
    if kept:
        graph_dic[int(id_map_new[user] - 1)] = kept
# Give every node that appears in graph_dic (as a key or as a neighbour) a
# dense index, handed out in first-seen order: key first, then its neighbours.
user_cut_off_map = {}
for node, neighbours in graph_dic.items():
    for member in [node] + list(neighbours):
        # The next free index always equals the number of nodes mapped so far.
        user_cut_off_map.setdefault(member, len(user_cut_off_map))
# Total number of mapped nodes (one past the last index handed out).
cut_off_idx = len(user_cut_off_map)
            
# Feature rows and one-hot labels re-ordered so that position i holds the node
# whose dense index in user_cut_off_map is i.
nodes_by_slot = sorted(user_cut_off_map, key=user_cut_off_map.get)
features_cut_off_list = [features_list[node] for node in nodes_by_slot]
labels_cut_off_list = [labels2d_list[node] for node in nodes_by_slot]
# Same adjacency as graph_dic, with keys and neighbours translated to the
# dense indices from user_cut_off_map.
graphs_cut_off_dic = {
    user_cut_off_map[node]: [user_cut_off_map[nb] for nb in neighbours]
    for node, neighbours in graph_dic.items()
}

# Split the per-graph buckets: first 80% for training, the rest for testing.
# graphs_validate is the first 20% of the buckets, i.e. it overlaps graphs_train.
graphs_len = len(group_list)
split_at = int(0.8 * graphs_len)
print(0.8 * graphs_len)
graphs_test = group_list[split_at:]
graphs_train = group_list[:split_at]
graphs_validate = group_list[:int(0.2 * graphs_len)]

# Bucket entries are 1-based node ids while user_cut_off_map is keyed 0-based,
# hence the "- 1". Nodes without a dense index are skipped.
train_slots = [
    user_cut_off_map[node_id - 1]
    for node_list in graphs_train
    for node_id in node_list
    if node_id - 1 in user_cut_off_map
]
features_train = [features_cut_off_list[slot] for slot in train_slots]
labels_train = [labels_cut_off_list[slot] for slot in train_slots]

# test_index keeps the dense index of every test node, in visiting order.
test_index = [
    user_cut_off_map[node_id - 1]
    for node_list in graphs_test
    for node_id in node_list
    if node_id - 1 in user_cut_off_map
]
features_test = [features_cut_off_list[slot] for slot in test_index]
labels_test = [labels_cut_off_list[slot] for slot in test_index]

print(len(features_train), ' ', len(features_test))
# Feature lists become sparse CSR matrices, label lists become dense arrays.
yelp_x, yelp_tx, yelp_allx = (
    csr_matrix(np.array(rows))
    for rows in (features_train, features_test, features_cut_off_list)
)
yelp_y, yelp_ty, yelp_ally = (
    np.array(rows)
    for rows in (labels_train, labels_test, labels_cut_off_list)
)

# From here on graph_dic is the dense-index adjacency, wrapped in a defaultdict
# so that a missing node yields an empty neighbour list.
graph_dic = defaultdict(list, graphs_cut_off_dic)

# Write every object to 'ind.<name>.<suffix>' in the working directory.
# Each file is opened in a `with` block so it is flushed and closed right after
# the dump (previously the handles passed to pkl.dump were never closed).
#
# NOTE(review): 'allx' and 'ally' are written from yelp_x / yelp_y, so
# yelp_allx / yelp_ally are computed above but never saved. Kept as-is because
# changing the payload changes what the reader cell gets from
# vstack((allx, tx)) -- confirm which one is intended.
for suffix, payload in (
    ('x', yelp_x),
    ('tx', yelp_tx),
    ('allx', yelp_x),
    ('y', yelp_y),
    ('ty', yelp_ty),
    ('ally', yelp_y),
    ('graph', graph_dic),
):
    with open('ind.{}.{}'.format(name, suffix), 'wb') as out_file:
        pkl.dump(payload, out_file)

# The test indices are written as plain text, one integer per line, rather
# than pickled.
with open('ind.{}.test.index'.format(name), 'w') as index_file:
    for item in test_index:
        index_file.write(str(item) + '\n')



1756.0
2631   659


In [2]:

import pickle as pkl
import sys
import scipy.sparse as sp
# Dataset whose 'ind.<dataset_str>.*' files are read by this cell.
# NOTE(review): the cell that writes these files uses name = 'Amazon', so this
# reads files it did not produce -- confirm 'Yelp' is the intended dataset.
dataset_str = 'Yelp'

def parse_index_file(filename):
    """Parse an index file containing one integer per line.

    :param filename: path of the text file to read.
    :return: list of ints, in file order.
    :raises ValueError: if a non-blank line is not a valid integer.

    Blank lines (for example a trailing empty line) are skipped. The file is
    closed before returning.
    """
    index = []
    with open(filename) as index_file:
        for line in index_file:
            entry = line.strip()
            if entry:
                index.append(int(entry))
    return index
def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at the positions in ``idx``.

    :param idx: index or sequence of indices to switch on.
    :param l: length of the mask.
    :return: 1-D numpy array of dtype bool.
    """
    mask = np.zeros(l)
    mask[idx] = 1
    # The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
    return np.array(mask, dtype=bool)


# Load the pickled pieces in a fixed order so they can be unpacked by position.
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
# Under Python 3 the pickles are decoded with latin1; Python 2 takes no encoding.
load_kwargs = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}
objects = []
for suffix in names:
    with open("ind.{}.{}".format(dataset_str, suffix), 'rb') as f:
        objects.append(pkl.load(f, **load_kwargs))

x, y, tx, ty, allx, ally, graph = tuple(objects)
# Test indices as stored on disk, plus the same indices in ascending order.
test_idx_reorder = parse_index_file("ind.{}.test.index".format(dataset_str))
test_idx_range = np.sort(test_idx_reorder)

# Stack test rows below the allx rows, then copy the rows found at the sorted
# test positions into the positions listed in the on-disk order.
features = sp.vstack((allx, tx)).tolil()
features[test_idx_reorder, :] = features[test_idx_range, :]
adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
labels = np.vstack((ally, ty))
labels[test_idx_reorder, :] = labels[test_idx_range, :]

# Training rows are the first len(y) rows; validation is the fixed window of
# 500 rows that follows them; test rows are the sorted test indices.
n_train = len(y)
idx_test = test_idx_range.tolist()
idx_train = range(n_train)
idx_val = range(n_train, n_train + 500)

n_nodes = labels.shape[0]
train_mask, val_mask, test_mask = (
    sample_mask(split_idx, n_nodes)
    for split_idx in (idx_train, idx_val, idx_test)
)

# Per-split label matrices: zero everywhere except on the rows of that split.
y_train, y_val, y_test = (np.zeros(labels.shape) for _ in range(3))
for target, split_mask in ((y_train, train_mask), (y_val, val_mask), (y_test, test_mask)):
    target[split_mask, :] = labels[split_mask, :]

In [3]:
import scipy.sparse
# Small demo: build a 2x3 CSC matrix from a dense array and print its
# non-zero entries as (row, col) value pairs.
dense_demo = np.array([[0, 0, 3], [4, 0, 0]])
sparse_matrix = scipy.sparse.csc_matrix(dense_demo)
print(sparse_matrix)

  (1, 0)	4
  (0, 2)	3


NameError: name 'tf' is not defined