In [18]:
import os
from collections import defaultdict
import numpy as np
import pickle
import networkx as nx
from scipy import sparse

In [2]:
# Name of the dataset to convert; the two data file paths in this cell are derived from it.
dataset_name = 'chameleon'

# Both input files live in the same per-dataset directory.
dataset_dir = os.path.join('new_data', dataset_name)
graph_adjacency_list_file_path = os.path.join(dataset_dir, 'out1_graph_edges.txt')
graph_node_features_and_labels_file_path = os.path.join(dataset_dir, 'out1_node_feature_label.txt')

In [3]:
# ind.dataset_str.graph => a collections.defaultdict in the format
#     {index: [index_of_neighbor_nodes]}.

graph_dict = defaultdict(list)
with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
    # The first line is read and discarded (presumably a header row).
    graph_adjacency_list_file.readline()
    for raw_edge in graph_adjacency_list_file:
        endpoints = raw_edge.rstrip().split('\t')
        assert (len(endpoints) == 2)
        source, target = int(endpoints[0]), int(endpoints[1])
        # Record the edge in both directions so each endpoint lists the other.
        graph_dict[source].append(target)
        graph_dict[target].append(source)

# Re-insert the nodes in ascending index order. Each neighbour list is sorted in place and
# shared with graph_dict, so graph_dict's lists end up sorted as well.
graph_dict_ordered = defaultdict(list)
for node in sorted(graph_dict):
    neighbours = graph_dict[node]
    neighbours.sort()
    graph_dict_ordered[node] = neighbours

nx_graph = nx.from_dict_of_lists(graph_dict_ordered)
adj = nx.adjacency_matrix(nx_graph)

In [19]:
# Convert the adjacency returned by networkx into a scipy CSR matrix and display its type.
adj = sparse.csr_matrix(adj)
type(adj)

scipy.sparse.csr.csr_matrix

In [20]:
# Display the adjacency matrix dimensions as (rows, columns).
adj.shape

(2277, 2277)

In [4]:
# ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
# ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
# ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
#     (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
# ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
# ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
# ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;


# Per-node feature vectors and integer labels, both keyed by the integer node id.
graph_node_features_dict = {}
graph_labels_dict = {}

# Length of the dense 0/1 feature vector built for each node of the 'film' dataset.
FILM_FEATURE_COUNT = 932
is_film_dataset = dataset_name == 'film'

# Every dataset is read with the same loop; only the parsing of the feature column differs.
with open(graph_node_features_and_labels_file_path) as graph_node_features_and_labels_file:
    # The first line is read and discarded (presumably a header row).
    graph_node_features_and_labels_file.readline()
    for line in graph_node_features_and_labels_file:
        # Each row is expected to hold three tab-separated fields: node id, features, label.
        line = line.rstrip().split('\t')
        assert (len(line) == 3)
        node_id = int(line[0])
        # A node id must not appear on more than one row.
        assert (node_id not in graph_node_features_dict and node_id not in graph_labels_dict)
        if is_film_dataset:
            # 'film' rows list the indices of the active features; expand them to a 0/1 vector.
            node_features = np.zeros(FILM_FEATURE_COUNT, dtype=np.uint8)
            node_features[np.array(line[1].split(','), dtype=np.uint16)] = 1
        else:
            # Other datasets list one comma-separated value per feature, parsed as uint8.
            node_features = np.array(line[1].split(','), dtype=np.uint8)
        graph_node_features_dict[node_id] = node_features
        graph_labels_dict[node_id] = int(line[2])

In [23]:
# Gather the per-node feature vectors in ascending key order and stack them row by row.
features_list = [graph_node_features_dict[node_id] for node_id in sorted(graph_node_features_dict)]
features = np.vstack(features_list)

In [24]:
# Display the container type of the stacked feature matrix.
type(features)

numpy.ndarray

In [25]:
# Display the feature matrix dimensions (one row per stacked feature vector).
features.shape

(2277, 2325)

In [5]:
# Build the split file name from dataset_name so the masks always match the selected dataset
# (for 'chameleon' this is the same path that was previously hard-coded).
splits_file_path = f'splits/{dataset_name}_split_0.6_0.2_0.npz'

# Read the three mask arrays stored in the .npz archive; the context manager closes the file.
with np.load(splits_file_path) as splits_file:
    train_mask = splits_file['train_mask']
    val_mask = splits_file['val_mask']
    test_mask = splits_file['test_mask']

In [22]:
# Display the training mask as booleans. The builtin `bool` is used as the dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
np.array(train_mask, dtype=bool)

array([False,  True, False, ..., False, False, False])

In [7]:
# Collect the labels in ascending node-id order, the same order used to build the rows of
# `features`, so that labels[k] and features[k] refer to the same node even when the
# input file is not sorted by node id.
labels = [graph_labels_dict[node_id] for node_id in sorted(graph_labels_dict)]

# Labels are later used as 0-based row indices, so the class count is the largest label plus one.
label_classes = max(labels) + 1
print('label classes', label_classes)
zeros_array = np.zeros(label_classes)

label classes 5


In [8]:
# For each split, find the node ids whose mask entry equals 1, then look up the feature
# vector and the label of every selected node (ids are visited in ascending order).
train_node_ids = [idx for idx in range(train_mask.shape[0]) if train_mask[idx] == 1]
train_features_list = [graph_node_features_dict[idx] for idx in train_node_ids]
train_labels_list = [graph_labels_dict[idx] for idx in train_node_ids]

valid_node_ids = [idx for idx in range(val_mask.shape[0]) if val_mask[idx] == 1]
valid_features_list = [graph_node_features_dict[idx] for idx in valid_node_ids]
valid_labels_list = [graph_labels_dict[idx] for idx in valid_node_ids]

test_node_ids = [idx for idx in range(test_mask.shape[0]) if test_mask[idx] == 1]
test_features_list = [graph_node_features_dict[idx] for idx in test_node_ids]
test_labels_list = [graph_labels_dict[idx] for idx in test_node_ids]


In [9]:
# One-hot encode each split by selecting rows of the identity matrix with the class ids.
# Indexing with a list returns a new array each time, so the three results are independent.
one_hot_rows = np.eye(label_classes)
train_labels = one_hot_rows[train_labels_list]
valid_labels = one_hot_rows[valid_labels_list]
test_labels = one_hot_rows[test_labels_list]

print(
    'train labels shape', train_labels.shape,
    'valid labels shape', valid_labels.shape,
    'test labels shape', test_labels.shape,
)

train labels shape (1092, 5) valid labels shape (729, 5) test labels shape (456, 5)


In [10]:
# Display the container type of the one-hot training labels.
type(train_labels)

numpy.ndarray

In [11]:
# Stack the per-node feature vectors of each split into one array per split.
train_features, valid_features, test_features = (
    np.vstack(split_rows)
    for split_rows in (train_features_list, valid_features_list, test_features_list)
)

print(
    'train features shape', train_features.shape,
    'valid features shape', valid_features.shape,
    'test features shape', test_features.shape,
)

train features shape (1092, 2325) valid features shape (729, 2325) test features shape (456, 2325)
