In [1]:
import numpy as np
import networkx as nx
import os
import pandas as pd

In [2]:
COMPLEX_DATA_PATH = "../data/complex/"

In [3]:
def _tud_file(data_folder, ds_name, suffix):
    """Return the path of the ``<ds_name><suffix>`` file inside ``<data_folder>/<ds_name>/``."""
    return os.path.join(data_folder, ds_name, ds_name + suffix)


def _read_rows(path, cast):
    """Parse a comma-separated text file into one list of ``cast`` values per line."""
    with open(path, "r") as f:
        return [[cast(token) for token in line.strip().split(',')] for line in f]


def _annotate_nodes(graph_db, key, rows):
    """Store ``rows[k]`` under ``key`` on the k-th node, counting nodes across graphs in order."""
    k = 0
    for g in graph_db:
        for v in range(g.number_of_nodes()):
            g.nodes[v][key] = rows[k]
            k += 1


def tud_to_networkx(data_folder, ds_name):
    """Load a dataset stored as ``<ds_name>_*.txt`` files into a list of graphs.

    Files are read from ``<data_folder>/<ds_name>/``. ``_graph_indicator.txt``
    (graph id of every node, 1-based) and ``_A.txt`` (one ``src, dst`` pair of
    1-based global node ids per line) are required. The remaining files are
    used only if they exist:

    * ``_node_labels.txt`` / ``_node_attributes.txt`` -> ``g.nodes[v]['labels']``
      (list of int) / ``g.nodes[v]['attributes']`` (list of float)
    * ``_edge_labels.txt`` / ``_edge_attributes.txt`` -> ``g.edges[u, v]['labels']``
      / ``g.edges[u, v]['attributes']``
    * ``_graph_labels.txt`` -> ``g.graph['classes']`` (list of int)
    * ``_graph_attributes.txt`` -> ``g.graph['targets']`` (list of float)

    Parameters
    ----------
    data_folder : str
        Directory that contains one sub-directory per dataset.
    ds_name : str
        Dataset name; used both as sub-directory name and as file prefix.

    Returns
    -------
    list of networkx.Graph
        One undirected graph per graph id. Nodes of every graph are renumbered
        locally, starting at 0. When an edge appears several times (e.g. once
        per direction) only its first occurrence is kept, together with the
        label/attribute row of that first occurrence.

    Raises
    ------
    ValueError
        If the graph indicator file is empty or a value cannot be parsed.
    IndexError
        If an optional file has fewer rows than the items it annotates.
    """
    with open(_tud_file(data_folder, ds_name, "_graph_indicator.txt"), "r") as f:
        graph_indicator = [int(line) - 1 for line in f]

    # Nodes: count the nodes of every graph in a single pass.
    node_counts = {}
    for g_id in graph_indicator:
        node_counts[g_id] = node_counts.get(g_id, 0) + 1

    graph_db = []
    offset = []  # global id of the first node of each graph
    first_node = 0
    for g_id in range(max(graph_indicator) + 1):
        offset.append(first_node)
        g = nx.Graph()
        g.add_nodes_from(range(node_counts.get(g_id, 0)))
        graph_db.append(g)
        first_node += node_counts.get(g_id, 0)

    # Edges.
    with open(_tud_file(data_folder, ds_name, "_A.txt"), "r") as f:
        edges = []
        for line in f:
            parts = line.split(',')
            edges.append((int(parts[0]) - 1, int(parts[1]) - 1))

    # Every kept edge remembers its row in the edge file and the graph it
    # belongs to, so edge labels/attributes can be attached to the right edge
    # even if the edge file is not sorted by graph.
    kept_edges = []  # (file_row, graph_id, u, v)
    for row, (src, dst) in enumerate(edges):
        g_id = graph_indicator[src]
        g = graph_db[g_id]
        u, v = src - offset[g_id], dst - offset[g_id]

        # Avoid multigraph: skip edges already present in either direction.
        if not g.has_edge(u, v):
            g.add_edge(u, v)
            kept_edges.append((row, g_id, u, v))

    # Node labels and node attributes.
    for suffix, key, cast in (("_node_labels.txt", 'labels', int),
                              ("_node_attributes.txt", 'attributes', float)):
        path = _tud_file(data_folder, ds_name, suffix)
        if os.path.exists(path):
            _annotate_nodes(graph_db, key, _read_rows(path, cast))

    # Edge labels and edge attributes (rows are aligned with the edge file).
    for suffix, key, cast in (("_edge_labels.txt", 'labels', int),
                              ("_edge_attributes.txt", 'attributes', float)):
        path = _tud_file(data_folder, ds_name, suffix)
        if os.path.exists(path):
            rows = _read_rows(path, cast)
            for row, g_id, u, v in kept_edges:
                graph_db[g_id].edges[u, v][key] = rows[row]

    # Classes and targets (one row per graph).
    for suffix, key, cast in (("_graph_labels.txt", 'classes', int),
                              ("_graph_attributes.txt", 'targets', float)):
        path = _tud_file(data_folder, ds_name, suffix)
        if os.path.exists(path):
            rows = _read_rows(path, cast)
            for g_id, g in enumerate(graph_db):
                g.graph[key] = rows[g_id]

    return graph_db

In [4]:
def graph_bagofwordize(graph_db):
    """Turn graphs into bag-of-words count vectors over their first node label.

    Parameters
    ----------
    graph_db : sequence of graphs
        Every node must carry a ``"labels"`` attribute whose first entry is a
        non-negative integer label.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(graph_db), max_label + 1)``; entry
        ``[i, l]`` is the number of nodes of graph ``i`` whose first label is
        ``l``. Graphs without nodes yield an all-zero row, and when no graph
        has any node the array has zero columns.
    """
    # Read the labels once and reuse them for both sizing and counting.
    first_labels = [
        [graph.nodes[n]["labels"][0] for n in graph.nodes]
        for graph in graph_db
    ]

    # default=-1 keeps empty inputs from raising and gives them zero columns.
    n_columns = max((label for row in first_labels for label in row), default=-1) + 1

    result = np.zeros((len(graph_db), n_columns))
    for i, row in enumerate(first_labels):
        for label in row:
            result[i, label] += 1

    return result

In [27]:
# Per-dataset summary: number of graphs plus the node/edge count ranges.
graph_datasets = ["AIDS", "COX2"]
graph_datasets_dict = {
    column: []
    for column in ("dataset", "examples", "min_nodes", "max_nodes", "min_edges", "max_edges")
}

for dataset_name in graph_datasets:
    graph_db = tud_to_networkx(COMPLEX_DATA_PATH, dataset_name)
    nodes = np.array([g.number_of_nodes() for g in graph_db])
    edges = np.array([g.number_of_edges() for g in graph_db])

    # One row of the summary table, appended column by column.
    dataset_summary = {
        "dataset": dataset_name,
        "examples": len(graph_db),
        "min_nodes": np.min(nodes),
        "max_nodes": np.max(nodes),
        "min_edges": np.min(edges),
        "max_edges": np.max(edges),
    }
    for column, value in dataset_summary.items():
        graph_datasets_dict[column].append(value)

graph_stats_df = pd.DataFrame(graph_datasets_dict)
graph_stats_df

Unnamed: 0,dataset,examples,min_nodes,max_nodes,min_edges,max_edges
0,AIDS,2000,2,95,1,103
1,COX2,467,32,56,34,59


In [28]:
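# Disabled scratch check, kept for reference: ratio of graphs labelled -1.
# As written, `outs` sums the -1 labels themselves, so the ratio comes out
# negative (see the saved output below), and `pop()` removes the label from
# g.graph['classes'] in place.
# outs = 0
# for g in graph_db:
#     x = g.graph['classes'].pop()
#     if x == -1:
#         outs += x
    
# print(outs / len(graph_db))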

-0.7815845824411135


In [7]:
import pickle

In [8]:
# Export every graph of every dataset as its own pickle file, plus one pickle
# holding the list of class labels (first entry of g.graph['classes']).
for dataset_name in graph_datasets:
    graph_db = tud_to_networkx(COMPLEX_DATA_PATH, dataset_name)
    pickle_dir = COMPLEX_DATA_PATH + dataset_name + "_pickles/"
    # Create the target folder on first run instead of failing on open().
    os.makedirs(pickle_dir, exist_ok=True)

    y = []
    for i, g in enumerate(graph_db):
        # nx.write_gpickle was removed in NetworkX 3.0; plain pickle with the
        # highest protocol writes the same kind of file.
        with open(pickle_dir + str(i) + ".pickle", 'wb') as f:
            pickle.dump(g, f, pickle.HIGHEST_PROTOCOL)
        y.append(g.graph['classes'][0])

    with open(pickle_dir + 'y_' + dataset_name + '.pickle', 'wb') as f:
        pickle.dump(y, f)
    

In [1]:
import sys
sys.path.insert(0, '..')
import data.data_getter

In [2]:
# Load the graphs through the project's data getter and display the
# "X_train" entry of the "AIDS_pickles" dataset.
data_dir = data.data_getter.get_graphs()
aids_pickles = data_dir["AIDS_pickles"]
aids_pickles["X_train"]

[<networkx.classes.graph.Graph at 0x16733888cd0>,
 <networkx.classes.graph.Graph at 0x16733a39810>,
 <networkx.classes.graph.Graph at 0x1673420a290>,
 <networkx.classes.graph.Graph at 0x16732c5e620>,
 <networkx.classes.graph.Graph at 0x16730d028c0>,
 <networkx.classes.graph.Graph at 0x1671ee0be80>,
 <networkx.classes.graph.Graph at 0x16734a05a50>,
 <networkx.classes.graph.Graph at 0x16734098550>,
 <networkx.classes.graph.Graph at 0x167346d09d0>,
 <networkx.classes.graph.Graph at 0x16732fd2410>,
 <networkx.classes.graph.Graph at 0x16733a3aad0>,
 <networkx.classes.graph.Graph at 0x16733d88d90>,
 <networkx.classes.graph.Graph at 0x1673354ae90>,
 <networkx.classes.graph.Graph at 0x167343ccf10>,
 <networkx.classes.graph.Graph at 0x16732e05e10>,
 <networkx.classes.graph.Graph at 0x16733d8b310>,
 <networkx.classes.graph.Graph at 0x16733f21450>,
 <networkx.classes.graph.Graph at 0x1673334cd90>,
 <networkx.classes.graph.Graph at 0x16732fd2dd0>,
 <networkx.classes.graph.Graph at 0x16734a05510>,
