In [2]:
import pandas as pd
import numpy as np
import string
import random
from scipy.io import loadmat
# import ogb
# from ogb.nodeproppred import NodePropPredDataset
import collections
import os
# Root folder that every loader below resolves its files against.
# setdefault keeps a DATASET_DIR that was already exported in the environment,
# so the notebook is not tied to this one machine-specific absolute path.
os.environ.setdefault("DATASET_DIR", "/Users/rustamwarwick/Documents/Warwick/d3-gnn/datasets")

# Tag-Ask-Ubuntu: 3029 tags, 271233 simplices, 1468584 star-expansion

In [4]:
class TagAskUbuntu:
    """Loader for the tags-ask-ubuntu simplex files found under ``$DATASET_DIR/tags-ask-ubuntu``.

    Builds two adjacency maps:

    * ``q2t`` -- simplex label (as ``str``) -> list of node labels in that simplex
    * ``t2q`` -- node label -> list of simplex labels (as ``str``) containing it
    """

    def __init__(self):
        """Read the four input files and fill ``self.q2t`` / ``self.t2q``.

        ``nverts`` gives, per simplex, how many consecutive entries of the
        ``simplices`` file belong to it. Node ids in ``simplices`` are used as
        1-based indices into the node-label column.
        """
        base_dir = os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu")
        n_vertices = pd.read_csv(os.path.join(base_dir, "tags-ask-ubuntu-nverts.txt"), header=None)[0].values
        simplices = pd.read_csv(os.path.join(base_dir, "tags-ask-ubuntu-simplices.txt"), header=None)[0].values
        # Space-delimited file; only the second column (the label) is kept.
        n_labels = pd.read_csv(os.path.join(base_dir, "tags-ask-ubuntu-node-labels.txt"), header=None, delimiter=" ", usecols=[1])[1].values
        # Space-delimited file; only the first column is kept.
        simplex_labels = pd.read_csv(os.path.join(base_dir, "tags-ask-ubuntu-simplex-labels.txt"), header=None, delimiter=" ")[0].values
        self.q2t = collections.defaultdict(list) # simplex -> [nodes]
        self.t2q = collections.defaultdict(list) # nodes -> [simplex]
        index = 0  # offset of the current simplex inside the flat `simplices` array
        for simplex_idx in range(n_vertices.shape[0]):
            s = str(simplex_labels[simplex_idx])
            for j in simplices[index:index+n_vertices[simplex_idx]]:
                n_label = n_labels[j-1]  # node ids are 1-based
                self.t2q[n_label].append(s)
                self.q2t[s].append(n_label)
            index+=n_vertices[simplex_idx]

    def create_files(self):
        """Write both maps as text files, one ``key,value1,value2,...`` line per key.

        Output goes to ``tags-ask-ubuntu[question-tag].txt`` (from ``q2t``) and
        ``tags-ask-ubuntu[tag-question].txt`` (from ``t2q``) inside the dataset folder.
        """
        def create_file(my_dict, destination):
            with open(destination,"w") as f:
                for key, val in my_dict.items():
                    # map(str, ...) so labels that pandas parsed as numbers do not break the join.
                    f.write(f'{key},{",".join(map(str, val))}\n')
        base_dir = os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu")
        create_file(self.q2t, os.path.join(base_dir, "tags-ask-ubuntu[question-tag].txt"))
        create_file(self.t2q, os.path.join(base_dir, "tags-ask-ubuntu[tag-question].txt"))

    def generate_statistics(self):
        """Return ``(num_hyperedges, num_nodes)``.

        The order is simplex count first, node count second, i.e.
        ``(len(self.q2t), len(self.t2q))``.
        """
        num_nodes = len(self.t2q)
        num_hyperedges = len(self.q2t)
        return num_hyperedges, num_nodes

In [7]:
# NOTE(review): `res` is not defined in any visible cell -- it must exist in the
# kernel before this cell runs. The original first line was garbled
# ("acc = 0unique i,j in ..."); it is restored here as an initialisation
# followed by a loop over the rows of `res`.
acc = 0
for i, j in res.iterrows():
    # Count the rows before position i whose column 0 equals this row's column 1.
    acc += res[:i][res[:i][0] == j[1]].shape[0]
acc = acc / res.shape[0]
# Displayed pair: the per-row average of that count, and the mean group size when grouping by column 0.
acc, res.groupby(0).count()[1].mean()

(121.25355857915069, 10.28464271614686)

# DBLP 

In [63]:
class DBLP:
    """Loader for the coauth-DBLP-full simplex files under ``$DATASET_DIR/coauth-DBLP-full``.

    ``p2a`` maps a generated simplex id (``"h<row index>"``) to the node ids it
    contains; ``a2p`` maps each node id to the simplex ids it appears in.
    """

    def __init__(self):
        """Read the nverts/simplices files and populate ``self.p2a`` and ``self.a2p``."""
        dataset_dir = os.path.join(os.environ["DATASET_DIR"], "coauth-DBLP-full")
        sizes = pd.read_csv(os.path.join(dataset_dir, "coauth-DBLP-full-nverts.txt"), header=None)[0].values
        members = pd.read_csv(os.path.join(dataset_dir, "coauth-DBLP-full-simplices.txt"), header=None)[0].values
        self.p2a = collections.defaultdict(list) # simplex[publication] -> nodes[author]
        self.a2p = collections.defaultdict(list) # nodes[author] -> simplex[publication]
        start = 0
        for simplex_idx, size in enumerate(sizes):
            # Each simplex owns the next `size` entries of the flat member array.
            stop = start + size
            publication = "h" + str(simplex_idx)
            for author in members[start:stop]:
                self.a2p[author].append(publication)
                self.p2a[publication].append(author)
            start = stop

    def create_files(self):
        """Dump both maps as ``key,value1,value2,...`` lines into the dataset folder."""
        out_dir = os.path.join(os.environ["DATASET_DIR"], "coauth-DBLP-full")
        targets = (
            (self.p2a, "coauth-DBLP-full[publication-author].txt"),
            (self.a2p, "coauth-DBLP-full[author-publication].txt"),
        )
        for mapping, filename in targets:
            with open(os.path.join(out_dir, filename), "w") as f:
                f.writelines(
                    f'{str(key)},{",".join(map(str, val))}\n'
                    for key, val in mapping.items()
                )

    def generate_statistics(self):
        """Return ``(number of simplices, number of nodes)``."""
        n_publications = len(self.p2a)
        n_authors = len(self.a2p)
        return n_publications, n_authors

In [64]:
# Parse the coauth-DBLP-full input files (located via DATASET_DIR) into the p2a / a2p maps.
a = DBLP()

In [65]:
# Write the [publication-author] and [author-publication] text files into the dataset folder.
a.create_files()

# OGB-Products

In [24]:
class OGBProducts:
    """Export the ``ogbn-products`` graph as CSV files under ``$DATASET_DIR/ogb-products``.

    NOTE(review): ``NodePropPredDataset`` is expected to come from
    ``ogb.nodeproppred``; that import is commented out in the notebook's import
    cell and has to be re-enabled before this class can be instantiated.
    """

    def __init__(self, seed=None):
        """Load the dataset and keep the shuffled edges, node features and labels on the instance.

        Parameters
        ----------
        seed : optional
            Passed to ``DataFrame.sample`` as ``random_state`` so the edge
            shuffle can be reproduced. ``None`` (the default) keeps the shuffle
            unseeded.
        """
        dataset = NodePropPredDataset(name = "ogbn-products", root = 'dataset/')
        # edge_index is transposed so each row is one edge; sample(frac=1) shuffles the row order.
        self.shuffled_topology = pd.DataFrame(dataset.graph['edge_index'].T).sample(frac=1, random_state=seed)
        self.features = pd.DataFrame(dataset.graph["node_feat"])
        self.labels = pd.DataFrame(dataset.labels)

    def save(self):
        """Write ``edges.csv``, ``node_features.csv`` and ``node_labels.csv`` without header rows.

        Edges are written without the index column; features and labels keep
        the DataFrame index as their first column.
        """
        out_dir = os.path.join(os.environ["DATASET_DIR"], "ogb-products")
        self.shuffled_topology.to_csv(os.path.join(out_dir, "edges.csv"), header=False, index=False)
        self.features.to_csv(os.path.join(out_dir, "node_features.csv"), header=False)
        self.labels.to_csv(os.path.join(out_dir, "node_labels.csv"), header=False)

# Reddit Hyperlinks

In [28]:
class RedditHyperlinks:
    """Load the two soc-redditHyperlinks tables and re-save them tab-separated.

    NOTE(review): the inputs are named ``.tsv`` but are parsed with pandas'
    default comma separator, and ``save`` overwrites the very files that were
    read -- confirm both are intended before re-running on fresh data.
    """

    def __init__(self):
        """Read the body and title files (no header row) into DataFrames."""
        folder = os.path.join(os.environ["DATASET_DIR"], "RedditHyperlinks")
        body_path = os.path.join(folder, "soc-redditHyperlinks-body.tsv")
        title_path = os.path.join(folder, "soc-redditHyperlinks-title.tsv")
        self.dataset_body = pd.read_csv(body_path, header=None)
        self.dataset_title = pd.read_csv(title_path, header=None)

    def save(self):
        """Write both frames back to their source paths, tab-separated, without index or header."""
        folder = os.path.join(os.environ["DATASET_DIR"], "RedditHyperlinks")
        outputs = (
            (self.dataset_body, "soc-redditHyperlinks-body.tsv"),
            (self.dataset_title, "soc-redditHyperlinks-title.tsv"),
        )
        for frame, filename in outputs:
            frame.to_csv(os.path.join(folder, filename), sep='\t', index=False, header=False)

In [29]:
# Load both hyperlink tables. Note: this rebinds `a`, which earlier held the DBLP instance.
a = RedditHyperlinks()

  if await self.run_code(code, result, async_=asy):


In [30]:
# Re-write both tables tab-separated, to the same paths they were loaded from.
a.save()

# DGraph

In [24]:
# Open the DGraphFin archive; the cells below read its arrays by key
# ('x', 'y', 'edge_index', 'edge_timestamp').
npzarr = np.load(os.path.join(os.environ["DATASET_DIR"], "DGraphFin", "dgraphfin.npz"))

In [72]:
def create_edge_list_file(npzarr):
    """Write ``DGraphFin/edge-list.csv`` with the edges sorted by ascending timestamp.

    Each output row is the two ``edge_index`` entries followed by the
    matching ``edge_timestamp``; no header or index column is written.

    Parameters
    ----------
    npzarr : mapping
        Must provide ``'edge_index'`` (one row per edge) and
        ``'edge_timestamp'`` (one value per edge).
    """
    timestamps = npzarr['edge_timestamp']
    endpoints = npzarr['edge_index']
    by_time = timestamps.argsort()
    # column_stack appends the 1-D timestamps as a trailing column.
    rows = np.column_stack((endpoints[by_time], timestamps[by_time]))
    destination = os.path.join(os.environ["DATASET_DIR"], "DGraphFin", "edge-list.csv")
    pd.DataFrame(rows).to_csv(destination, index=False, header=False)

In [81]:
def create_feature_and_labels(npzarr):
    """Export node features and the 0/1-labelled nodes of the DGraphFin archive.

    * ``npzarr['x']`` is cast to float32 and stored with ``np.save`` under
      ``DGraphFin/node_features`` (numpy appends the ``.npy`` suffix).
    * ``npzarr['y']`` is filtered to rows whose first column equals 0 or 1 and
      written to ``DGraphFin/node_labels.csv`` as ``index,label`` rows without
      a header.
    """
    out_dir = os.path.join(os.environ["DATASET_DIR"], "DGraphFin")

    features32 = npzarr['x'].astype("float32")
    np.save(os.path.join(out_dir, "node_features"), features32)

    all_labels = pd.DataFrame(npzarr['y'])
    first_col = all_labels[0]
    is_zero = first_col == 0
    is_one = first_col == 1
    # Rows with any other label value are dropped; the original row index is kept.
    kept = all_labels[is_zero | is_one]
    kept.to_csv(os.path.join(out_dir, "node_labels.csv"), index=True, header=False)

In [82]:
# Export node_features(.npy) and node_labels.csv for the DGraphFin archive loaded above.
create_feature_and_labels(npzarr)

In [80]:
# Inspection only: preview the feature matrix after the float32 cast (nothing is stored).
npzarr['x'].astype("float32")

array([[ 0.,  5., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [ 0.,  5., -1., ..., -1., -1., -1.],
       ...,
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.]], dtype=float32)

In [None]:
# NOTE(review): leftover scratch -- this references the `astype` method without calling it.
npzarr['x'].astype

In [56]:
# Indices that sort the edges by ascending timestamp (same expression as inside create_edge_list_file).
order = npzarr['edge_timestamp'].argsort()

In [67]:
# Inspection only: shape of the timestamp array after reordering and transposing.
npzarr['edge_timestamp'][order].T.shape

(4300999,)

array([[2229506, 1955006,       1],
       [1260099, 2257207,       1],
       [2014056,  164472,       1],
       ...,
       [ 697121,  311084,     821],
       [2117711, 3684148,     821],
       [2036575,  463466,     821]])

In [35]:
# NOTE(review): `features` is not assigned by any visible top-level cell (it only exists as a
# local inside OGBProducts.__init__), so this cell relies on hidden kernel state.
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,1.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,1.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3700545,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3700546,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3700547,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3700548,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [36]:
# Inspection only: the last row of the raw feature matrix.
npzarr["x"][-1]

array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  0., -1., -1.,
       -1., -1., -1., -1.])

In [31]:
# Inspection only: raw edge endpoints, in archive order (not sorted by timestamp).
npzarr['edge_index']

array([[ 476699, 2915516],
       [ 347800, 1271242],
       [ 154317, 2104635],
       ...,
       [1894383, 1147595],
       [1895741, 1314434],
       [1206795, 2072636]])