In [2]:
import pandas as pd
import numpy as np
import string
import random
# import ogb
# from ogb.nodeproppred import NodePropPredDataset
import collections
import os
# Root folder that every dataset class below reads from and writes to.
# A DATASET_DIR that is already exported in the environment wins, so the notebook can run
# on another machine without editing this cell; the literal path is only the fallback.
os.environ.setdefault("DATASET_DIR", "/Users/rustamwarwick/Documents/Warwick/d3-gnn/datasets")

# Tag-Ask-Ubuntu

In [3]:
class TagAskUbuntu:
    """Two-way index between simplices and node labels of the ``tags-ask-ubuntu`` files.

    Attributes:
        q2t: simplex label (as ``str``) -> list of node labels in that simplex.
        t2q: node label -> list of simplex labels (as ``str``) containing that node.

    All files are read from ``$DATASET_DIR/tags-ask-ubuntu``.
    """

    def __init__(self):
        """Read the four input files and build ``q2t`` / ``t2q``."""
        n_vertices = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu","tags-ask-ubuntu-nverts.txt"), header=None)[0].values
        simplices = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu","tags-ask-ubuntu-simplices.txt"), header=None)[0].values
        n_labels = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu","tags-ask-ubuntu-node-labels.txt"), header=None, delimiter=" ", usecols=[1])[1].values
        simplex_labels = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu","tags-ask-ubuntu-simplex-labels.txt"), header=None, delimiter=" ")[0].values
        self.q2t = collections.defaultdict(list) # simplex -> [nodes]
        self.t2q = collections.defaultdict(list) # nodes -> [simplex]
        # `simplices` is one flat array; n_vertices[k] says how many consecutive
        # entries belong to simplex k, and `index` is where simplex k starts.
        index = 0
        for simplex_idx in range(n_vertices.shape[0]):
            # Simplices are keyed by their label, so simplices that share a label
            # end up merged under a single key.
            s = str(simplex_labels[simplex_idx])
            for j in simplices[index:index+n_vertices[simplex_idx]]:
                # Node ids are shifted by one to index the label array (ids start at 1).
                n_label = n_labels[j-1]
                self.t2q[n_label].append(s)
                self.q2t[s].append(n_label)
            index+=n_vertices[simplex_idx]

    def create_files(self):
        """Write both indexes as text files, one ``key,member,member,...`` line per key."""
        def create_file(my_dict, destination):
            with open(destination,"w") as f:
                for key, val in my_dict.items():
                    # map(str, ...) keeps this working when pandas parsed a label as a
                    # non-string value (a number or NaN); plain join would raise TypeError.
                    f.write(f'{key},{",".join(map(str, val))}\n')
        create_file(self.q2t, os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu","tags-ask-ubuntu[question-tag].txt"))
        create_file(self.t2q, os.path.join(os.environ["DATASET_DIR"], "tags-ask-ubuntu","tags-ask-ubuntu[tag-question].txt"))

    def generate_statistics(self):
        """Return ``(number of simplices, number of nodes)``, i.e. ``(len(q2t), len(t2q))``."""
        num_nodes = len(self.t2q)
        num_hyperedges = len(self.q2t)
        return num_hyperedges, num_nodes
        
        

In [7]:
# NOTE(review): `res` is not defined in any visible cell, so this cell depends on
# leftover kernel state — TODO confirm where it comes from before re-running.
# For each row, count the rows in res[:i] whose column 0 equals this row's column 1,
# then average that count over all rows.
acc = 0
for i, j in res.iterrows():
    acc += res[:i][res[:i][0] == j[1]].shape[0]
acc = acc / res.shape[0]
# Displayed pair: the average above, and the mean group size when grouping by column 0.
acc, res.groupby(0).count()[1].mean()

(121.25355857915069, 10.28464271614686)

# DBLP 

In [63]:
class DBLP:
    """Two-way index between simplices and nodes of the ``coauth-DBLP-full`` files.

    Attributes:
        p2a: simplex id (``"h<k>"`` for the k-th simplex) -> list of node ids.
        a2p: node id -> list of simplex ids that contain it.
    """

    def __init__(self):
        """Read the nverts/simplices files under ``$DATASET_DIR`` and fill both indexes."""
        folder = os.path.join(os.environ["DATASET_DIR"], "coauth-DBLP-full")
        sizes = pd.read_csv(os.path.join(folder, "coauth-DBLP-full-nverts.txt"), header=None)[0].values
        members = pd.read_csv(os.path.join(folder, "coauth-DBLP-full-simplices.txt"), header=None)[0].values
        self.p2a = collections.defaultdict(list)  # simplex[publication] -> nodes[author]
        self.a2p = collections.defaultdict(list)  # nodes[author] -> simplex[publication]
        # `members` is flat; each simplex owns the next `size` consecutive entries.
        start = 0
        for number, size in enumerate(sizes):
            stop = start + size
            publication = "h" + str(number)
            for author in members[start:stop]:
                self.a2p[author].append(publication)
                self.p2a[publication].append(author)
            start = stop

    def create_files(self):
        """Write each index as a text file with one ``key,member,member,...`` line per key."""
        folder = os.path.join(os.environ["DATASET_DIR"], "coauth-DBLP-full")

        def dump(mapping, filename):
            with open(os.path.join(folder, filename), "w") as out:
                for key, values in mapping.items():
                    out.write(str(key) + "," + ",".join(map(str, values)) + "\n")

        dump(self.p2a, "coauth-DBLP-full[publication-author].txt")
        dump(self.a2p, "coauth-DBLP-full[author-publication].txt")

    def generate_statistics(self):
        """Return ``(len(p2a), len(a2p))``: the simplex count followed by the node count."""
        publication_count = len(self.p2a)
        author_count = len(self.a2p)
        return publication_count, author_count
        

In [64]:
# Build the DBLP indexes; the constructor reads the input files under DATASET_DIR.
a = DBLP()

In [65]:
# Write both index files next to the inputs (existing files are overwritten).
a.create_files()

# MAG-history

In [4]:
class MAGHistory:
    """Two-way index between simplices and nodes of the ``coauth-MAG-history`` files.

    Attributes:
        p2a: simplex id (``"h<k>"`` for the k-th simplex) -> list of node ids.
        a2p: node id -> list of simplex ids that contain it.
    """

    def __init__(self):
        """Read the nverts/simplices files under ``$DATASET_DIR`` and fill both indexes."""
        n_vertices = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "coauth-MAG-history","coauth-MAG-history-nverts.txt"), header=None)[0].values
        simplices = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "coauth-MAG-history","coauth-MAG-history-simplices.txt"), header=None)[0].values
        self.p2a = collections.defaultdict(list) # simplex[publication] -> nodes[author]
        self.a2p = collections.defaultdict(list) # nodes[author] -> simplex[publication]
        # `simplices` is flat; n_vertices[k] consecutive entries belong to simplex k,
        # and `index` is the offset where simplex k starts.
        index = 0
        for simplex_idx in range(n_vertices.shape[0]):
            ids = "h"+str(simplex_idx)
            for j in simplices[index:index+n_vertices[simplex_idx]]:
                self.a2p[j].append(ids)
                self.p2a[ids].append(j)
            index+=n_vertices[simplex_idx]

    def create_files(self):
        """Write each index as a text file with one ``key,member,member,...`` line per key."""
        def create_file(my_dict, destination):
            with open(destination,"w") as f:
                for key, val in my_dict.items():
                    f.write(f'{str(key)},{",".join(map(str, val))}\n')
        # The publication->author file is named after this dataset; it previously carried
        # the DBLP prefix, a copy-paste slip that mislabelled the output.
        create_file(self.p2a, os.path.join(os.environ["DATASET_DIR"], "coauth-MAG-history","coauth-MAG-history[publication-author].txt"))
        create_file(self.a2p, os.path.join(os.environ["DATASET_DIR"], "coauth-MAG-history","coauth-MAG-history[author-publication].txt"))

    def generate_statistics(self):
        """Return ``(len(p2a), len(a2p))``: the simplex count followed by the node count."""
        return len(self.p2a),len(self.a2p)
        

In [5]:
# Build the MAG-history indexes; this rebinds `a`, replacing whatever it held before.
a = MAGHistory()

# Threads-Math-SX

In [11]:
class ThreadsMathSX:
    """Two-way index between simplices and nodes of the ``threads-math-sx`` files.

    Attributes:
        q2t: simplex id (``"h<k>"`` for the k-th simplex) -> list of node ids.
        t2q: node id -> list of simplex ids that contain it.

    NOTE(review): the question/tag naming mirrors the tags datasets; confirm that the
    nodes of this dataset really are tags.
    """

    def __init__(self):
        """Read the nverts/simplices files under ``$DATASET_DIR`` and fill both indexes."""
        n_vertices = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "threads-math-sx","threads-math-sx-nverts.txt"), header=None)[0].values
        simplices = pd.read_csv(os.path.join(os.environ["DATASET_DIR"], "threads-math-sx","threads-math-sx-simplices.txt"), header=None)[0].values
        self.q2t = collections.defaultdict(list) # simplex[Question] -> nodes[Tag]
        self.t2q = collections.defaultdict(list) # nodes[Tag] -> simplex[Question]
        # `simplices` is flat; n_vertices[k] consecutive entries belong to simplex k,
        # and `index` is the offset where simplex k starts.
        index = 0
        for simplex_idx in range(n_vertices.shape[0]):
            ids = "h"+str(simplex_idx)
            for j in simplices[index:index+n_vertices[simplex_idx]]:
                self.t2q[j].append(ids)
                self.q2t[ids].append(j)
            index+=n_vertices[simplex_idx]

    def create_files(self):
        """Write each index as a text file with one ``key,member,member,...`` line per key."""
        def create_file(my_dict, destination):
            with open(destination,"w") as f:
                for key, val in my_dict.items():
                    f.write(f'{str(key)},{",".join(map(str, val))}\n')
        create_file(self.q2t, os.path.join(os.environ["DATASET_DIR"], "threads-math-sx","threads-math-sx[question-tag].txt"))
        create_file(self.t2q, os.path.join(os.environ["DATASET_DIR"], "threads-math-sx","threads-math-sx[tag-question].txt"))

    def generate_statistics(self):
        """Return ``(len(q2t), len(t2q))``: the simplex count followed by the node count."""
        # This class stores its indexes as q2t/t2q; the p2a/a2p names used by the
        # co-authorship classes do not exist here and raised AttributeError.
        return len(self.q2t),len(self.t2q)
        

In [12]:
# Build the threads-math-sx indexes (rebinding `a`) and write both index files
# next to the inputs under DATASET_DIR.
a = ThreadsMathSX()
a.create_files()

# OGB-Products

In [24]:
class OGBProducts:
    """Export the ``ogbn-products`` graph as CSV files under ``$DATASET_DIR/ogb-products``.

    Attributes:
        shuffled_topology: frame built from the transposed ``edge_index``, rows shuffled.
        features: frame built from ``graph["node_feat"]``.
        labels: frame built from the dataset's ``labels``.
    """

    def __init__(self, root='dataset/', seed=None):
        """Load the dataset through OGB and keep the three frames on the instance.

        Args:
            root: folder handed to ``NodePropPredDataset`` as its ``root`` argument.
            seed: ``random_state`` for the edge shuffle; ``None`` keeps it unseeded.
        """
        # Imported here because the notebook's top-level import of ogb is commented out;
        # this keeps the class definable without the package and usable with it.
        from ogb.nodeproppred import NodePropPredDataset

        dataset = NodePropPredDataset(name = "ogbn-products", root = root)
        # The frames are stored on self so that save() can reach them; as plain locals
        # they were discarded when __init__ returned.
        self.shuffled_topology = pd.DataFrame(dataset.graph['edge_index'].T).sample(frac=1, random_state=seed)
        self.features = pd.DataFrame(dataset.graph["node_feat"])
        self.labels = pd.DataFrame(dataset.labels)

    def save(self):
        """Write edges, node features and node labels as header-less CSV files."""
        out_dir = os.path.join(os.environ["DATASET_DIR"], "ogb-products")
        # Create the output folder on first use instead of failing inside to_csv.
        os.makedirs(out_dir, exist_ok=True)
        # Edges are written without the index; features and labels keep it as the
        # first column.
        self.shuffled_topology.to_csv(os.path.join(out_dir, "edges.csv"), header=False, index=False)
        self.features.to_csv(os.path.join(out_dir, "node_features.csv"), header=False)
        self.labels.to_csv(os.path.join(out_dir, "node_labels.csv"), header=False)

# Reddit Hyperlinks

In [28]:
class RedditHyperlinks:
    """Load the two ``soc-redditHyperlinks`` files and rewrite them tab-separated.

    NOTE(review): the files are read with the default comma separator and written back
    to the same paths with tabs, so ``save`` replaces the originals — confirm this
    in-place conversion is intended.
    """

    def __init__(self):
        """Read the body and title files from ``$DATASET_DIR/RedditHyperlinks`` without a header row."""
        folder = os.path.join(os.environ["DATASET_DIR"], "RedditHyperlinks")
        body_path = os.path.join(folder, "soc-redditHyperlinks-body.tsv")
        self.dataset_body = pd.read_csv(body_path, header=None)
        title_path = os.path.join(folder, "soc-redditHyperlinks-title.tsv")
        self.dataset_title = pd.read_csv(title_path, header=None)

    def save(self):
        """Write both frames to the paths they were read from, tab-separated, no index or header."""
        tsv_options = {"sep": '\t', "index": False, "header": False}
        body_path = os.path.join(os.environ["DATASET_DIR"], "RedditHyperlinks", "soc-redditHyperlinks-body.tsv")
        self.dataset_body.to_csv(body_path, **tsv_options)
        title_path = os.path.join(os.environ["DATASET_DIR"], "RedditHyperlinks", "soc-redditHyperlinks-title.tsv")
        self.dataset_title.to_csv(title_path, **tsv_options)

In [29]:
# Load both Reddit hyperlink files into memory (rebinding `a`).
a = RedditHyperlinks()

  if await self.run_code(code, result, async_=asy):


In [30]:
# Assuming `a` is still the RedditHyperlinks instance: rewrites both .tsv files
# in place, tab-separated, replacing the files that were read.
a.save()