In [1]:
import sys
sys.path.insert(0, '../data')

from rsif.distance_functions import GraphDist, EuclideanDist, ManhattanDist, BagOfWordDist, NumericalReprDist
from rsif.distance import TestDistanceMixin, TrainDistanceMixin
from data_getter import get_glocalkd_dataset
import numpy as np
from pathlib import Path
import pickle
from netrd.distance import NetSimile, PortraitDivergence, DegreeDivergence, IpsenMikhailov,  JaccardDistance, NetLSD
%load_ext autoreload

%autoreload 2

In [2]:
# Directory with the graph datasets; passed to get_glocalkd_dataset below.
DATA_DIR = "../data/graph"
# Output directory where the pickled precomputed-distance objects are written.
PRECOMPUTED_DISTANCES_PATH = Path("../precomputed_distances")
# Create the output directory up front; exist_ok=True makes re-running this cell safe.
PRECOMPUTED_DISTANCES_PATH.mkdir(exist_ok=True)

In [3]:
def precompute_graph_datasets(distances, dataset_list):
    """Precompute and pickle distance objects for graph datasets.

    Each dataset named in ``dataset_list`` is loaded with
    ``get_glocalkd_dataset(DATA_DIR, name)``. A ``TrainDistanceMixin`` is then
    precomputed and pickled for:

    * every class in ``distances`` (wrapped in ``GraphDist``) on ``data["X_graph"]``,
    * ``NumericalReprDist`` on ``data["X_num"]``,
    * ``BagOfWordDist`` on ``data["X"]``.

    Output files are named
    ``PRECOMPUTED_DISTANCES_PATH / "{dataset_name}_{distance_name}_0.pickle"``.
    Existing Euclidean/Manhattan pickles for the dataset are deleted.

    Parameters
    ----------
    distances : iterable of classes
        Graph distance classes (e.g. from ``netrd.distance``). Each must have a
        ``__name__`` and be accepted by ``GraphDist``.
    dataset_list : iterable of str
        Names of the datasets to process.

    Notes
    -----
    The parameter order is distances first, dataset names second; prefer
    keyword arguments at call sites to avoid mixing them up.
    """

    def _pickle_path(dataset_name, distance_name):
        # Single place that defines the output file naming scheme.
        return PRECOMPUTED_DISTANCES_PATH / f"{dataset_name}_{distance_name}_0.pickle"

    def _precompute_and_dump(distance_obj, X, target):
        # Precompute all distances for X and persist the resulting object.
        entire_distance = TrainDistanceMixin(distance_obj)
        entire_distance.precompute_distances(X, n_jobs=-3)
        # Context manager guarantees the file is flushed and closed even if
        # pickling fails (the previous bare open() leaked the handle).
        with open(target, "wb") as pickle_file:
            pickle.dump(entire_distance, pickle_file)

    for dataset_name in dataset_list:
        print(dataset_name)
        data = get_glocalkd_dataset(DATA_DIR, dataset_name)

        # Loop-invariant: the graph representation is the same for every distance.
        X_graph = data["X_graph"]
        for distance in distances:
            print(distance.__name__)
            _precompute_and_dump(
                GraphDist(distance),
                X_graph,
                _pickle_path(dataset_name, distance.__name__),
            )

        # Remove pickles of these distances if present. They are not produced
        # by this function, so their absence must not abort the run.
        for removed_cls in (EuclideanDist, ManhattanDist):
            stale_pickle = _pickle_path(dataset_name, removed_cls.__name__)
            if stale_pickle.exists():
                stale_pickle.unlink()

        _precompute_and_dump(
            NumericalReprDist(),
            data["X_num"],
            _pickle_path(dataset_name, NumericalReprDist.__name__),
        )
        _precompute_and_dump(
            BagOfWordDist(),
            data["X"],
            _pickle_path(dataset_name, BagOfWordDist.__name__),
        )
        

In [None]:
#! This cell may take a particularly long time, but it is only needed for the sensitivity analysis
# Keyword arguments are used on purpose: the function signature is
# (distances, dataset_list), so passing the lists positionally in
# (datasets, distances) order would swap them.
precompute_graph_datasets(
    distances=[NetSimile, PortraitDivergence, DegreeDivergence, IpsenMikhailov, JaccardDistance, NetLSD],
    dataset_list=["COX2", "BZR", "DHFR"],
)

In [9]:
# For experiments
# Keyword arguments are used on purpose: the function signature is
# (distances, dataset_list), so passing the lists positionally in
# (datasets, distances) order would swap them.
precompute_graph_datasets(
    distances=[PortraitDivergence, DegreeDivergence, NetLSD],
    dataset_list=["NCI1", "AIDS", "ENZYMES", "PROTEINS_full", "DD"],
)

BZR
DD
COX2
DHFR
ENZYMES
NCI1
PROTEINS_full


Categorical data

In [4]:
from data_getter import get_categorical_dataset
from pathlib import Path
import Categorical_similarity_measures as csm
from rsif.distance_functions import LinDist, Goodall3Dist, OFDist 

In [5]:
def precalculate_for_categorical(datasets_name, distances):
    """Precompute and pickle distance objects for categorical datasets.

    For every dataset name, ``../data/categorical/{dataset}.csv`` is loaded via
    ``get_categorical_dataset(..., clf="RSIF")`` and its ``"X_cat"`` entry is
    used as the data. Each class in ``distances`` is instantiated with that
    data, wrapped in ``TrainDistanceMixin``, precomputed, and pickled to
    ``PRECOMPUTED_DISTANCES_PATH / "{dataset}_{distance_name}_0.pickle"``.

    Parameters
    ----------
    datasets_name : iterable of str
        Dataset file stems (without the ``.csv`` suffix).
    distances : iterable of classes
        Distance classes whose constructor takes the dataset and which have a
        ``__name__``.
    """
    for dataset in datasets_name:
        print(dataset)
        data = get_categorical_dataset(Path(f"../data/categorical/{dataset}.csv"), clf="RSIF")["X_cat"]
        for dist_class in distances:
            print(dist_class.__name__)
            # The distance object is constructed from the dataset itself.
            dist = dist_class(data)

            entire_distance = TrainDistanceMixin(dist)
            entire_distance.precompute_distances(data, n_jobs=-3)

            output_path = PRECOMPUTED_DISTANCES_PATH / f"{dataset}_{dist_class.__name__}_0.pickle"
            # Context manager guarantees the file is flushed and closed even if
            # pickling fails (the previous bare open() leaked the handle).
            with open(output_path, "wb") as pickle_file:
                pickle.dump(entire_distance, pickle_file)

In [5]:
# Precompute Lin, Goodall3 and OF distances for every categorical dataset.
precalculate_for_categorical(
    [
        "ad_nominal",
        "AID362red_train_allpossiblenominal",
        "apascal_entire_trainvsall",
        "cmc-nominal",
        "Reuters-corn-100",
        "solar-flare_FvsAll-cleaned",
    ],
    [LinDist, Goodall3Dist, OFDist],
)

AID362red_train_allpossiblenominal
OFDist
apascal_entire_trainvsall
OFDist
cmc-nominal
OFDist
Reuters-corn-100
OFDist
solar-flare_FvsAll-cleaned
OFDist


In [18]:
from data_getter import get_sets_data
def precalculate_for_sets(datasets_name):
    """Precompute Manhattan distances over set lengths and print the matrix.

    For each name in ``datasets_name`` the data is loaded with
    ``get_sets_data(Path("../data/mixed"), name, for_rsif=True)``, a
    ``TrainDistanceMixin(ManhattanDist())`` is precomputed on ``data["X"][2]``,
    and the resulting ``distance_matrix`` is printed. Nothing is written to disk.

    Parameters
    ----------
    datasets_name : iterable of str
        Names of the set datasets to process.
    """
    for set_dataset in datasets_name:
        print(set_dataset)
        loaded = get_sets_data(Path("../data/mixed"), set_dataset, for_rsif=True)
        length_distance = TrainDistanceMixin(ManhattanDist())
        # Index 2 of the "X" tuple holds the array of set lengths.
        set_lengths = loaded["X"][2]
        length_distance.precompute_distances(set_lengths, n_jobs=-3)
        print(length_distance.distance_matrix)
# Run the set-length distance precomputation for each set dataset.
precalculate_for_sets(
    ["items", "length", "order"],
)

items
length
order
