In [2]:
import sys
sys.path.insert(0, '..')
from risf.risf_data import RisfData
from risf.distance_functions import *
from data.data_getter import *
import networkx as nx
import numpy as np
from tqdm import tqdm
import netrd

from risf.forest import RandomIsolationSimilarityForest

In [3]:
graph_loop = tqdm(get_graphs(), desc="Datasets (outer loop)", position=0)
num_loop = tqdm(get_numerical_datasets(), desc="Datasets (outer loop)", position=0)
hist_loop = tqdm(get_histograms(), desc="Datasets (outer loop)", position=0)
ts_loop = tqdm(get_time_series(), desc="Datasets (outer loop)", position=0)

Datasets (outer loop): 0it [00:00, ?it/s]

Numerical data - euclidean distance

In [11]:
try:
    num_loop = tqdm(get_numerical_datasets(), desc="Datasets (outer loop)", position=0)

    for dataset in num_loop:
        data = RisfData()
        
        data.add_data(dataset["X_train"], dist = NumEuclidean(), data_transform = lambda x: x)
        
        data.precompute_distances()
        print(data.distances[0].distance_matrix)
        
        # TODO: run RISF
        
        break        
except KeyboardInterrupt: pass 

Datasets (outer loop): 1it [00:00,  1.46it/s]

[[     0.  65036. 259087. ...  64535. 130054. 195084.]
 [ 65036.      0. 258575. ...  65029. 260100. 194566.]
 [259087. 258575.      0. ...  65044.  65033. 130057.]
 ...
 [ 64535.  65029.  65044. ...      0. 259085. 193551.]
 [130054. 260100.  65033. ... 259085.      0. 130052.]
 [195084. 194566. 130057. ... 193551. 130052.      0.]]





Graph data - Jaccard distance, NetSmile distance, Degree divergence distance

In [28]:
try: 
    graph_loop = tqdm(get_graphs(), desc="Datasets (outer loop)", position=0)

    for dataset in graph_loop:
        data = RisfData()
        
        data.add_data(dataset["X_train"], dist = JaccardDist(), data_transform = lambda x: x)
        
        data.precompute_distances()
        print(data.distances[0].distance_matrix)
        
        # TODO: run RISF
        
        break        
except KeyboardInterrupt: pass

In [29]:
print(data.distances[0].distance_matrix)

[[0.         0.82352941 0.82352941 ... 0.92857143 0.7        0.8125    ]
 [0.82352941 0.         0.8        ... 0.8125     0.9375     0.9047619 ]
 [0.82352941 0.8        0.         ... 0.8125     0.9375     0.72222222]
 ...
 [0.92857143 0.8125     0.8125     ... 0.         0.         0.        ]
 [0.7        0.9375     0.9375     ... 0.         0.         0.        ]
 [0.8125     0.9047619  0.72222222 ... 0.         0.         0.        ]]


Graph data - Euclidean distance using some of graph centrality measures

In [7]:
def graphCentralityMeasures(graph):
    centralityMeasures = []
    centralityMeasures.append(np.average(list(nx.degree_centrality(graph).values())))
    centralityMeasures.append(np.average(list(nx.katz_centrality(graph).values())))
    centralityMeasures.append(np.average(list(nx.closeness_centrality(graph).values())))
    centralityMeasures.append(np.average(list(nx.harmonic_centrality(graph).values())))
    
    return np.array(centralityMeasures)

In [8]:
try:
    graph_loop = tqdm(get_graphs(), desc="Datasets (outer loop)", position=0)

    for dataset in graph_loop:
        data = RisfData()
        
        graphsAsCentralityMeasures = []
        for graph in dataset["X_train"]:
            graphsAsCentralityMeasures.append(graphCentralityMeasures(graph))
        
        data.add_data(graphsAsCentralityMeasures, dist = NumEuclidean(), data_transform = lambda x: x)
        
        data.precompute_distances()
        print(data.distances[0].distance_matrix)
        
        # TODO: run RISF
        
        break        
except KeyboardInterrupt: pass 

  X = np.array(X)
Datasets (outer loop): 0it [00:08, ?it/s]

[[0.00000000e+00 8.44153572e-01 8.44153572e-01 ... 3.86317393e-01
  2.85186846e+00 6.85943694e-01]
 [8.44153572e-01 0.00000000e+00 6.16297582e-33 ... 2.36464767e+00
  6.78484002e+00 8.60072130e-03]
 [8.44153572e-01 6.16297582e-33 0.00000000e+00 ... 2.36464767e+00
  6.78484002e+00 8.60072130e-03]
 ...
 [3.86317393e-01 2.36464767e+00 2.36464767e+00 ... 0.00000000e+00
  1.14525075e+00 2.09524956e+00]
 [2.85186846e+00 6.78484002e+00 6.78484002e+00 ... 1.14525075e+00
  0.00000000e+00 6.31911980e+00]
 [6.85943694e-01 8.60072130e-03 8.60072130e-03 ... 2.09524956e+00
  6.31911980e+00 0.00000000e+00]]





Histogram data - Wasserstein distance, Euclidean distance

In [4]:
def toRawHistograms(dataset, key):
    raw_histograms = []
    for hist in dataset:
        raw_histograms.append(hist[key])
    
    return raw_histograms

In [29]:
try:
    hist_loop = tqdm(get_histograms(), desc="Datasets (outer loop)", position=0)

    for dataset in hist_loop:
        data = RisfData()
        
        data.add_data(toRawHistograms(dataset["X_train"], "n_degrees"), dist = WassersteinDist(), data_transform = lambda x: x)
        
        data.precompute_distances()
        print(data.distances[0].distance_matrix)
        
        # TODO: run RISF
        
        break        
except KeyboardInterrupt: pass

Datasets (outer loop): 0it [01:49, ?it/s]

[[0.         1.33333333 2.86666667 ... 4.         2.66666667 1.66666667]
 [1.33333333 0.         1.53333333 ... 3.33333333 2.53333333 2.13333333]
 [2.86666667 1.53333333 0.         ... 2.86666667 2.2        2.6       ]
 ...
 [4.         3.33333333 2.86666667 ... 0.         4.66666667 5.46666667]
 [2.66666667 2.53333333 2.2        ... 4.66666667 0.         1.4       ]
 [1.66666667 2.13333333 2.6        ... 5.46666667 1.4        0.        ]]





In [30]:
# print(data.distances[0].distance_matrix)

In [5]:
try:
    hist_loop = tqdm(get_histograms(), desc="Datasets (outer loop)", position=0)

    for dataset in hist_loop:
        data = RisfData()
        
        data.add_data(toRawHistograms(dataset["X_train"], "n_degrees"), dist = HistEuclidean(), data_transform = lambda x: x)
        
        data.precompute_distances()
        print(data.distances[0].distance_matrix)
        
        # TODO: run RISF
        
        break        
except KeyboardInterrupt: pass

Datasets (outer loop): 0it [00:08, ?it/s]
Datasets (outer loop): 0it [00:29, ?it/s]

[[ 0.  5. 44. ... 42. 54.  4.]
 [ 5.  0. 47. ... 35. 81. 11.]
 [44. 47.  0. ... 10. 38. 56.]
 ...
 [42. 35. 10. ...  0. 60. 62.]
 [54. 81. 38. ... 60.  0. 66.]
 [ 4. 11. 56. ... 62. 66.  0.]]





Time Series data - Cross Correlation distance, Jensen-Shannon Divergence distance, Euclidean distance

In [4]:
ts_loop = tqdm(get_time_series(), desc="Datasets (outer loop)", position=0)

for dataset in ts_loop:
    data = RisfData()
    
    concatenated = np.array([dataset["X_train"], dataset["X_test"]])
    data.add_data(concatenated, dist = CrossCorrelationDist(), data_transform = lambda x: x)

    data.precompute_distances()
    
    # TODO: run RISF
    
    break

Datasets (outer loop): 0it [00:18, ?it/s]
  concatenated = np.array([dataset["X_train"], dataset["X_test"]])
  super().append(np.array(transformed))
Datasets (outer loop): 0it [00:00, ?it/s]


In [44]:
print(data.distances[0].distance_matrix)

[[0.00000000e+00 2.12959089e+08]
 [2.12959089e+08 0.00000000e+00]]


In [5]:
ts_loop = tqdm(get_time_series(), desc="Datasets (outer loop)", position=0)

for dataset in ts_loop:
    data = RisfData()
    
    concatenated = np.array([dataset["X_train"], dataset["X_test"]])
    data.add_data(concatenated, dist = JensenShannonDivDist(), data_transform = lambda x: x)

    data.precompute_distances()
    
    print(data.distances[0].distance_matrix)
    
    # TODO: run RISF
    
    break

  concatenated = np.array([dataset["X_train"], dataset["X_test"]])
Datasets (outer loop): 0it [00:00, ?it/s]

[[ 0. inf]
 [inf  0.]]





In [4]:
ts_loop = tqdm(get_time_series(), desc="Datasets (outer loop)", position=0)

for dataset in ts_loop:
    data = RisfData()
    
    concatenated = np.array([dataset["X_train"], dataset["X_test"]])
    data.add_data(concatenated, dist = TSEuclidean(), data_transform = lambda x: x)

    data.precompute_distances()
    
    # TODO: run RISF
    
    break

Datasets (outer loop): 0it [00:04, ?it/s]
  concatenated = np.array([dataset["X_train"], dataset["X_test"]])
  super().append(np.array(transformed))
Datasets (outer loop): 0it [00:00, ?it/s]


In [5]:
print(data.distances[0].distance_matrix)

[[0.00000000e+00 2.27480438e+09]
 [2.27480438e+09 0.00000000e+00]]


In [3]:
data[0].shape

(326,)

In [4]:
data.distances[0].distance_matrix # Distances between all graphs

IndexError: list index out of range

In [7]:
edge_list = [
    [(1, 2), (1, 3), (2, 3), (3, 4)], #representation of 1 graph. e.g edge list, adj matrix, path to file, 
    [(1, 2), (1, 3), (2, 3)], # this can also be list of object on which you can use your distance function
    [(1, 2), (1, 3), (2, 3), (4, 5)],
    [(2, 1), (3, 1), (3, 2), (5, 4)],
    [(1, 2), (1, 3), (2, 3), (3, 4)],
    [(1, 2), (1, 3), (2, 3)],
    [(1, 2), (1, 3), (2, 3), (4, 5)],
    [(2, 1), (3, 1), (3, 2), (5, 4)],
    [(1, 2), (1, 3), (2, 3), (3, 4)],
    [(1, 2), (1, 3), (2, 3)],
    [(1, 2), (1, 3), (2, 3), (4, 5)],
    [(2, 1), (3, 1), (3, 2), (5, 4)]
]

test_edge_list = [
     [(1, 2), (1, 3), (2, 3), (3, 4)],
     [(25,30)]
]

def edge_list_to_networkx(edge_list):
    g = nx.Graph() # on this object I can calulate distance
    g.add_edges_from(edge_list)
    return g

vectors = np.random.rand(12,5) # 12 rows 5 columns

test_vectors = np.array([
    vectors[0],
    np.array([1000,1000,1000,1000,1000])
])

array([[0.42363088, 0.20807228, 0.0120376 , 0.84458506, 0.16594805],
       [0.84540164, 0.36490684, 0.5781003 , 0.14901948, 0.91097809],
       [0.24263709, 0.10878532, 0.33579733, 0.48678123, 0.55444676],
       [0.5903954 , 0.51851788, 0.82696006, 0.53977599, 0.10410998],
       [0.77999506, 0.12829252, 0.11351366, 0.07558671, 0.2988314 ],
       [0.04076404, 0.59255095, 0.7420449 , 0.98995102, 0.41004673],
       [0.66847445, 0.07501076, 0.33693187, 0.06449645, 0.93980331],
       [0.85941928, 0.7492603 , 0.9560178 , 0.39077434, 0.75227362],
       [0.12756264, 0.22042372, 0.86382024, 0.15234442, 0.36836284],
       [0.17365713, 0.41027356, 0.57997276, 0.59299755, 0.9401371 ],
       [0.37685277, 0.33705456, 0.69685201, 0.0013039 , 0.59854813],
       [0.40105161, 0.78630147, 0.13004601, 0.20077383, 0.05919401]])

In [7]:
data = RisfData()
#we need to add every column separately
data.add_data(edge_list , dist = netrd.distance.JaccardDistance(), data_transform = edge_list_to_networkx)
data.add_data(vectors, dist = lambda x,y: x@y)

data.precompute_distances() #precompute all distances

clf = RandomIsolationSimilarityForest(random_state=0, distance=data.distances).fit(data)

test_data = data.create_test_data([test_edge_list, test_vectors]) # we must create test data based on what classifier was trained on

clf.predict(test_data)

array([0, 0])

In [8]:
data.distances[0].distance_matrix # Distances between all graphs

array([[0.        , 0.25      , 0.4       , 0.66666667, 0.        ,
        0.25      , 0.4       , 0.66666667, 0.        , 0.25      ,
        0.4       , 0.66666667],
       [0.25      , 0.        , 0.25      , 0.6       , 0.25      ,
        0.        , 0.25      , 0.6       , 0.25      , 0.        ,
        0.25      , 0.6       ],
       [0.4       , 0.25      , 0.        , 0.66666667, 0.4       ,
        0.25      , 0.        , 0.66666667, 0.4       , 0.25      ,
        0.        , 0.66666667],
       [0.66666667, 0.6       , 0.66666667, 0.        , 0.66666667,
        0.6       , 0.66666667, 0.        , 0.66666667, 0.6       ,
        0.66666667, 0.        ],
       [0.        , 0.25      , 0.4       , 0.66666667, 0.        ,
        0.25      , 0.4       , 0.66666667, 0.        , 0.25      ,
        0.4       , 0.66666667],
       [0.25      , 0.        , 0.25      , 0.6       , 0.25      ,
        0.        , 0.25      , 0.6       , 0.25      , 0.        ,
        0.25      ,

In [9]:
data[0][5:11] # You can access elemets similarly as in numpy array

array([<networkx.classes.graph.Graph object at 0x00000191C3BF79A0>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF69B0>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF6A10>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF6A70>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF6B30>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF6E60>],
      dtype=object)

In [10]:
np.array(list(data[0][5:10]), dtype=object) # You can give this into the RisfData straightaway

array([<networkx.classes.graph.Graph object at 0x00000191C3BF79A0>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF69B0>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF6A10>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF6A70>,
       <networkx.classes.graph.Graph object at 0x00000191C3BF6B30>],
      dtype=object)