# Sensitivity Analysis

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')
sys.path.insert(0, '../data')

from utils import (init_results, 
                            ObjectsSelector,
                            get_dataset,
                            perform_experiment_simple,
                            SEED,
                            PRECOMPUTED_DISTANCES_PATH,
                            get_binary_distances_choice,
                            experiment_risf_complex)

import numpy as np
from netrd.distance import NetSimile, PortraitDivergence, DegreeDivergence, IpsenMikhailov, NetLSD
from risf.distance_functions import *
from risf.distance import SelectiveDistance
from pathlib import Path
import pandas as pd

# Seed NumPy's global RNG with the project-wide seed.
np.random.seed(SEED)

# Classifier name passed to init_results by the RISF-only experiments below.
CLF = "RISF"

# Make sure the directory for precomputed distances exists. parents=True also
# creates missing intermediate directories instead of raising FileNotFoundError.
PRECOMPUTED_DISTANCES_PATH.mkdir(parents=True, exist_ok=True)

# Directory holding the datasets of each supported data type.
DATA_DIRS = {
    "graph": Path("../data/graph"),
    "numerical": Path("../data/numerical"),
    "binary": Path("../data/categorical"),
    "timeseries": Path("../data/timeseries"),
    "nlp": Path("../data/adBench/NLP_by_RoBERTa/"),
}

In [2]:
def test_one_parameter(datasets, distances_func, param_name=None, options=None, classifiers = ["RISF", "IForest"]):
    # To allow default parameters run
    if param_name is None:
        param_name="default"
    if options is None:
        options = [1]

    results_all = []
    for clf in classifiers:
        print(clf)
        for data_type in datasets.keys():
            print(data_type)
            for dataset_name in datasets[data_type]:
                for option in options:
                    clf_kwargs = dict() if param_name == "default" else {param_name: option}
                    
                    data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, clf)
                    if "fixed" in param_name:
                        if isinstance(option, float):
                            option = int(option * data["X"].shape[1])
                        
                        min_n = 1
                        if distances_func in [cosine_projection, chebyshev_projection]:
                            min_n = 2
                            if option == 1:
                                continue
                    
                        clf_kwargs["max_n"] = option
                        if "not_fixed" in param_name:
                            distances = [SelectiveDistance(distances_func, min_n, option)]
                        else:
                            distances = [SelectiveDistance(distances_func, option, option)]
                    else:
                        distances = distances_func(data["X"], data_type)
                    
                    if clf != "RISF":
                        aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs)
                    else:
                        if data_type in ["numerical", "nlp", "timeseries"]:
                            aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, distances = distances)
                        else:     
                            aucs = experiment_risf_complex(data, distances, selected_obj_ratio=0.5, clf_kwargs=clf_kwargs)
                            
                    results = init_results(clf, dataset_name, data_type, aucs, clf_kwargs)
                    results_all.extend(results)

    pd.DataFrame(results_all).to_csv(f"../results/{param_name}.csv", index=False)


#### First, let's verify which vector size (`max_n`) works best for numerical datasets

In [28]:
# Datasets used for the vector-size (max_n) sweep, grouped by data type.
DATASETS = dict(
    numerical=["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    timeseries=["TwoLeadECG"],
    nlp=["agnews_1"],
)
distances = [manhattan_projection, cosine_projection, chebyshev_projection]
fixed_size = [True, False]
# Integer options are used as given; test_one_parameter converts float options
# to a fraction of the number of columns of X.
max_n = [1, 2, 3, 5, 0.25, 0.5, 0.75, 1.0]

for distance in distances:
    for fixed in fixed_size:
        param_name = f"{distance.__name__}-{'fixed' if fixed else 'not_fixed'}-max_n"
        print(param_name)
        # The fixed-size manhattan configuration is not run (its name is still printed).
        if not fixed or distance != manhattan_projection:
            test_one_parameter(DATASETS, distance, param_name, max_n, classifiers=["RISF"])

manhattan_projection-fixed-max_n
manhattan_projection-not_fixed-max_n
RISF
numerical
timeseries
nlp
cosine_projection-fixed-max_n
RISF
numerical
timeseries
nlp
cosine_projection-not_fixed-max_n
RISF
numerical
timeseries
nlp
chebyshev_projection-fixed-max_n
RISF
numerical
timeseries
nlp
chebyshev_projection-not_fixed-max_n
RISF
numerical
timeseries
nlp


#### Influence of using different distance functions

In [3]:
# Datasets per data type for the experiments below. Commented-out entries are
# data types currently switched off; re-enable a line to include that type.
DATASETS = dict(
    # numerical=["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    graph=["COX2", "BZR", "DHFR"],
    # binary=["ad_nominal"],
    # timeseries=["TwoLeadECG"],
    # nlp=["agnews_1"],
)

# Passed to experiment_risf_complex when comparing distance combinations.
OBJ_RATIO = 1

def get_distances_risf(X, data_type) -> np.ndarray:
    """Return the candidate distance objects to evaluate for ``data_type``.

    The result is an object-dtype NumPy array rather than a list so callers
    can pick a subset of the distances by indexing it with an array.

    Parameters
    ----------
    X
        Data of the dataset. Only used for ``"binary"``, where it is passed to
        ``LinDist``, ``Goodall3Dist`` and ``OFDist``.
    data_type : str
        One of ``"graph"``, ``"numerical"``, ``"binary"``, ``"nlp"`` or
        ``"timeseries"``.

    Returns
    -------
    np.ndarray
        One-dimensional array (``dtype=object``) of distance objects.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values. Previously the
        function fell through and returned ``None``.
    """
    if data_type == "graph":
        return np.array([GraphDist(IpsenMikhailov), GraphDist(PortraitDivergence), SelectiveDistance(BagOfWord_projection, 1, 1),
                    GraphDist(NetSimile), GraphDist(DegreeDivergence), GraphDist(NetLSD), NumericalReprDist()], dtype=object)
    if data_type == "numerical":
        return np.array([SelectiveDistance(manhattan_projection, 1, 3),
                        SelectiveDistance(euclidean_projection, 1, 3),
                        SelectiveDistance(cosine_projection, 2, 3),
                        SelectiveDistance(chebyshev_projection, 2, 3)], dtype=object)
    if data_type == "binary":
        return np.array([SelectiveDistance(dummy_projection, 1, 1), LinDist(X), Goodall3Dist(X), OFDist(X)], dtype=object)
    if data_type == "nlp":
        return np.array([SelectiveDistance(manhattan_projection, 1, 3),
                        SelectiveDistance(euclidean_projection, 1, 3),
                        SelectiveDistance(chebyshev_projection, 5, 10)], dtype=object)
    if data_type == "timeseries":
        return np.array([SelectiveDistance(manhattan_projection, 1, 3),
                        SelectiveDistance(euclidean_projection, 1, 3),
                        SelectiveDistance(chebyshev_projection, 5, 10),
                        DTWDist()], dtype=object)
    # Fail loudly instead of returning None, which only surfaces later as an
    # unrelated-looking error in the caller.
    raise ValueError(f"Unsupported data_type: {data_type!r}")

In [4]:
def get_shortcut(dist):
    """Return a four-character label for a distance object.

    The label is the first four characters of:
    - the projection function's name for a ``SelectiveDistance``,
    - the class name of the wrapped ``distance`` attribute for a ``GraphDist``,
    - the object's own class name otherwise.
    """
    if isinstance(dist, SelectiveDistance):
        named = dist.projection_func
    elif isinstance(dist, GraphDist):
        named = dist.distance.__class__
    else:
        named = dist.__class__
    return named.__name__[:4]

# Evaluate RISF for each selection of distances returned by
# get_binary_distances_choice and write one results CSV per data type.
for data_type, dataset_names in DATASETS.items():
    print(data_type)
    results_distance_comb = []
    for dataset_name in dataset_names:
        data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")
        distances = get_distances_risf(data["X"], data_type)
        for dist_to_use in get_binary_distances_choice(distances):
            dist = list(distances[dist_to_use])
            # Selections whose entries sum to more than 3 are skipped; presumably
            # dist_to_use is a 0/1 mask, i.e. at most three distances - TODO confirm.
            if dist_to_use.sum() > 3:
                continue

            if data_type not in ["graph", "binary", "timeseries"]:
                aucs = perform_experiment_simple("RISF", data, clf_kwargs={}, distances=[dist])
                dist_short = "_".join(x.projection_func.__name__[:4] for x in dist)
            else:
                # NOTE(review): the other call sites in this notebook call
                # experiment_risf_complex(data, distances, ratio, ...) without a
                # leading classifier name - confirm which signature is current.
                aucs = experiment_risf_complex("RISF", data, dist, OBJ_RATIO, clf_kwargs={})
                dist_short = "_".join(get_shortcut(x) for x in dist)

            results_distance_comb.extend(
                init_results(CLF, dataset_name, data_type, aucs, {"distances": dist_short})
            )

    results_frame = pd.DataFrame(results_distance_comb)
    results_frame.to_csv(f"../results/results_selected_distances_{data_type}.csv", index=False)


graph


In [7]:
# Default distances per data type, used by the experiments that follow.
def get_distances_risf(X, data_type):
    """Return the default list of distance objects for ``data_type``.

    This definition replaces the earlier function of the same name for every
    cell executed after it.

    Parameters
    ----------
    X
        Data of the dataset. Only used for ``"binary"``, where it is passed to
        ``LinDist`` and ``Goodall3Dist``.
    data_type : str
        One of ``"graph"``, ``"numerical"``, ``"binary"``, ``"nlp"`` or
        ``"timeseries"``.

    Returns
    -------
    list
        The distance objects to use for that data type.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values. Previously the
        function fell through and returned ``None``.
    """
    if data_type == "graph":
        return [GraphDist(PortraitDivergence), ManhattanDist(), GraphDist(DegreeDivergence), GraphDist(NetLSD), EuclideanDist()]
    if data_type == "numerical":
        return [SelectiveDistance(manhattan_projection, 1, 3), SelectiveDistance(euclidean_projection, 1, 3)]
    if data_type == "binary":
        return [SelectiveDistance(dummy_projection, 1, 1), LinDist(X), Goodall3Dist(X)]
    if data_type == "nlp":
        return [SelectiveDistance(manhattan_projection, 1, 3), SelectiveDistance(euclidean_projection, 1, 3),
                SelectiveDistance(chebyshev_projection, 5, 10)]
    if data_type == "timeseries":
        return [SelectiveDistance(manhattan_projection, 1, 3), SelectiveDistance(chebyshev_projection, 5, 10),
                DTWDist()]
    # Fail loudly instead of handing None to the experiment helpers.
    raise ValueError(f"Unsupported data_type: {data_type!r}")

#### Now test the influence of the selected-objects ratio

In [8]:
# Ratios handed to experiment_risf_complex, one experiment per value.
SELECTED_OBJ_RATIO = [0.1, 0.25, 0.5, 0.75, 1.0]
results_selected_obj = []

for data_type, dataset_names in DATASETS.items():
    # they don't use complex distances
    if data_type == "numerical" or data_type == "nlp":
        continue
    print(data_type)
    for dataset_name in dataset_names:
        # One selector per dataset, shared by all ratios of that dataset.
        objects_selector = ObjectsSelector()  # TBD
        for obj_ratio in SELECTED_OBJ_RATIO:
            # The dataset is loaded again for every ratio.
            data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")
            aucs = experiment_risf_complex(
                data,
                get_distances_risf(data["X"], data_type),
                obj_ratio,
                selection_func=objects_selector,
                clf_kwargs={},
            )
            results_selected_obj.extend(
                init_results(CLF, dataset_name, data_type, aucs, {"obj_ratio": obj_ratio})
            )

pd.DataFrame(results_selected_obj).to_csv("../results/results_selected_obj_Ratio.csv", index=False)

graph
binary
timeseries


#### Influence of the subsampling size (`max_samples`)

In [13]:
import warnings

# Ignore every warning from here on; the filter stays active for the rest of
# the session. A few very small datasets produce warnings because the requested
# max_samples does not change anything for them.
warnings.simplefilter("ignore")

# Subsample sizes to compare.
max_samples = np.array((64, 128, 256, 512))
test_one_parameter(DATASETS, get_distances_risf, param_name="max_samples", options=max_samples)

RISF
numerical
graph
binary
timeseries
nlp
IForest
numerical
graph
binary
timeseries
nlp


#### Influence of the number of trees (`n_estimators`)

In [8]:
# Forest sizes to compare: from a single tree up to 300, denser at the low end.
num_of_estimators = np.array((1, 5, 10, 25, 50, 100, 200, 300))
test_one_parameter(DATASETS, get_distances_risf, param_name="n_estimators", options=num_of_estimators)

RISF
numerical
graph
binary
timeseries
nlp
IForest
numerical
graph
binary
timeseries
nlp


In [5]:
import pickle

In [6]:

# Load the stored best distances. The context manager closes the file handle,
# which the bare open(...) call left open.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("../best_distances/ISF_NCI1.pickle", "rb") as best_distances_file:
    best_distances = pickle.load(best_distances_file)

In [11]:
# Judging by the dump of best_distances in this notebook, it maps a key to a
# list of (score, [distance objects]) tuples; this shows the distance objects
# of the first entry stored under key 0.
best_distances[0][0][1]

[<risf.distance.SelectiveDistance at 0x1fc1a9c5970>]

In [7]:
# Display the complete best_distances mapping.
best_distances

{0: [(0.5079333333333333,
   [<risf.distance_functions.GraphDist at 0x1f3d661faf0>]),
  (0.5023000000000001, [<risf.distance_functions.GraphDist at 0x1f3d661fb20>]),
  (0.49269999999999997,
   [<risf.distance_functions.NumericalReprDist at 0x1f3d661f760>]),
  (0.4726666666666666,
   [<risf.distance_functions.BagOfWordDist at 0x1f3d661f6a0>]),
  (0.4588666666666667,
   [<risf.distance_functions.GraphDist at 0x1f3d661f8b0>])],
 1: [(0.5318666666666666,
   [<risf.distance_functions.BagOfWordDist at 0x1f3d661f880>]),
  (0.5198666666666667, [<risf.distance_functions.GraphDist at 0x1f3d661fc40>]),
  (0.5077666666666666,
   [<risf.distance_functions.NumericalReprDist at 0x1f3d661fd60>]),
  (0.5018333333333334, [<risf.distance_functions.GraphDist at 0x1f3d661fe20>]),
  (0.43706666666666666,
   [<risf.distance_functions.GraphDist at 0x1f3d661ff10>])],
 2: [(0.5675, [<risf.distance_functions.GraphDist at 0x1f3d661ffd0>]),
  (0.5039333333333333,
   [<risf.distance_functions.BagOfWordDist at 0x1f3