# Sensitivity Analysis

Definitions of constants and of the function that tests parameters

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '../data')

from utils import (init_results, 
                            get_dataset,
                            perform_experiment_simple,
                            SEED,
                            N_REPEATED_HOLDOUT,
                            get_binary_distances_choice,
                            experiment_risf_complex,
                            ensure_existance,
                            check_precomputed_notebook)

check_precomputed_notebook()

import numpy as np
from netrd.distance import NetSimile, PortraitDivergence, DegreeDivergence, IpsenMikhailov, NetLSD
from risf.distance_functions import *
from risf.distance import SelectiveDistance
from pathlib import Path
import pandas as pd

# Seed NumPy's global random generator with the SEED shared through utils.
np.random.seed(SEED)

# Classifier label written into result rows by the cells that call init_results(CLF, ...).
CLF = "RISF"
# Output locations for the CSV files produced by this notebook.
RESULTS_DIR = Path("../results")
RESULTS_DIR_MAX_N = RESULTS_DIR / "max_n"
SELECTED_DISTANCES_DIR = RESULTS_DIR / "selected_distances"
# presumably creates any of these directories that are missing -- TODO confirm in utils.ensure_existance
ensure_existance([RESULTS_DIR, SELECTED_DISTANCES_DIR, RESULTS_DIR_MAX_N])

# Input directory per data type; looked up as DATA_DIRS[data_type] using the keys of the DATASETS dicts.
DATA_DIRS = {
    "graph": Path("../data/graph"),
    "numerical" : Path("../data/numerical"),
    "binary" : Path("../data/categorical"),  # note: the "binary" key points at the "categorical" folder
    "timeseries": Path("../data/timeseries"),
    "nlp": Path("../data/adBench/NLP_by_RoBERTa/")
}

In [6]:
def test_one_parameter(datasets, distances_func, param_name=None, options=None, classifiers=("RISF", "IForest"),
                        custom_file_name=None, selected_obj_ratio=1, n_holdouts=N_REPEATED_HOLDOUT):
    """Run every classifier on every dataset for each value of one parameter and save the AUCs as CSV.

    Parameters
    ----------
    datasets : dict
        Maps a data type (a key of ``DATA_DIRS``) to an iterable of dataset names.
    distances_func : callable
        In the regular mode it is called as ``distances_func(data["X"], data_type)`` and must
        return the distances handed to ``experiment_risf_complex``. When ``param_name``
        contains ``"fixed"`` it is instead a projection function that gets wrapped in a
        ``SelectiveDistance``.
    param_name : str, optional
        Name of the classifier keyword argument that is varied. ``None`` runs the classifiers
        with default arguments. A name containing ``"fixed"`` switches to the vector-size
        mode: ``"not_fixed"`` uses sizes from the projection minimum up to ``option``,
        otherwise exactly ``option``.
    options : iterable, optional
        Values to try. In the vector-size mode a float is interpreted as a fraction of the
        number of columns of ``data["X"]``. Defaults to a single dummy value.
    classifiers : iterable of str
        Classifier names. ``"RISF"`` and ``"ISF"`` are run through ``experiment_risf_complex``,
        everything else through ``perform_experiment_simple``.
    custom_file_name : str, optional
        File stem of the output CSV; defaults to ``param_name``.
    selected_obj_ratio : float
        Forwarded to ``experiment_risf_complex``.
    n_holdouts : int
        Forwarded to both experiment helpers.

    Returns
    -------
    None
        Results are written to ``RESULTS_DIR_MAX_N`` when the file stem contains ``"fixed"``,
        otherwise to ``RESULTS_DIR``.
    """
    # To allow default parameters run
    if param_name is None:
        param_name = "default"
    if options is None:
        options = [1]

    results_all = []
    for clf in classifiers:
        print(clf)
        for data_type in datasets.keys():
            print(data_type)
            for dataset_name in datasets[data_type]:
                for option in options:
                    # Built before `option` is converted below, so the stored value is the
                    # one the caller passed (e.g. the fraction, not the derived size).
                    clf_kwargs = dict() if param_name == "default" else {param_name: option}

                    # NOTE(review): the load does not depend on `option`; it is kept inside
                    # the loop in case the experiment helpers modify `data` -- confirm before hoisting.
                    data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, clf)
                    if "fixed" in param_name:
                        if isinstance(option, float):
                            # Fractions are relative to the number of columns of X.
                            option = int(option * data["X"].shape[1])

                        min_n = 1
                        if distances_func in [cosine_projection, chebyshev_projection]:
                            min_n = 2
                        # Skip sizes below the projection's minimum. Besides size 1 for the
                        # cosine/chebyshev projections this also covers fractions that
                        # truncate to 0 on narrow datasets.
                        if option < min_n:
                            continue

                        clf_kwargs["max_n"] = option
                        if "not_fixed" in param_name:
                            distances = [SelectiveDistance(distances_func, min_n, option)]
                        else:
                            distances = [SelectiveDistance(distances_func, option, option)]
                    else:
                        distances = distances_func(data["X"], data_type)

                    if clf not in ["RISF", "ISF"]:
                        aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, n_holdouts=n_holdouts)
                    else:
                        aucs = experiment_risf_complex(clf, data, distances, selected_obj_ratio=selected_obj_ratio,
                                                       clf_kwargs=clf_kwargs, n_holdouts=n_holdouts)

                    results = init_results(clf, dataset_name, data_type, aucs, clf_kwargs)
                    results_all.extend(results)

    file_name = param_name if custom_file_name is None else custom_file_name
    # Vector-size sweeps get their own sub-directory.
    target_dir = RESULTS_DIR_MAX_N if "fixed" in file_name else RESULTS_DIR
    pd.DataFrame(results_all).to_csv(target_dir / f"{file_name}.csv", index=False)


Determine the best vector size (`max_n`) for numerical, time-series and NLP datasets

In [4]:
# Datasets used for the vector-size sweep.
DATASETS = {
    "numerical": ["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    "timeseries": ["TwoLeadECG"],
    "nlp" : ["agnews_1"]
}
distances = [manhattan_projection, cosine_projection, chebyshev_projection]
fixed_size = [True, False]
# Integers are absolute vector sizes; floats are fractions of the number of features.
max_n = [1, 2, 3, 5, 0.25, 0.5, 0.75, 1.0]

# One run (and one CSV) per projection and per fixed / not-fixed mode.
for distance in distances:
    for fixed in fixed_size:
        mode_suffix = "fixed" if fixed else "not_fixed"
        param_name = f"{distance.__name__}-{mode_suffix}"
        print(param_name)
        test_one_parameter(DATASETS, distance, param_name, max_n, classifiers=["RISF"])

Test the influence of the selected-objects ratio. For this we shouldn't use `SelectiveDistance`, as it always computes projections on the fly. Distance selection is not important at this point.

In [3]:
def get_distances_risf(X, data_type):
    """Return the distances used by RISF for the selected-objects-ratio experiment.

    Parameters
    ----------
    X
        Dataset features; only read for ``"binary"`` data, where it is passed to
        ``LinDist`` and ``Goodall3Dist``.
    data_type : str
        One of ``"graph"``, ``"numerical"``, ``"binary"``, ``"nlp"``, ``"timeseries"``.

    Returns
    -------
    list
        Distance objects for the given data type.

    Raises
    ------
    ValueError
        If ``data_type`` is not supported. Without this the function would fall through
        and return ``None``, which only fails later inside the experiment.
    """
    if data_type == "graph":
        return [GraphDist(PortraitDivergence), ManhattanDist()]
    elif data_type == "numerical":
        return [ManhattanDist(), EuclideanDist()]
    elif data_type == "binary":
        return [LinDist(X), Goodall3Dist(X)]
    elif data_type == "nlp":
        return [ChebyshevDist(), ManhattanDist()]
    elif data_type == "timeseries":
        return [DTWDist()]
    raise ValueError(f"Unsupported data_type: {data_type!r}")

# Datasets for the selected-objects-ratio experiment, grouped by data type.
DATASETS = {
    "numerical": ["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    "graph":  ["COX2", "BZR", "DHFR"],
    "binary":  ["ad_nominal"],
    "timeseries": ["TwoLeadECG"],
    "nlp" : ["agnews_1"]
}

SELECTED_OBJ_RATIO = [0.1, 0.25, 0.5, 0.75, 1.0]
results_selected_obj = []

for data_type, dataset_names in DATASETS.items():
    print(data_type)
    for dataset_name in dataset_names:
        for obj_ratio in SELECTED_OBJ_RATIO:
            # Data and distances are rebuilt for every ratio.
            data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")
            distances_for_run = get_distances_risf(data["X"], data_type)

            aucs = experiment_risf_complex(
                "RISF",
                data,
                distances=distances_for_run,
                selected_obj_ratio=obj_ratio,
                clf_kwargs={},
            )
            results_selected_obj.extend(
                init_results(CLF, dataset_name, data_type, aucs, {"obj_ratio": obj_ratio})
            )

selected_obj_frame = pd.DataFrame(results_selected_obj)
selected_obj_frame.to_csv(RESULTS_DIR / "selected_obj_ratio.csv", index=False)

Influence of using different distance functions. We allow all distances to compete. Hyperparameters of `SelectiveDistance` were selected based on previous experiments. `OBJ_RATIO` was set based on previous experiments.

In [4]:
# Ratio of selected objects used by the experiments that follow.
OBJ_RATIO = 0.5

def get_distances_risf(X, data_type):
    """Return all candidate distances for a data type as a NumPy object array.

    Parameters
    ----------
    X
        Dataset features; only read for ``"binary"`` data, where it is passed to
        ``LinDist``, ``Goodall3Dist`` and ``OFDist``.
    data_type : str
        One of ``"graph"``, ``"numerical"``, ``"binary"``, ``"nlp"``, ``"timeseries"``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with ``dtype=object`` holding the distance objects.

    Raises
    ------
    ValueError
        If ``data_type`` is not supported. Without this the function would fall through
        and return ``None``, which only fails later inside the experiment.
    """
    if data_type == "graph":
        return np.array([GraphDist(IpsenMikhailov), GraphDist(PortraitDivergence), SelectiveDistance(BagOfWord_projection, 1, 1),
                    GraphDist(NetSimile), GraphDist(DegreeDivergence), GraphDist(NetLSD), NumericalReprDist()], dtype=object)
    elif data_type == "numerical":
        return np.array([SelectiveDistance(manhattan_projection, 1, 3),
                        SelectiveDistance(euclidean_projection, 1, 3),
                        SelectiveDistance(cosine_projection, 2, 3),
                        SelectiveDistance(chebyshev_projection, 2, 3)], dtype=object)
    elif data_type == "binary":
        return np.array([SelectiveDistance(dummy_projection,1,1), LinDist(X), Goodall3Dist(X), OFDist(X)], dtype=object)
    elif data_type == "nlp":
        return np.array([SelectiveDistance(manhattan_projection, 1, 3),
                        SelectiveDistance(euclidean_projection, 1, 3),
                        SelectiveDistance(chebyshev_projection, 5, 5),
                        SelectiveDistance(cosine_projection, 2, 2)], dtype=object)
    elif data_type == "timeseries":
        return np.array([SelectiveDistance(manhattan_projection, 1, 3),
                        SelectiveDistance(cosine_projection, 2, 2),
                        SelectiveDistance(chebyshev_projection, 2, 2),
                        DTWDist()], dtype=object)
    raise ValueError(f"Unsupported data_type: {data_type!r}")

Compute default IF scores for comparison

In [8]:
# Baseline: IForest with default arguments; the results file stem is "if_default".
test_one_parameter(
    DATASETS,
    get_distances_risf,
    param_name=None,
    options=None,
    classifiers=["IForest"],
    custom_file_name="if_default",
)

IForest
numerical
graph
binary
timeseries
nlp


Check all combinations of distances

In [10]:
def get_shortcut(dist):
    """Return a four-character label for a distance object.

    The label is the first four characters of a name: the projection function's name
    for a ``SelectiveDistance``, the class name of the wrapped ``distance`` attribute
    for a ``GraphDist``, and the object's own class name otherwise.
    """
    if isinstance(dist, SelectiveDistance):
        named = dist.projection_func
    elif isinstance(dist, GraphDist):
        named = dist.distance.__class__
    else:
        named = dist.__class__
    return named.__name__[:4]

for data_type, dataset_names in DATASETS.items():
    # Results are collected and saved separately for every data type.
    results_distance_comb = []
    print(data_type)
    for dataset_name in dataset_names:
        data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")
        distances = get_distances_risf(data["X"], data_type)
        distances_to_use = get_binary_distances_choice(distances)
        for dist_to_use in distances_to_use:
            dist = list(distances[dist_to_use])
            # Combinations of more than three distances are skipped.
            if dist_to_use.sum() > 3:
                continue

            uses_complex_experiment = data_type in ["graph", "binary", "timeseries"]
            if uses_complex_experiment:
                aucs = experiment_risf_complex("RISF", data, dist, OBJ_RATIO, clf_kwargs={})
                dist_short = '_'.join(map(get_shortcut, dist))
            else:
                aucs = perform_experiment_simple("RISF", data, clf_kwargs={}, distances=dist)
                dist_short = '_'.join(d.projection_func.__name__[:4] for d in dist)

            results_distance_comb.extend(
                init_results(CLF, dataset_name, data_type, aucs, {"distances": dist_short})
            )

    combination_csv = SELECTED_DISTANCES_DIR / f"{data_type}.csv"
    pd.DataFrame(results_distance_comb).to_csv(combination_csv, index=False)


numerical
graph
binary
timeseries
nlp


Influence of the subsampling size (`max_samples`). All other hyperparameters were set based on previous experiments.

In [3]:
def get_distances_risf(X, data_type):
    """Return the distances used by RISF in the remaining sensitivity experiments.

    Parameters
    ----------
    X
        Dataset features; only read for ``"binary"`` data, where it is passed to
        ``LinDist`` and ``Goodall3Dist``.
    data_type : str
        One of ``"graph"``, ``"numerical"``, ``"binary"``, ``"nlp"``, ``"timeseries"``.

    Returns
    -------
    list
        Distance objects for the given data type.

    Raises
    ------
    ValueError
        If ``data_type`` is not supported. Without this the function would fall through
        and return ``None``, which only fails later inside the experiment.
    """
    if data_type == "graph":
        return [GraphDist(PortraitDivergence), ManhattanDist(), GraphDist(DegreeDivergence), GraphDist(NetLSD), EuclideanDist()]
    elif data_type == "numerical":
        return [SelectiveDistance(manhattan_projection, 1, 3), SelectiveDistance(euclidean_projection, 1, 3)]
    elif data_type == "binary":
        return [SelectiveDistance(dummy_projection,1,1), LinDist(X), Goodall3Dist(X)]
    elif data_type == "nlp":
        return [SelectiveDistance(manhattan_projection, 1, 3),SelectiveDistance(euclidean_projection, 1, 3),
                SelectiveDistance(chebyshev_projection, 5, 5)]
    elif data_type == "timeseries":
        return [SelectiveDistance(manhattan_projection, 1, 3), SelectiveDistance(chebyshev_projection, 2, 2),
                DTWDist()]
    raise ValueError(f"Unsupported data_type: {data_type!r}")

In [13]:
import warnings

# Silence all warnings from here on: for a few very small datasets the max_samples
# setting changes nothing, and the resulting warnings produced a ton of output.
warnings.filterwarnings("ignore")

max_samples = np.array([64, 128, 256, 512])
test_one_parameter(
    DATASETS,
    get_distances_risf,
    "max_samples",
    max_samples,
    selected_obj_ratio=OBJ_RATIO,
)

RISF
numerical
graph
binary
timeseries
nlp
IForest
numerical
graph
binary
timeseries
nlp


Influence of number of trees

In [14]:
# Numbers of trees to evaluate.
num_of_estimators = np.array([1, 5, 10, 25, 50, 100, 200, 300])
test_one_parameter(
    DATASETS,
    get_distances_risf,
    "n_estimators",
    num_of_estimators,
    selected_obj_ratio=OBJ_RATIO,
)

RISF
numerical
graph
binary
timeseries
nlp
IForest
numerical
graph
binary
timeseries
nlp


Run all algorithms with default values to see when the mean AUC converges.

In [1]:
def append_outcomes(source, dest):
    """Append the data rows of the CSV file ``source`` to the CSV file ``dest``.

    The first line of ``source`` is treated as its header and is not copied, except
    when ``dest`` is missing or empty: then the header is written first so that
    ``dest`` is still a CSV with a header row. If ``dest`` does not end with a line
    break, one is inserted so the first appended row does not get glued onto the
    last existing line.

    Parameters
    ----------
    source : str or path-like
        CSV file to read; must exist.
    dest : str or path-like
        CSV file to append to; created when missing.
    """
    import os  # local import so the function does not rely on other notebook cells

    dest_size = os.path.getsize(dest) if os.path.exists(dest) else 0
    needs_line_break = False
    if dest_size:
        # Look at the last byte only; no need to read the whole file.
        with open(dest, 'rb') as existing:
            existing.seek(-1, os.SEEK_END)
            needs_line_break = existing.read(1) not in (b"\n", b"\r")

    with open(source, 'r') as src, open(dest, 'a') as out:
        header = src.readline()
        if not dest_size:
            out.write(header)
        elif needs_line_break:
            out.write("\n")
        for line in src:
            out.write(line)

In [9]:
# Default-parameter run of every classifier with 20 holdouts.
test_one_parameter(
    DATASETS,
    get_distances_risf,
    param_name=None,
    options=None,
    classifiers=["RISF", "LOF", "IForest", "ECOD", "HBOS"],
    custom_file_name="20holdouts_sensitivity_datasets",
    n_holdouts=20,
    selected_obj_ratio=OBJ_RATIO,
)

# Since ISF is implemented as RISF we need to rerun the experiment with just ISF

def get_distances_isf(X, data_type):
    """Return the single distance used for the ISF run of each data type.

    Parameters
    ----------
    X
        Dataset features; only read for ``"binary"`` data, where it is passed to ``LinDist``.
    data_type : str
        One of ``"graph"``, ``"numerical"``, ``"binary"``, ``"nlp"``, ``"timeseries"``.

    Returns
    -------
    list
        A one-element list with the distance object.

    Raises
    ------
    ValueError
        If ``data_type`` is not supported. Without this the function would fall through
        and return ``None``, which only fails later inside the experiment.
    """
    if data_type == "graph":
        return [GraphDist(PortraitDivergence)]
    elif data_type == "numerical":
        return [ManhattanDist()]
    elif data_type == "binary":
        return [LinDist(X)]
    elif data_type == "nlp":
        return [ChebyshevDist()]
    elif data_type == "timeseries":
        return [DTWDist()]
    raise ValueError(f"Unsupported data_type: {data_type!r}")

isf_csv = RESULTS_DIR / "20holdouts_sensitivity_datasets_isf.csv"
combined_csv = RESULTS_DIR / "20holdouts_sensitivity_datasets.csv"

# ISF is run on its own, writing to a temporary CSV.
test_one_parameter(
    DATASETS,
    get_distances_isf,
    param_name=None,
    options=None,
    classifiers=["ISF"],
    custom_file_name="20holdouts_sensitivity_datasets_isf",
    n_holdouts=20,
    selected_obj_ratio=1,
)

# Merge the ISF rows into the combined file, then remove the temporary CSV.
append_outcomes(isf_csv, combined_csv)
isf_csv.unlink()

RISF
numerical
graph
binary
timeseries
nlp
LOF
numerical
graph
binary
timeseries
nlp
IForest
numerical
graph
binary
timeseries
nlp
ECOD
numerical
graph
binary
timeseries
nlp
HBOS
numerical
graph
binary
timeseries
nlp
ISF
numerical
graph
binary
timeseries
nlp


Run RISF in dummy-projection mode to see how much the implementation alone can make the outcomes differ (the default IForest results are appended for comparison)

In [11]:
file_name = "numerical_dummy_dist"
dummy_csv = RESULTS_DIR / f"{file_name}.csv"

# RISF with a single dummy-projection distance, regardless of dataset or data type.
test_one_parameter(
    DATASETS,
    lambda _X, _data_type: [SelectiveDistance(dummy_projection, 1, 1)],
    param_name=None,
    options=None,
    classifiers=["RISF"],
    custom_file_name=file_name,
)

# Add the default IForest rows to the same file.
append_outcomes(RESULTS_DIR / "if_default.csv", dummy_csv)