# Sensitivity Analysis

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')
sys.path.insert(0, '../data')

from utils import (init_results, 
                            ObjectsSelector,
                            get_dataset,
                            perform_experiment_simple,
                            SEED,
                            PRECOMPUTED_DISTANCES_PATH,
                            get_binary_distances_choice,
                            experiment_risf_complex)

import numpy as np
from netrd.distance import NetSimile, PortraitDivergence, DegreeDivergence, IpsenMikhailov, NetLSD
from risf.distance_functions import *
from risf.distance import SelectiveDistance
from pathlib import Path
import pandas as pd

# Seed NumPy's global random generator with the project-wide SEED.
np.random.seed(SEED)

# Classifier label passed to init_results by the RISF-only experiment cells.
CLF = "RISF"
# parents=True also creates missing intermediate directories, so the first run
# on a fresh checkout does not stop with FileNotFoundError.
PRECOMPUTED_DISTANCES_PATH.mkdir(parents=True, exist_ok=True)

# Data-type name -> directory handed to get_dataset() for that data type.
# Note that the "binary" data type is read from the "categorical" folder.
DATA_DIRS = {
    "graph": Path("../data/graph"),
    "numerical" : Path("../data/numerical"),
    "binary" : Path("../data/categorical"),
    "timeseries": Path("../data/timeseries"),
    "nlp": Path("../data/adBench/NLP_by_RoBERTa/")
}

In [2]:
def test_one_parameter(datasets, distances_func, param_name=None, options=None, classifiers = ["IForest", "RISF"]):
    # To allow default parameters run
    if param_name is None:
        param_name="default"
    if options is None:
        options = [1]

    results_all = []
    for clf in classifiers:
        print(clf)
        for data_type in datasets.keys():
            print(data_type)
            for dataset_name in datasets[data_type]:
                for option in options:
                    clf_kwargs = dict() if param_name == "default" else {param_name: option}
                    
                    data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, clf)
                    
                    if param_name == "max_n":
                        clf_kwargs["max_n"] = option
                        distances = [[SelectiveDistance(manhattan_projection, 1, option)]]
                    else:
                        distances = distances_func(data["X"], data_type)
                    
                    if clf != "RISF":
                        aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs)
                    else:
                        if data_type in ["numerical", "binary", "nlp", "cv"]:
                            aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, distances = distances)
                        else:
                            if data_type == "timeseries":
                                data["X"] = data["X"].astype(object)        
                            aucs = experiment_risf_complex(data, distances, selected_obj_ratio=1, clf_kwargs=clf_kwargs)
                            
                    results = init_results(clf, dataset_name, data_type, aucs, clf_kwargs)
                    results_all.extend(results)

    pd.DataFrame(results_all).to_csv(f"../results/results_{param_name}.csv", index=False)


#### First, let's verify which vector size (`max_n`) works best for the numerical datasets

In [4]:
# Datasets used for the max_n sweep.
DATASETS = dict(
    numerical=["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    nlp=["agnews_1"],
)

# distances_func is None here: for param_name "max_n", test_one_parameter
# builds the SelectiveDistance itself from each option.
max_n = np.array([1, 2, 3, 4, 5])
test_one_parameter(DATASETS, None, param_name="max_n", options=max_n, classifiers=["RISF"])

RISF
numerical
nlp


#### Influence of using different distance functions

In [12]:
# Datasets for the distance-combination experiment. The commented-out data
# types are currently excluded from the run.
DATASETS = dict(
    # numerical=["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    # graph=["COX2", "BZR", "DHFR"],
    # binary=["ad_nominal"],
    timeseries=["TwoLeadECG"],
    nlp=["agnews_1"],
)

# Object ratio passed to experiment_risf_complex in the graph branch of the
# distance-combination loop.
OBJ_RATIO = 1

def get_distances_risf(X, data_type) -> np.ndarray:
    """Return the candidate distance objects for ``data_type`` as an object array.

    Args:
        X: Data matrix. Only ``X.shape[1]`` is read, and only for "numerical".
        data_type: One of "graph", "numerical", "binary", "nlp", "timeseries".

    Returns:
        A 1-D ``dtype=object`` array of distance objects. Any other
        ``data_type`` yields ``None`` (despite the annotation).

    Notes:
        * "binary": an earlier variant (jaccard with X.shape[1] // 2,
          euclidean/manhattan with 1, 3, chebyshev with X.shape[1] // 3) was
          flagged "should be definitely changed" and is replaced by the set below.
        * "nlp": an earlier variant sized every projection from X.shape[1]
          (// 2 for manhattan, euclidean, cosine; // 3 for chebyshev).
        * "timeseries": an earlier variant used DTWDist, EuclideanDist,
          ManhattanDist and ChebyshevDist. TODO from the author: check the
          SelectiveDistance set for this data type as well.
    """
    if data_type == "graph":
        graph_candidates = [
            GraphDist(IpsenMikhailov),
            GraphDist(PortraitDivergence),
            ManhattanDist(),
            GraphDist(NetSimile),
            GraphDist(DegreeDivergence),
            GraphDist(NetLSD),
            EuclideanDist(),
        ]
        return np.array(graph_candidates, dtype=object)

    if data_type == "numerical":
        n_columns = X.shape[1]
        half, third = n_columns // 2, n_columns // 3
        numerical_candidates = [
            SelectiveDistance(manhattan_projection, 1, 3),
            SelectiveDistance(euclidean_projection, 1, 3),
            SelectiveDistance(cosine_projection, half, half),
            SelectiveDistance(chebyshev_projection, third, third),
        ]
        return np.array(numerical_candidates, dtype=object)

    if data_type == "binary":
        binary_candidates = [
            SelectiveDistance(jaccard_projection, 1, 3),
            SelectiveDistance(manhattan_projection, 1, 1),
        ]
        return np.array(binary_candidates, dtype=object)

    # "nlp" and "timeseries" currently share exactly the same candidate set.
    if data_type in ("nlp", "timeseries"):
        shared_candidates = [
            SelectiveDistance(manhattan_projection, 1, 3),
            SelectiveDistance(euclidean_projection, 1, 3),
            SelectiveDistance(chebyshev_projection, 5, 10),
        ]
        return np.array(shared_candidates, dtype=object)

    return None

In [13]:
# Evaluate RISF on every distance subset proposed by get_binary_distances_choice
# and write one CSV of results per data type.
for data_type, dataset_names in DATASETS.items():
    results_distance_comb = []  # restarted for each data type, saved at the end of this iteration
    print(data_type)
    for dataset_name in dataset_names:
        data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")
        distances = get_distances_risf(data["X"], data_type)
        distances_to_use = get_binary_distances_choice(distances)
        for dist_to_use in distances_to_use:
            # dist_to_use indexes `distances` and is summed below
            # (presumably a boolean mask over the candidates; TODO confirm).
            dist = list(distances[dist_to_use])
            if dist_to_use.sum() > 3:
                continue  # combinations of more than three distances are skipped

            if data_type == "graph":
                aucs = experiment_risf_complex(data, dist, OBJ_RATIO, clf_kwargs={})
                # Label from the first four letters of each class name; for a
                # GraphDist the wrapped `.distance` object is used instead.
                name_sources = [x.distance if isinstance(x, GraphDist) else x for x in dist]
                dist_short = '_'.join([src.__class__.__name__[:4] for src in name_sources])
            else:
                aucs = perform_experiment_simple("RISF", data, clf_kwargs={}, distances=[dist])
                # Label from the first four letters of each projection function name.
                dist_short = '_'.join([x.projection_func.__name__[:4] for x in dist])

            results = init_results(CLF, dataset_name, data_type, aucs, {"distances": dist_short})
            results_distance_comb.extend(results)

    output_csv = f"../results/results_selected_distances_{data_type}.csv"
    pd.DataFrame(results_distance_comb).to_csv(output_csv, index=False)

# Previously a single combined file was written instead:
#pd.DataFrame(results_distance_comb).to_csv("../results/results_selected_distances.csv", index=False)


timeseries
nlp


In [2]:
# Default distances per data type (this redefinition replaces the get_distances_risf defined above)
def get_distances_risf(X, data_type):
    """Return the default distance objects for ``data_type``.

    Args:
        X: Data matrix. Only ``X.shape[1]`` is read, and only for the
            "numerical", "categorical"/"binary" and "nlp" data types.
        data_type: "graph", "numerical", "categorical" (or its alias
            "binary", the key used in DATA_DIRS), "nlp" or "timeseries".

    Returns:
        A 1-D ``dtype=object`` array of distance objects. "timeseries" returns
        a plain list, and an unrecognised ``data_type`` returns ``None``.
    """
    if data_type == "graph":
        # No trailing comma after the call: it used to wrap the array in a
        # 1-tuple, unlike every other branch.
        return np.array([GraphDist(IpsenMikhailov), GraphDist(PortraitDivergence), ManhattanDist(),
                    GraphDist(NetSimile), GraphDist(DegreeDivergence), GraphDist(NetLSD), EuclideanDist()], dtype=object)
    elif data_type == "numerical":
        return np.array([SelectiveDistance(manhattan_projection, 1, 3),
                        SelectiveDistance(euclidean_projection, 1, 3),
                        SelectiveDistance(cosine_projection, X.shape[1] // 2, X.shape[1] // 2),
                        SelectiveDistance(chebyshev_projection, X.shape[1] // 3, X.shape[1] // 3)], dtype=object)
    elif data_type in ("categorical", "binary"):
        # DATA_DIRS names this data type "binary" (stored in the categorical
        # folder), so both spellings select the same defaults.
        return np.array([SelectiveDistance(jaccard_projection, X.shape[1] // 2, X.shape[1] // 2),
                        SelectiveDistance(euclidean_projection, 1, 3),
                        SelectiveDistance(manhattan_projection, 1, 3)], dtype=object)
    elif data_type == "nlp":
        return np.array([SelectiveDistance(manhattan_projection, X.shape[1] // 2, X.shape[1] // 2),
                        SelectiveDistance(euclidean_projection, X.shape[1] // 2, X.shape[1] // 2),
                        SelectiveDistance(cosine_projection, X.shape[1] // 2,  X.shape[1] // 2),
                        SelectiveDistance(chebyshev_projection, X.shape[1] // 3, X.shape[1] // 3)], dtype=object)
    elif data_type == "timeseries":
        # Kept as a plain list, exactly as before.
        return [DTWDist(), EuclideanDist(), ManhattanDist(), ChebyshevDist()]
    # Unknown data types keep yielding None, as the implicit fall-through did.
    return None

#### Now test the influence of the selected-objects ratio

In [17]:
# Object ratios compared in this experiment.
SELECTED_OBJ_RATIO = [0.1, 0.25, 0.5, 0.75, 1.0]
results_selected_obj = []

for data_type, dataset_names in DATASETS.items():
    for dataset_name in dataset_names:
        objects_selector = ObjectsSelector() # TBD
        for obj_ratio in SELECTED_OBJ_RATIO:
            # Loaded again for every ratio, so the cast below always starts from fresh data.
            data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")

            # only graphs are inherently represented as objects
            needs_object_cast = data_type != "graph"
            if needs_object_cast:
                data["X"] = data["X"].astype(object)

            aucs = experiment_risf_complex(
                data,
                get_distances_risf(data["X"], data_type),
                obj_ratio,
                selection_func=objects_selector,
                clf_kwargs={},
            )
            results = init_results(CLF, dataset_name, data_type, aucs, {"obj_ratio": obj_ratio})
            results_selected_obj.extend(results)

# A single CSV collects the rows of all data types and ratios.
pd.DataFrame(results_selected_obj).to_csv("../results/results_selected_obj_Ratio.csv", index=False)

#### Influence of the subsampling size (`max_samples`)

In [5]:
import warnings
# Ignore warnings during this sweep: a few datasets are very small, and for them
# the requested max_samples exceeds the number of available samples.
warnings.filterwarnings(action="ignore")

max_samples = np.array([64, 128, 256, 512])
test_one_parameter(DATASETS, get_distances_risf, param_name="max_samples", options=max_samples)

IForest
graph
numerical


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


categorical
timeseries


  warn(
  warn(
  warn(


nlp
RISF
graph
numerical
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
categorical
timeseries
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
nlp


### Influence of the number of trees (`n_estimators`)

In [6]:
# Forest sizes to compare: from a single tree up to 200, with most values at the small end.
num_of_estimators = np.array([1, 5, 10, 25, 50, 100, 200])
test_one_parameter(DATASETS, get_distances_risf, param_name="n_estimators", options=num_of_estimators)

IForest
graph
numerical
categorical
timeseries
nlp
RISF
graph
numerical
categorical
timeseries
nlp
