# Sensitivity Analysis

In [7]:
import sys
sys.path.insert(0, '..')
sys.path.insert(0, '../data')


from utils import (init_results, 
                            ObjectsSelector,
                            get_dataset,
                            perform_experiment_simple,
                            SEED,
                            PRECOMPUTED_DISTANCES_PATH,
                            get_binary_distances_choice,
                            experiment_risf_complex)

import numpy as np
from netrd.distance import NetSimile, PortraitDivergence, DegreeDivergence, IpsenMikhailov,  JaccardDistance, NetLSD, OnionDivergence
from risf.distance_functions import GraphDist, manhattan_projection, jaccard_projection, cosine_projection, euclidean_projection, DTWDist, EuclideanDist, ManhattanDist
from risf.distance import SelectiveDistance
from pathlib import Path
from tqdm.auto import tqdm
import pandas as pd

%load_ext autoreload
%autoreload 2

np.random.seed(SEED)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Classifier label passed to `init_results` by the stand-alone experiment loops further down.
CLF = "RISF"

# Create the precomputed-distances directory if it is missing.
# NOTE(review): `parents` is not set, so the parent directory must already exist.
PRECOMPUTED_DISTANCES_PATH.mkdir(exist_ok=True)

# Data type -> names of the datasets evaluated for that type.
# The insertion order is the order in which the experiments iterate.
DATASETS = {
    "graph":  ["COX2"],
    "numerical": ["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    "categorical":  ["ad_nominal"],
    "timeseries": ["TwoLeadECG"],
    "nlp" : ["agnews_1"]
}

# Data type -> directory handed to `get_dataset` for that type.
DATA_DIRS = {
    "graph": Path("../data/graph"),
    "numerical" : Path("../data/numerical"),
    "categorical" : Path("../data/categorical"),
    "timeseries": Path("../data/timeseries"),
    "nlp": Path("../data/adBench/NLP_by_RoBERTa/")
}

# Data type -> distance functions handed to the RISF experiments.
# Entries built from `SelectiveDistance` are wrapped in an extra list, the others are flat lists.
# NOTE(review): "cv" has no matching entry in DATASETS or DATA_DIRS in this cell.
# NOTE(review): the two numeric arguments of `SelectiveDistance` are not explained here;
# the last one appears to be `max_n` (see the "max_n" sweep) - confirm against risf.distance.
DISTANCES = {
    "graph": [GraphDist(IpsenMikhailov)],
    "numerical": [[SelectiveDistance(manhattan_projection, 1, 3)]],
    "categorical": [[SelectiveDistance(jaccard_projection, 10, 10)]],
    "nlp": [[SelectiveDistance(cosine_projection, 400, 400)]],
    "cv": [[SelectiveDistance(cosine_projection, 400, 400)]],
    "timeseries": [DTWDist(), EuclideanDist()]
}

In [12]:
def test_one_parameter(param_name=None, options=None, classifiers = ["IForest", "RISF"]):
    # To allow default parameters run
    if param_name is None:
        param_name="default"
    if options is None:
        options = [1]

    results_all = []
    for clf in classifiers:
        print(clf)
        for data_type in DATASETS.keys():
            print(data_type)
            for dataset_name in DATASETS[data_type]:
                for option in options:
                    clf_kwargs = dict() if param_name == "default" else {param_name: option}
                    
                    data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, clf)
                    
                    if param_name == "max_n":
                        clf_kwargs["max_n"] = option
                        distances = [[SelectiveDistance(manhattan_projection, 1, option)]]
                    else:
                        distances = DISTANCES[data_type]
                    
                    if clf != "RISF":
                        aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs)
                    else:
                        if data_type in ["numerical", "categorical", "nlp", "cv"]:
                            aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, distances = distances)
                        else:
                            if data_type == "timeseries":
                                data["X"] = data["X"].astype(object)        
                            aucs = experiment_risf_complex(data, distances, selected_obj_ratio=1, clf_kwargs=clf_kwargs)
                            
                    results = init_results(clf, dataset_name, data_type, aucs, clf_kwargs)
                    results_all.extend(results)

    pd.DataFrame(results_all).to_csv(f"../results/results_{param_name}_graph.csv", index=False)


#### First, let's look at the performance of IF, ISF and RISF with their default parameters on the data

In [3]:
# Baseline: no parameter name or options are given, so each classifier runs once per
# dataset with no extra keyword arguments.
test_one_parameter(classifiers = ["IForest", "RISF", "ISF"])

NameError: name 'test_one_parameter' is not defined

#### Influence of the subsampling size (`max_samples`)

In [5]:
# Subsample sizes to try; each value is passed to the classifiers as `max_samples`.
max_samples = np.array([64, 128, 256, 512])
test_one_parameter("max_samples", max_samples)

IForest
graph
numerical


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


categorical
timeseries


  warn(
  warn(
  warn(


nlp
RISF
graph
numerical
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
categorical
timeseries
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
nlp


### Influence of number of trees

In [6]:
# Numbers of trees to try, from a single tree up to 200; most values are at the small end.
num_of_estimators = np.array([1, 5, 10, 25, 50, 100, 200])
test_one_parameter("n_estimators", num_of_estimators) 

IForest
graph
numerical
categorical
timeseries
nlp
RISF
graph
numerical
categorical
timeseries
nlp


### Now experiments on parameters connected strictly with RISF

First, an experiment on how `max_n` influences the scores obtained with the selective distance (the code below builds it on `manhattan_projection`).

In [8]:
# Restrict the sweep to the numerical and NLP datasets.
# NOTE: this rebinds the global DATASETS that `test_one_parameter` reads, so any cell
# executed afterwards sees this reduced set until DATASETS is assigned again.
DATASETS = {
    "numerical": ["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    "nlp" : ["agnews_1"]
}

# Values of `max_n` to try; only RISF is evaluated here.
max_n = np.array([1,2,3,4,5])
test_one_parameter("max_n", max_n, classifiers=["RISF"]) 

RISF
numerical
nlp


#### Influence of the ratio of selected objects

In [2]:
# Datasets for the selected-objects experiment.
# NOTE: this rebinds the global DATASETS again; the graph dataset is now "IMDB-BINARY".
DATASETS = {
    "graph":  ["IMDB-BINARY"],
    "numerical": ["21_Lymphography", "36_speech", "6_cardio", "26_optdigits"],
    "categorical":  ["ad_nominal"],
    "timeseries": ["TwoLeadECG"],
    "nlp" : ["agnews_1"]
}

# One distance per data type for this experiment.
# NOTE: this rebinds the global DISTANCES; the values are flat lists here (no SelectiveDistance).
DISTANCES = {
    "graph": [GraphDist(IpsenMikhailov)],
    "numerical" : [EuclideanDist()],
    "categorical":[EuclideanDist()],
    "timeseries": [DTWDist()],
    "nlp": [EuclideanDist()],
}

# Values passed as the selected-objects ratio to `experiment_risf_complex`.
SELECTED_OBJ_RATIO = [0.1, 0.25, 0.5, 0.75, 1.0]


In [17]:
# Run RISF for every data type, dataset and selected-objects ratio, collecting one
# batch of result rows per run and writing them all to a single CSV at the end.
results_selected_obj = []

for data_type, distances_for_type in DISTANCES.items():
    # only graphs are inherently represented as objects
    needs_object_cast = data_type != "graph"

    for dataset_name in DATASETS[data_type]:
        # One selector instance is shared by all ratios of the same dataset.
        objects_selector = ObjectsSelector()  # TBD

        for obj_ratio in SELECTED_OBJ_RATIO:
            # The dataset is loaded again for every ratio.
            data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")
            if needs_object_cast:
                data["X"] = data["X"].astype(object)

            aucs = experiment_risf_complex(
                data,
                distances_for_type,
                obj_ratio,
                selection_func=objects_selector,
                clf_kwargs={},
            )
            results = init_results(
                CLF, dataset_name, data_type, aucs, {"obj_ratio": obj_ratio}
            )
            results_selected_obj.extend(results)

pd.DataFrame(results_selected_obj).to_csv("../results/results_selected_obj_Ratio.csv", index=False)

#### Influence of using different distance functions

For graphs, I ran one experiment that used every available distance function. However, the following distance functions performed the worst on graphs:
* Jaccard
* Onion

Therefore, I do not consider them here at all.

In [8]:
OBJ_RATIO = 1 # Based on previous experiment


# Candidate distances per data type; subsets of each array are evaluated in the next cell.
# Object arrays are used so a boolean mask can select a subset.
# NOTE: this rebinds the global DISTANCES once more. Only "graph" is active;
# the commented-out entries are the candidates for the other data types.
DISTANCES = {
    # "categorical": np.array([SelectiveDistance(jaccard_projection, 1, 4) , SelectiveDistance(euclidean_projection,1,2),
    #                        SelectiveDistance(euclidean_projection,1,2)], dtype=object),
   "graph": np.array([GraphDist(IpsenMikhailov), GraphDist(PortraitDivergence), ManhattanDist(),
                    GraphDist(NetSimile), GraphDist(DegreeDivergence), GraphDist(NetLSD), EuclideanDist()], dtype=object),
#    "numerical" : np.array([SelectiveDistance(manhattan_projection, 1, 3),
#                            SelectiveDistance(euclidean_projection, 1, 3), SelectiveDistance(cosine_projection, 2, 3)], dtype=object),
#    "nlp": np.array([SelectiveDistance(manhattan_projection, 1, 3),
#                    SelectiveDistance(euclidean_projection, 1, 3), SelectiveDistance(cosine_projection, 2, 3) ], dtype=object)
}

In [15]:
# Evaluate RISF on subsets of the candidate distances (at most three per run) and
# store one batch of result rows per subset, labelled by a short name of the subset.
results_distance_comb = []

for data_type, candidate_distances in DISTANCES.items():
    distances_to_use = get_binary_distances_choice(candidate_distances)
    print(data_type)

    for dataset_name in DATASETS[data_type]:
        data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")

        for dist_to_use in distances_to_use:
            # Combinations of more than three distances are not evaluated.
            if dist_to_use.sum() > 3:
                continue
            dist = list(candidate_distances[dist_to_use])

            if data_type == "graph":
                aucs = experiment_risf_complex(data, dist, OBJ_RATIO, clf_kwargs={})
                # GraphDist wraps the actual distance object, so name it after the wrapped one.
                name_parts = []
                for x in dist:
                    named = x.distance if isinstance(x, GraphDist) else x
                    name_parts.append(named.__class__.__name__[:4])
                dist_short = '_'.join(name_parts)
                print(dist_short)
            else:
                aucs = perform_experiment_simple("RISF", data, clf_kwargs={}, distances=[dist])
                dist_short = '_'.join(x.projection_func.__name__[:4] for x in dist)

            results = init_results(CLF, dataset_name, data_type, aucs, {"distances": dist_short})
            results_distance_comb.extend(results)

pd.DataFrame(results_distance_comb).to_csv("../results/results_selected_distances.csv", index=False)


graph
Ipse_Port_Manh
Ipse_Port_NetS
Ipse_Port_Degr
Ipse_Port_NetL
Ipse_Port_Eucl
Ipse_Port
Ipse_Manh_NetS
Ipse_Manh_Degr
Ipse_Manh_NetL
Ipse_Manh_Eucl
Ipse_Manh
Ipse_NetS_Degr
Ipse_NetS_NetL
Ipse_NetS_Eucl
Ipse_NetS
Ipse_Degr_NetL
Ipse_Degr_Eucl
Ipse_Degr
Ipse_NetL_Eucl
Ipse_NetL
Ipse_Eucl
Ipse
Port_Manh_NetS
Port_Manh_Degr
Port_Manh_NetL
Port_Manh_Eucl
Port_Manh
Port_NetS_Degr
Port_NetS_NetL
Port_NetS_Eucl
Port_NetS
Port_Degr_NetL
Port_Degr_Eucl
Port_Degr
Port_NetL_Eucl
Port_NetL
Port_Eucl
Port
Manh_NetS_Degr
Manh_NetS_NetL
Manh_NetS_Eucl
Manh_NetS
Manh_Degr_NetL
Manh_Degr_Eucl
Manh_Degr
Manh_NetL_Eucl
Manh_NetL
Manh_Eucl
Manh
NetS_Degr_NetL
NetS_Degr_Eucl
NetS_Degr
NetS_NetL_Eucl
NetS_NetL
NetS_Eucl
NetS
Degr_NetL_Eucl
Degr_NetL
Degr_Eucl
Degr
NetL_Eucl
NetL
Eucl


In [16]:
# Load the saved distance-combination results.
# NOTE(review): the cell above writes "results_selected_distances.csv", not the "_cox2" file
# read here - presumably the file was renamed by hand after a COX2 run; confirm before re-running.
df = pd.read_csv("../results/results_selected_distances_cox2.csv")

In [17]:
# Mean AUC per distance combination, best first, keeping at most the top 20.
mean_ = df.groupby("distances").auc.mean().sort_values(ascending=False).head(20)

In [18]:
# Standard deviation of the AUC for the same combinations, in the same order as `mean_`.
# NOTE: despite its name, `var` holds the standard deviation, not the variance.
var = df.groupby("distances").auc.std()[mean_.index]

In [19]:
# Put mean and standard deviation side by side, one row per distance combination.
# NOTE: this rebinds `df`, replacing the raw results loaded earlier; the cells that compute
# `mean_` and `var` cannot be re-run without reloading the CSV first.
df = pd.DataFrame([mean_, var]).T
df.columns =["mean", "std"]
df

Unnamed: 0_level_0,mean,std
distances,Unnamed: 1_level_1,Unnamed: 2_level_1
Port_Manh_Eucl,0.6491,0.123916
Manh_NetL_Eucl,0.6303,0.060009
NetL_Eucl,0.613633,0.091566
Manh_NetL,0.6079,0.066284
Port_Manh,0.590933,0.159616
Port_Manh_NetL,0.583633,0.117032
Port_NetL_Eucl,0.573667,0.010492
Eucl,0.5697,0.024233
Manh_Eucl,0.569367,0.132899
Port_NetL,0.5676,0.086473
