#### Experiment with:
* Max_depth
* n_estimators
* selected_objects for distance calculation.

# Sensitivity Analysis

On AWS you get SSH and console access. Use tmux.

Spot instances are preferred over on-demand instances.

Check how much a given machine costs in each region, e.g. US East (Ohio) or Europe (Ireland).

In [1]:
import sys
sys.path.insert(0, '..')
sys.path.insert(0, '../data')


from utils import (init_results, 
                            ObjectsSelector,
                            get_dataset,
                            perform_experiment_simple,
                            SEED,
                            PRECOMPUTED_DISTANCES_PATH,
                            get_binary_distances_choice,
                            experiment_risf_complex)

import numpy as np
from risf.distance_functions import *
from risf.distance import SelectiveDistance
from pathlib import Path
from tqdm.auto import tqdm
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

# Hex colour codes of the Okabe-Ito colour-blind-friendly palette, for plots.
OKABE_ITO_SCALE = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]
# IPython autoreload extension, mode 2: re-import modules before running a cell,
# so edits to the project's .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Seed NumPy's global random state with the SEED constant imported from utils.
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Classifier label passed to init_results by the RISF-only experiment loops.
CLF = "RISF"

# Create the directory for precomputed distances if it is missing.
PRECOMPUTED_DISTANCES_PATH.mkdir(exist_ok=True)

# Dataset names to evaluate, grouped by data type.
DATASETS = {
    "graph": ["p53"],
    "numerical": [
        "21_Lymphography",
        "36_speech",
        "6_cardio",
        "26_optdigits",
    ],
    "categorical": ["ad_nominal"],
    "timeseries": ["TwoLeadECG"],
    "nlp": ["agnews_1"],
}

# Directory handed to get_dataset for each data type.
DATA_DIRS = {
    "graph": Path("../data/graph"),
    "numerical": Path("../data/numerical"),
    "categorical": Path("../data/categorical"),
    "timeseries": Path("../data/timeseries"),
    "nlp": Path("../data/adBench/NLP_by_RoBERTa/"),
}


def distance(x, y):
    """Return the dot product of ``x`` and ``y`` as computed by ``np.dot``.

    NOTE(review): a dot product is not a metric (``np.dot(x, x)`` is generally
    non-zero) -- confirm that using it as a "distance" is intended.
    """
    dot_product = np.dot(x, y)
    return dot_product

# Distance functions per data type. test_one_parameter passes the entry
# unchanged either to perform_experiment_simple (numerical, categorical, nlp,
# cv) or to experiment_risf_complex (every other type).
DISTANCES = {
    # Make sure to check that the computed distances do not contain many NaNs!
    "graph": [
        PortraitDivergenceDist(),
        IpsenMikailovDist(),
    ],
    "numerical": [[SelectiveDistance(manhattan_projection, 1, 2)]],
    "categorical": [[SelectiveDistance(jaccard_projection, 5, 10)]],
    "nlp": [[SelectiveDistance(cosine_projection, 50, 100)]],
    # No "cv" entry exists in DATASETS / DATA_DIRS in the cells shown here.
    "cv": [[SelectiveDistance(cosine_projection, 50, 100)]],
    "timeseries": [
        DTWDist(),
        distance,
    ],
}

In [3]:
def test_one_parameter(param_name=None, options=None, classifiers=("IForest", "RISF")):
    """Sweep one classifier parameter over every dataset listed in ``DATASETS``.

    One experiment is run per (classifier, data type, dataset, option). The
    rows produced by ``init_results`` are collected and written to
    ``../results/results_{param_name}.csv``.

    ``DATASETS``, ``DATA_DIRS`` and ``DISTANCES`` are read from module scope at
    call time, so reassigning them between calls changes what is evaluated.

    Parameters
    ----------
    param_name : str, optional
        Name of the classifier keyword argument to vary. ``None`` runs each
        classifier with empty ``clf_kwargs`` and stores the results under the
        name ``"default"``. For ``"max_n"`` the option value is additionally
        used as the third argument of a ``SelectiveDistance`` built on
        ``manhattan_projection`` (for every data type), which replaces the
        ``DISTANCES`` entry.
    options : iterable, optional
        Values to try for ``param_name``. Defaults to ``[1]``.
    classifiers : iterable of str or str, optional
        Classifier names to evaluate. A single string is treated as one name.

    Returns
    -------
    None
    """
    # To allow default parameters run
    if param_name is None:
        param_name = "default"
    if options is None:
        options = [1]
    # A bare string would otherwise be iterated character by character.
    if isinstance(classifiers, str):
        classifiers = [classifiers]

    # Create the output directory up front so that a missing folder is not
    # discovered only after all experiments have finished.
    output_path = Path(f"../results/results_{param_name}.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    results_all = []
    for clf in classifiers:
        print(clf)
        for data_type in DATASETS.keys():
            print(data_type)
            for dataset_name in DATASETS[data_type]:
                for option in options:
                    # For "max_n" this already carries {"max_n": option}.
                    clf_kwargs = dict() if param_name == "default" else {param_name: option}

                    data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, clf)

                    if param_name == "max_n":
                        distances = [[SelectiveDistance(manhattan_projection, 1, option)]]
                    else:
                        distances = DISTANCES[data_type]

                    if clf != "RISF":
                        # Non-RISF classifiers are run without distance functions.
                        aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs)
                    elif data_type in ["numerical", "categorical", "nlp", "cv"]:
                        aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, distances=distances)
                    else:
                        if data_type == "timeseries":
                            data["X"] = data["X"].astype(object)
                        aucs = experiment_risf_complex(data, distances, selected_obj_ratio=1, clf_kwargs=clf_kwargs)

                    results = init_results(clf, dataset_name, data_type, aucs, clf_kwargs)
                    results_all.extend(results)

    pd.DataFrame(results_all).to_csv(output_path, index=False)


#### First, let's look at how IF, ISF and RISF perform on the data with their default parameters

In [4]:
# No parameter is swept here, so each classifier runs with empty clf_kwargs.
test_one_parameter(classifiers=["IForest", "RISF", "ISF"])

IForest
graph
numerical
categorical
timeseries
nlp
RISF
graph
numerical
categorical
timeseries
nlp
ISF
graph
numerical
categorical
timeseries
nlp


#### Influence of the subsampling size (`max_samples`)

In [5]:
# Values of `max_samples` to sweep. The recorded output shows RISF reporting
# that some values exceed the number of samples available in a dataset.
max_samples = np.array([64, 128, 256, 512])
test_one_parameter(param_name="max_samples", options=max_samples)

IForest
graph
numerical


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


categorical
timeseries


  warn(
  warn(
  warn(


nlp
RISF
graph
numerical
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
categorical
timeseries
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
max sample is bigger than number of sample selecting n_samples
nlp


### Influence of number of trees

In [6]:
# Values of `n_estimators` to sweep. Original note: start from 1 to 100, in
# general smaller numbers (the list below currently also includes 200).
num_of_estimators = np.array([1, 5, 10, 25, 50, 100, 200])
test_one_parameter(param_name="n_estimators", options=num_of_estimators)

IForest
graph
numerical
categorical
timeseries
nlp
RISF
graph
numerical
categorical
timeseries
nlp


### Experiments on parameters specific to RISF

First, an experiment on how `max_n` influences the scores obtained with the selective distance (`test_one_parameter` uses the Manhattan projection for this sweep).

In [8]:
# Overwrites the module-level DATASETS that test_one_parameter reads, so this
# sweep (and any cell re-run afterwards, until DATASETS is redefined) only
# covers the numerical and nlp datasets.
DATASETS = {
    "numerical": [
        "21_Lymphography",
        "36_speech",
        "6_cardio",
        "26_optdigits",
    ],
    "nlp": ["agnews_1"],
}

# Values 1..5 for the `max_n` sweep, RISF only.
max_n = np.arange(1, 6)
test_one_parameter(param_name="max_n", options=max_n, classifiers=["RISF"])

RISF
numerical
nlp


Influence of the selected-objects ratio (`obj_ratio`)

In [9]:
def distance(x, y):
    """Return the dot product of ``x`` and ``y`` as computed by ``np.dot``.

    NOTE(review): a dot product is not a metric (``np.dot(x, x)`` is generally
    non-zero) -- confirm that using it as a "distance" is intended.
    """
    dot_product = np.dot(x, y)
    return dot_product

# Full dataset selection again (all five data types).
DATASETS = {
    "graph": ["p53"],
    "numerical": [
        "21_Lymphography",
        "36_speech",
        "6_cardio",
        "26_optdigits",
    ],
    "categorical": ["ad_nominal"],
    "timeseries": ["TwoLeadECG"],
    "nlp": ["agnews_1"],
}

# One distance function per data type for the selected-objects experiment.
DISTANCES = {
    "graph": [IpsenMikailovDist()],
    "numerical": [distance],
    "categorical": [distance],
    "timeseries": [DTWDist()],
    "nlp": [distance],
}

# Ratios handed to experiment_risf_complex, one run per value.
SELECTED_OBJ_RATIO = [0.1, 0.25, 0.5, 0.75, 1.0]


In [10]:
# Run experiment_risf_complex for every data type in DISTANCES, every dataset
# of that type and every ratio in SELECTED_OBJ_RATIO; all result rows are
# gathered in one list and written to a single CSV at the end.
results_selected_obj = []

for data_type, type_distances in DISTANCES.items():
    # only graphs are inherently represented as objects
    cast_to_object = data_type != "graph"
    for dataset_name in DATASETS[data_type]:
        objects_selector = ObjectsSelector()  # TBD
        for obj_ratio in SELECTED_OBJ_RATIO:
            data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")
            if cast_to_object:
                data["X"] = data["X"].astype(object)

            aucs = experiment_risf_complex(
                data,
                type_distances,
                obj_ratio,
                selection_func=objects_selector,
                clf_kwargs={},
            )
            results = init_results(
                CLF, dataset_name, data_type, aucs, {"obj_ratio": obj_ratio}
            )
            results_selected_obj.extend(results)

pd.DataFrame(results_selected_obj).to_csv("../results/results_selected_obj_Ratio.csv", index=False)

#### Influence of using different distance functions

In [17]:
OBJ_RATIO = 0.25  # Based on previous experiment

# Candidate distances per data type, stored as object arrays because the
# experiment loop indexes them with the selections returned by
# get_binary_distances_choice.
DISTANCES = {
    # NOTE(review): the euclidean entry appears twice -- possibly a copy-paste
    # slip for another projection; confirm the intended third distance.
    "categorical": np.array(
        [
            SelectiveDistance(jaccard_projection, 1, 4),
            SelectiveDistance(euclidean_projection, 1, 2),
            SelectiveDistance(euclidean_projection, 1, 2),
        ],
        dtype=object,
    ),
    "graph": np.array(
        [
            IpsenMikailovDist(),
            PortraitDivergenceDist(),
        ],
        dtype=object,
    ),
    "numerical": np.array(
        [
            SelectiveDistance(manhattan_projection, 1, 3),
            SelectiveDistance(euclidean_projection, 1, 3),
            SelectiveDistance(cosine_projection, 2, 3),
        ],
        dtype=object,
    ),
    "nlp": np.array(
        [
            SelectiveDistance(manhattan_projection, 1, 3),
            SelectiveDistance(euclidean_projection, 1, 3),
            SelectiveDistance(cosine_projection, 2, 3),
        ],
        dtype=object,
    ),
}

In [18]:
# Evaluate RISF for every selection of distances returned by
# get_binary_distances_choice, per data type and dataset, and store all rows
# in one CSV.
# NOTE(review): the recorded run stopped on the graph data with
# "OverflowError: Range exceeds valid bounds". NumPy's uniform sampler raises
# this for a non-finite range, so NaN/inf values in the graph distances are a
# likely cause -- verify before trusting the graph results.
results_distance_comb = []

for data_type, candidate_distances in DISTANCES.items():
    distances_to_use = get_binary_distances_choice(candidate_distances)
    print(data_type)
    is_graph = data_type == "graph"
    for dataset_name in DATASETS[data_type]:
        for dist_to_use in distances_to_use:
            dist = list(candidate_distances[dist_to_use])
            data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, "RISF")

            if is_graph:
                aucs = experiment_risf_complex(data, dist, OBJ_RATIO, clf_kwargs={})
                # Graph distances are labelled by the first 3 letters of their class name.
                name_parts = (type(d).__name__[:3] for d in dist)
            else:
                aucs = perform_experiment_simple("RISF", data, clf_kwargs={}, distances=[dist])
                # Selective distances are labelled by their projection function name.
                name_parts = (d.projection_func.__name__[:3] for d in dist)
            dist_short = '_'.join(name_parts)

            results = init_results(
                CLF, dataset_name, data_type, aucs, {"distances": dist_short}
            )
            results_distance_comb.extend(results)

pd.DataFrame(results_distance_comb).to_csv("../results/results_selected_distances.csv", index=False)


categorical
[<risf.distance.SelectiveDistance object at 0x000002010AD46940>, <risf.distance.SelectiveDistance object at 0x000002010AD46AF0>, <risf.distance.SelectiveDistance object at 0x000002010AD469D0>]
[<risf.distance.SelectiveDistance object at 0x000002010AD46940>, <risf.distance.SelectiveDistance object at 0x000002010AD46AF0>]
[<risf.distance.SelectiveDistance object at 0x000002010AD46940>, <risf.distance.SelectiveDistance object at 0x000002010AD469D0>]
[<risf.distance.SelectiveDistance object at 0x000002010AD46940>]
[<risf.distance.SelectiveDistance object at 0x000002010AD46AF0>, <risf.distance.SelectiveDistance object at 0x000002010AD469D0>]
[<risf.distance.SelectiveDistance object at 0x000002010AD46AF0>]
[<risf.distance.SelectiveDistance object at 0x000002010AD469D0>]
graph
[<risf.distance_functions.IpsenMikailovDist object at 0x000002010AD46F70>, <risf.distance_functions.PortraitDivergenceDist object at 0x000002010AD464C0>]
[<risf.distance_functions.IpsenMikailovDist object at

OverflowError: Range exceeds valid bounds

In [None]:
# TODO: run an experiment on the average path length