In [1]:
import sys
sys.path.insert(0, '..')
sys.path.insert(0, '../data')


from utils import (init_results, 
                            get_dataset,
                            perform_experiment_simple,
                            SEED,
                            PRECOMPUTED_DISTANCES_PATH,
                            experiment_risf_complex,
                            experiment_risf_mixed)

import numpy as np
from netrd.distance import NetSimile, PortraitDivergence, DegreeDivergence, IpsenMikhailov,  JaccardDistance, NetLSD, OnionDivergence
from risf.distance_functions import (GraphDist, manhattan_projection, jaccard_projection, cosine_projection, 
                                    euclidean_projection, DTWDist, EuclideanDist, WassersteinDist,
                                    EditDistanceSequencesOfSets, ManhattanDist, DiceDist, dummy_projection, LinDist, chebyshev_projection)
from risf.distance import SelectiveDistance
from pathlib import Path
import pandas as pd

# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing code so edits to local .py files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Seed NumPy's global random number generator with SEED (imported from utils).
np.random.seed(SEED)

In [2]:
PRECOMPUTED_DISTANCES_PATH.mkdir(exist_ok=True)

# Catalogue of every dataset group this notebook knows about, keyed by data type.
AVAILABLE_DATASETS = {
    "graph": ["NCI1", "AIDS", "ENZYMES", "PROTEINS_full", "DD"],
    "numerical": [
        "14_glass", "20_letter", "25_musk", "2_annthyroid", "31_satimage-2",
        "38_thyroid", "40_vowels", "41_Waveform", "42_WBC", "43_WDBC", "44_Wilt",
    ],
    "binary": [
        "AID362red_train_allpossiblenominal",
        "apascal_entire_trainvsall",
        "Reuters-corn-100",
    ],
    "nominal": ["cmc-nominal", "solar-flare_FvsAll-cleaned"],
    "timeseries": [
        "Earthquakes", "ECGFiveDays", "MiddlePhalanxOutlineCorrect",
        "SonyAIBORobotSurface1",
    ],
    "nlp": ["amazon", "imdb", "yelp"],
    "cv": ["CIFAR10_0", "FashionMNIST_0", "SVHN_0"],
    "multiomics": ["breast", "ovarian", "rosmap"],
    "seq_of_sets": ["items", "length", "order"],
}

# Data types included in this run, in execution order. Add a key from
# AVAILABLE_DATASETS here to enable that group.
ENABLED_DATA_TYPES = ("binary", "nominal")

# Datasets that the experiment loop will actually iterate over.
DATASETS = {
    enabled_type: AVAILABLE_DATASETS[enabled_type]
    for enabled_type in ENABLED_DATA_TYPES
}

# Directory holding the raw files of each data type, all relative to one root.
DATA_ROOT = Path("../data")
DATA_DIRS = {
    "graph": DATA_ROOT / "graph",
    "numerical": DATA_ROOT / "numerical",
    # Binary and nominal datasets share the categorical directory.
    "binary": DATA_ROOT / "categorical",
    "nominal": DATA_ROOT / "categorical",
    "timeseries": DATA_ROOT / "timeseries",
    "nlp": DATA_ROOT / "adBench" / "NLP_by_RoBERTa",
    "cv": DATA_ROOT / "adBench" / "CV_by_ResNet18",
    # Multiomics and sequence-of-sets datasets share the mixed directory.
    "multiomics": DATA_ROOT / "mixed",
    "seq_of_sets": DATA_ROOT / "mixed",
}

# NOTE: Do not choose the distances by eye; compute each distance's average position in the rankings and select on that basis.
def get_distances_risf(X, data_type):
    """Return the list of distance objects RISF is given for ``data_type``.

    Parameters
    ----------
    X
        The data the distances will be applied to. It is only read for
        "binary"/"nominal" (passed to ``LinDist``) and "multiomics"
        (``X.shape[1]`` is used for both numeric arguments of
        ``SelectiveDistance``); other data types ignore it.
    data_type : str
        One of "graph", "numerical", "binary", "nominal", "timeseries",
        "nlp", "cv", "histogram", "multiomics", "seq_of_sets" or
        "bag_of_words".

    Returns
    -------
    list or None
        A flat list of distance objects, or ``None`` when ``data_type`` is not
        one of the values above.

    Notes
    -----
    The two integers passed to ``SelectiveDistance`` are presumably the
    minimum and maximum number of features to project on -- TODO confirm
    against ``risf.distance.SelectiveDistance``.
    """
    if data_type == "graph":
        return [
            GraphDist(PortraitDivergence),
            ManhattanDist(),
            GraphDist(DegreeDivergence),
            GraphDist(NetLSD),
            EuclideanDist(),
        ]
    if data_type == "numerical":
        return [SelectiveDistance(euclidean_projection, 1, 3)]
    # Binary and nominal data use the same pair of distances. The original
    # chain tested "binary" twice; the second test could never be reached.
    if data_type in ("binary", "nominal"):
        return [SelectiveDistance(dummy_projection, 1, 1), LinDist(X)]
    if data_type == "timeseries":
        return [
            SelectiveDistance(manhattan_projection, 1, 3),
            SelectiveDistance(chebyshev_projection, 5, 10),
            DTWDist(),
        ]
    if data_type in ("nlp", "cv"):
        return [
            SelectiveDistance(manhattan_projection, 1, 3),
            SelectiveDistance(euclidean_projection, 1, 3),
            SelectiveDistance(chebyshev_projection, 5, 10),
        ]
    if data_type == "histogram":
        return [WassersteinDist()]
    if data_type == "multiomics":
        n_columns = X.shape[1]
        return [
            SelectiveDistance(cosine_projection, n_columns, n_columns),
            SelectiveDistance(euclidean_projection, n_columns, n_columns),
        ]
    if data_type == "seq_of_sets":
        return [EditDistanceSequencesOfSets()]
    if data_type == "bag_of_words":
        return [ManhattanDist()]
    # Unknown data type: keep the original behaviour of returning None so
    # existing callers are unaffected.
    return None
    
def get_distances_sf(X, data_type):
    """Pick the single distance configuration used by ISF for ``data_type``.

    ``X`` is consulted only where the choice depends on the data: its second
    dimension (``X.shape[1]``) for "numerical", "nlp", "cv" and "multiomics",
    and the object itself for ``LinDist`` on "binary"/"nominal".

    Returns a one-element list, or ``None`` for an unrecognised ``data_type``.
    For "numerical", "nlp" and "cv" the single element is itself a
    one-element list.

    NOTE(review): that extra nesting differs from every other branch --
    confirm it is what the experiment helpers expect.
    """

    def _over_all_columns(projection):
        # Evaluated lazily so X is only touched by the branches that need it.
        return SelectiveDistance(projection, X.shape[1], X.shape[1])

    if data_type == "graph":
        return [GraphDist(PortraitDivergence)]
    if data_type == "numerical":
        return [[_over_all_columns(euclidean_projection)]]
    if data_type in ("binary", "nominal"):
        return [LinDist(X)]
    if data_type in ("nlp", "cv"):
        return [[_over_all_columns(cosine_projection)]]
    if data_type == "timeseries":
        return [DTWDist()]
    if data_type == "histogram":
        return [WassersteinDist()]
    if data_type == "multiomics":
        return [_over_all_columns(cosine_projection)]
    # Sequences of sets and bag-of-words both use the Manhattan distance.
    if data_type in ("seq_of_sets", "bag_of_words"):
        return [ManhattanDist()]
    return None

In [6]:
# Detectors to benchmark; each one is run on every enabled dataset.
classifiers = ["RISF", "IForest", "ISF", "ECOD", "LOF", "HBOS"]
SELECTED_OBJ_RATIO = 0.5

# Create the output directory up front so a long run cannot fail at the very
# end because the folder for the CSV is missing.
RESULTS_DIR = Path("../results/experiments")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

for data_type, dataset_names in DATASETS.items():
    print(data_type)
    results_all = []
    for clf in classifiers:
        print(clf)
        for dataset_name in dataset_names:
            # No classifier-specific keyword arguments are used at the moment;
            # a fresh dict is created per run so runs never share one.
            clf_kwargs = {}

            data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, clf)

            if clf == "RISF":
                if data_type in ["numerical", "nlp", "cv"]:
                    distances = get_distances_risf(data["X"], data_type)
                    aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, distances=distances)
                elif data_type in ["timeseries", "graph", "binary", "nominal"]:
                    distances = get_distances_risf(data["X"], data_type)
                    aucs = experiment_risf_complex(data, distances, selected_obj_ratio=SELECTED_OBJ_RATIO, clf_kwargs=clf_kwargs)
                else:
                    # Mixed data: one list of distances per feature, chosen
                    # from that feature's own declared type.
                    distances = [
                        get_distances_risf(data["X"][i], feature_type)
                        for i, feature_type in enumerate(data["features_types"])
                    ]
                    aucs = experiment_risf_mixed(data, distances, selected_obj_ratio=SELECTED_OBJ_RATIO, clf_kwargs=clf_kwargs)

            elif clf == "ISF":
                distances = get_distances_sf(data["X"], data_type)

                # NOTE(review): "categorical" is not a key of DATASETS or
                # DATA_DIRS in this notebook, so "binary" and "nominal" take
                # the else branch below -- confirm that is intended.
                if data_type in ["numerical", "categorical", "nlp", "cv", "multiomics"]:
                    aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, distances=distances)
                else:
                    aucs = experiment_risf_complex(data, distances, selected_obj_ratio=1, clf_kwargs=clf_kwargs)

            else:
                # Baseline detectors take no custom distances.
                aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs)

            results = init_results(clf, dataset_name, data_type, aucs, clf_kwargs)
            results_all.extend(results)

    # One CSV per data type, holding the results of every classifier/dataset pair.
    pd.DataFrame(results_all).to_csv(RESULTS_DIR / f"{data_type}.csv", index=False)


binary
RISF
IForest
ISF
ECOD
LOF
HBOS
nominal
RISF
IForest
ISF
ECOD
LOF
HBOS
