In [1]:
import sys
sys.path.insert(0, '../data')


from utils import (init_results, 
                            get_dataset,
                            perform_experiment_simple,
                            SEED,
                            PRECOMPUTED_DISTANCES_PATH,
                            experiment_rsif_complex,
                            experiment_rsif_mixed,
                            ensure_existance,
                            get_logger,
                            check_precomputed_notebook,
                            NJobs)

check_precomputed_notebook()

import numpy as np
from netrd.distance import PortraitDivergence, DegreeDivergence, NetLSD
from rsif.distance_functions import (GraphDist, manhattan_projection, cosine_projection,
                                    euclidean_projection, DTWDist, EuclideanDist, WassersteinDist, CosineDist, Goodall3Dist, OFDist, 
                                    BagOfWord_projection, BagOfWordDist, NumericalReprDist, NumericalReprDist,
                                    EditDistanceSequencesOfSets, ManhattanDist, ChebyshevDist, dummy_projection, LinDist, chebyshev_projection)
from rsif.distance import SelectiveDistance
from pathlib import Path
import pandas as pd

# IPython magics: load the autoreload extension and switch it to mode 2 so
# edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Seed NumPy's global RNG with the SEED constant imported from utils.
np.random.seed(SEED)

# Parallelism setting forwarded to the NJobs holder imported from utils.
# NOTE(review): -1 presumably means "use all cores" (joblib convention) — confirm in utils.
N_JOBS = -1
NJobs.set_n_jobs(N_JOBS)

# Logger used by the experiment cell below for progress and error messages.
LOGGER = get_logger("experiments")

In [2]:
# Make sure the directory for precomputed distances exists. parents=True lets
# this succeed on a fresh checkout where intermediate folders are missing too.
PRECOMPUTED_DISTANCES_PATH.mkdir(parents=True, exist_ok=True)

# Dataset names to benchmark, grouped by data type. The key order is the order
# in which the experiment loop processes the data types.
DATASETS = {
    "graph": ["NCI1", "AIDS", "ENZYMES", "PROTEINS_full", "DD"],
    "numerical": [
        "14_glass", "20_letter", "25_musk", "2_annthyroid", "31_satimage-2",
        "38_thyroid", "40_vowels", "41_Waveform", "42_WBC", "43_WDBC", "44_Wilt",
    ],
    "binary": ["AID362red_train_allpossiblenominal", "apascal_entire_trainvsall", "Reuters-corn-100"],
    "nominal": ["cmc-nominal", "solar-flare_FvsAll-cleaned"],
    "timeseries": ["Earthquakes", "ECGFiveDays", "MiddlePhalanxOutlineCorrect", "SonyAIBORobotSurface1"],
    "nlp": ["amazon", "imdb", "yelp"],
    "cv": ["CIFAR10_0", "FashionMNIST_0", "SVHN_0"],
    "multiomics": ["breast", "ovarian", "rosmap"],
    "seq_of_sets": ["items", "length", "order"],
}

# Common root of every dataset folder, kept in one place so it is easy to relocate.
DATA_ROOT = Path("../data")

# Folder holding the datasets of each data type (same keys as DATASETS).
# "binary"/"nominal" share one folder, as do "multiomics"/"seq_of_sets".
DATA_DIRS = {
    "graph": DATA_ROOT / "graph",
    "numerical": DATA_ROOT / "numerical",
    "binary": DATA_ROOT / "categorical",
    "nominal": DATA_ROOT / "categorical",
    "timeseries": DATA_ROOT / "timeseries",
    "nlp": DATA_ROOT / "adBench" / "NLP_by_RoBERTa",
    "cv": DATA_ROOT / "adBench" / "CV_by_ResNet18",
    "multiomics": DATA_ROOT / "mixed",
    "seq_of_sets": DATA_ROOT / "mixed",
}

def get_distances_rsif(X, data_type: str):
    """Build the candidate distance functions used by RSIF for one data type.

    Parameters
    ----------
    X
        The data the distances will be applied to. Only the "binary" and
        "nominal" branches use it (it is handed to LinDist, Goodall3Dist and
        OFDist); every other branch ignores it.
    data_type : str
        One of "graph", "numerical", "timeseries", "binary", "nominal",
        "nlp", "cv", "histogram", "multiomics" or "seq_of_sets".

    Returns
    -------
    numpy.ndarray
        1-D object array of distance instances for ``data_type``.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values. Previously the
        function fell through and returned ``None``, which only surfaced as an
        obscure failure further down the pipeline.
    """
    # NOTE(review): the two trailing integers given to SelectiveDistance are
    # reproduced as-is; their meaning is defined in rsif.distance — confirm there.
    if data_type == "graph":
        return np.array([GraphDist(PortraitDivergence), SelectiveDistance(BagOfWord_projection, 1, 1),
                    GraphDist(DegreeDivergence), GraphDist(NetLSD), NumericalReprDist()], dtype=object)
    if data_type == "numerical":
        return np.array([SelectiveDistance(manhattan_projection, 1, 1),
                        SelectiveDistance(euclidean_projection, 1, 1),
                        SelectiveDistance(cosine_projection, 2, 2),
                        SelectiveDistance(chebyshev_projection, 2, 2)], dtype=object)
    if data_type == "timeseries":
        return np.array([SelectiveDistance(dummy_projection, 1, 1), SelectiveDistance(chebyshev_projection, 2, 2),
                SelectiveDistance(euclidean_projection, 1, 1), DTWDist()], dtype=object)
    if data_type in ("binary", "nominal"):
        # These categorical distances are constructed from the data itself.
        return np.array([SelectiveDistance(dummy_projection, 1, 1), LinDist(X), Goodall3Dist(X), OFDist(X)], dtype=object)
    if data_type in ("nlp", "cv"):
        return np.array([SelectiveDistance(dummy_projection, 1, 1), SelectiveDistance(euclidean_projection, 1, 1),
                SelectiveDistance(chebyshev_projection, 2, 2), SelectiveDistance(cosine_projection, 2, 2)], dtype=object)
    if data_type == "histogram":
        return np.array([SelectiveDistance(dummy_projection, 1, 1), WassersteinDist()], dtype=object)
    if data_type == "multiomics":
        return np.array([SelectiveDistance(dummy_projection, 1, 1), CosineDist(), EuclideanDist()], dtype=object)
    if data_type == "seq_of_sets":
        return np.array([EditDistanceSequencesOfSets(), SelectiveDistance(dummy_projection, 1, 1), ManhattanDist()], dtype=object)

    # Fail loudly instead of implicitly returning None for an unknown type.
    raise ValueError(f"Unsupported data_type for RSIF distances: {data_type!r}")
    
def get_distances_sf(X, data_type: str):
    """Build the candidate distance functions used by the ISF baseline.

    Parameters
    ----------
    X
        The data the distances will be applied to. Only the "binary" and
        "nominal" branches use it (it is handed to LinDist, Goodall3Dist and
        OFDist); every other branch ignores it.
    data_type : str
        One of "graph", "numerical", "binary", "nominal", "nlp", "cv",
        "timeseries", "histogram", "seq_of_sets" or "multiomics".

    Returns
    -------
    numpy.ndarray
        1-D object array of distance instances for ``data_type``.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values. Previously the
        function fell through and returned ``None``.
    """
    if data_type == "graph":
        return np.array([GraphDist(PortraitDivergence), BagOfWordDist(),
                    GraphDist(DegreeDivergence), GraphDist(NetLSD), NumericalReprDist()], dtype=object)
    if data_type in ("numerical", "nlp", "cv"):
        # Vector-valued data types share the same four distances.
        return np.array([ManhattanDist(), EuclideanDist(), ChebyshevDist(), CosineDist()], dtype=object)
    if data_type in ("binary", "nominal"):
        # These categorical distances are constructed from the data itself.
        return np.array([LinDist(X), Goodall3Dist(X), OFDist(X)], dtype=object)
    if data_type == "timeseries":
        return np.array([CosineDist(), ChebyshevDist(), ManhattanDist(), DTWDist(), EuclideanDist()], dtype=object)
    if data_type == "histogram":
        return np.array([WassersteinDist()], dtype=object)
    if data_type == "seq_of_sets":
        return np.array([ManhattanDist(), CosineDist(), EditDistanceSequencesOfSets()], dtype=object)
    if data_type == "multiomics":
        return np.array([CosineDist(), EuclideanDist()], dtype=object)

    # Fail loudly instead of implicitly returning None for an unknown type.
    raise ValueError(f"Unsupported data_type for ISF distances: {data_type!r}")


In [4]:
# Detectors to benchmark; each one is run on every dataset of every data type.
classifiers = ["RSIF", "ISF", "LOF", "IForest", "ECOD", "HBOS"]
# Value passed as `selected_obj_ratio` to the RSIF experiment helpers.
SELECTED_OBJ_RATIO = 0.5
# One CSV per data type is written into this directory.
EXP_DIR = Path("../results/experiments/")
ensure_existance([EXP_DIR])

# Data types for which RSIF goes through perform_experiment_simple ...
RSIF_SIMPLE_TYPES = ("numerical", "nlp", "cv")
# ... and those for which it goes through experiment_rsif_complex. Any other
# data type is handled by experiment_rsif_mixed with one distance set per view.
RSIF_COMPLEX_TYPES = ("timeseries", "graph", "binary", "nominal", "seq_of_sets")

LOGGER.info("Starting experiments")
for data_type in DATASETS:
    LOGGER.info(f"Data type: {data_type}")
    results_all = []
    for clf in classifiers:
        LOGGER.info(f"Classifier: {clf}")
        for dataset_name in DATASETS[data_type]:
            LOGGER.info(f"Dataset: {dataset_name}")
            try:
                # Fresh dict per run so nothing leaks between experiments.
                clf_kwargs = {}

                data = get_dataset(data_type, DATA_DIRS[data_type], dataset_name, clf)
                data['type'] = data_type

                if clf == "RSIF":
                    if data_type in RSIF_SIMPLE_TYPES:
                        distances = get_distances_rsif(data["X"], data_type)
                        aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, distances=distances, optimize_distances=True)
                    elif data_type in RSIF_COMPLEX_TYPES:
                        distances = get_distances_rsif(data["X"], data_type)
                        aucs = experiment_rsif_complex(clf, data, distances, selected_obj_ratio=SELECTED_OBJ_RATIO, clf_kwargs=clf_kwargs, optimize_distances=True)
                    else:
                        # data["features_types"] names the type of each entry
                        # of data["X"]; build one distance set per entry.
                        feature_types = data["features_types"]
                        distances = [
                            get_distances_rsif(data["X"][i], feature_type)
                            for i, feature_type in enumerate(feature_types)
                        ]
                        aucs = experiment_rsif_mixed(data, distances, selected_obj_ratio=SELECTED_OBJ_RATIO, clf_kwargs=clf_kwargs, optimize_distances=True)
                elif clf == "ISF":
                    distances = get_distances_sf(data["X"], data_type)
                    # ISF is always run with selected_obj_ratio=1.
                    aucs = experiment_rsif_complex(clf, data, distances, selected_obj_ratio=1, clf_kwargs=clf_kwargs, optimize_distances=True)
                else:
                    # Remaining detectors take no custom distances.
                    aucs = perform_experiment_simple(clf, data, clf_kwargs=clf_kwargs, optimize_distances=True)

                results = init_results(clf, dataset_name, data_type, aucs, clf_kwargs)
                results_all.extend(results)
            except Exception as e:
                # Best-effort benchmark: one failing run must not abort the
                # sweep. Log with the traceback so the failure can be diagnosed,
                # then move on to the next dataset.
                LOGGER.exception(f"Error in {dataset_name} {data_type} {clf} {e}")

    # Persist everything collected for this data type in a single CSV.
    pd.DataFrame(results_all).to_csv(EXP_DIR / f"{data_type}.csv", index=False)
