In [61]:
#!/usr/bin/env python3

import argparse, os, numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression  # for LogReg
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
import argparse
from pathlib import Path
import typing as ty
import json
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from xgboost import XGBClassifier, XGBRegressor
from argparse import Namespace
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score




def load_json(path):
    return json.loads(Path(path).read_text())


DATA_PATH= ""
ArrayDict = ty.Dict[str, np.ndarray]

def dataname_to_numpy(dataset_name, dataset_path):

    """
    Load the dataset from the numpy files.

    :param dataset_name: str
    :param dataset_path: str
    :return: Tuple[ArrayDict, ArrayDict, ArrayDict, Dict[str, Any]]
    """
    dir_ = Path(os.path.join(DATA_PATH, dataset_path, dataset_name))

    def load(item) -> ArrayDict:
        return {
            x: ty.cast(np.ndarray, np.load(dir_ / f'{item}_{x}.npy', allow_pickle = True))  
            for x in ['train', 'val', 'test']
        }

    return (
        load('N') if dir_.joinpath('N_train.npy').exists() else None,
        load('C') if dir_.joinpath('C_train.npy').exists() else None,
        load('y'),
        load_json(dir_ / 'info.json'),
    )


def detect_and_log_outliers(split_name, data, dataset_name, method, log_dir="outlier_logs", sample_dir="outlier_samples"):
    df = pd.DataFrame(data)

    numeric_df = df.select_dtypes(include=[np.number])
    numeric_data = numeric_df.to_numpy()


    if method == "IsolationForest":
        clf = IsolationForest(random_state=42)
        mask = clf.fit_predict(numeric_data) == 1
    elif method == "LocalOutlierFactor":
        clf = LocalOutlierFactor(n_neighbors=20)
        mask = clf.fit_predict(numeric_data) == 1
    elif method == "OneClassSVM":
        clf = OneClassSVM(nu=0.05, kernel="rbf", gamma="scale")
        mask = clf.fit_predict(numeric_data) == 1
    elif method == "ZScore":
        z = np.abs((numeric_data - numeric_data.mean(axis=0)) / (numeric_data.std(axis=0) + 1e-9))
        mask = (z < 3).all(axis=1)
    elif method == "ModifiedZScore":
        med = np.median(numeric_data, axis=0)
        mad = np.median(np.abs(numeric_data - med), axis=0)
        mz = 0.6745 * (numeric_data - med) / (mad + 1e-9)
        mask = (np.abs(mz) < 3.5).all(axis=1)
    elif method == "IQR":
        Q1 = np.percentile(numeric_data, 25, axis=0)
        Q3 = np.percentile(numeric_data, 75, axis=0)
        IQR = Q3 - Q1
        mask = ((numeric_data >= Q1 - 1.5 * IQR) & (numeric_data <= Q3 + 1.5 * IQR)).all(axis=1)
    elif method == "HBOS":
        clf = HBOS()
        clf.fit(numeric_data)
        mask = clf.labels_ == 0  # 0 = inlier, 1 = outlier

    elif method == "KDE":
        kde = KernelDensity(kernel="gaussian", bandwidth=1.0)
        kde.fit(numeric_data)
        log_density = kde.score_samples(numeric_data)
        threshold = np.percentile(log_density, 5)  # bottom 5% as outliers
        mask = log_density > threshold
    else:
        raise ValueError(f"Unknown method: {method}")

    num_outliers = len(data) - mask.sum()

    os.makedirs(log_dir, exist_ok=True)
    with open(os.path.join(log_dir, f"{dataset_name}__{split_name}__{method}.txt"), "w") as f:
        f.write(f"Outliers: {num_outliers} / {len(data)}\n")

    
    return mask

def make_model(model_name: str, task_type: str):

    model_name = model_name.lower()
    task_type = task_type.lower()

    if model_name == "mlp":
        if task_type == "classification":
            return MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
        elif task_type == "regression":
            return MLPRegressor(hidden_layer_sizes=(100,), max_iter=500)

    elif model_name == "xgboost":
        if task_type == "classification":
            return XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        elif task_type == "regression":
            return XGBRegressor()

    elif model_name == "linear":
        if task_type == "classification":
            return LogisticRegression(max_iter=1000)
        elif task_type == "regression":
            return LinearRegression()

    elif model_name == "randomforest":
        if task_type == "classification":
            return RandomForestClassifier(n_estimators=100)
        elif task_type == "regression":
            return RandomForestRegressor(n_estimators=100)

    else:
        raise ValueError(f"Unsupported model: {model_name}")

def already_logged(dataset_name, model_name, outlier_method=None, log_dir="results_logs"):
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "results.outliers.txt" if outlier_method else "results.txt")
    if not os.path.exists(log_file):
        return False
    with open(log_file, "r") as f:
        lines = f.read().splitlines()
        for line in lines:
            parts = line.split(",")
            if outlier_method:
                if len(parts) == 4 and parts[0] == dataset_name and parts[1] == model_name and parts[3] == outlier_method:
                    return True
            else:
                if len(parts) == 3 and parts[0] == dataset_name and parts[1] == model_name:
                    return True
    return False
import os
from sklearn.metrics import accuracy_score, mean_squared_error

import numpy as np
import pandas as pd

def simple_preprocess(X):
    """
    Preprocesses only numerical columns:
      - NaNs replaced with column mean
      - Standardization (mean 0, std 1)
    """

    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    numeric_cols = X.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        mean_val = X[col].mean()
        X[col] = X[col].fillna(mean_val)

    for col in numeric_cols:
        col_mean = X[col].mean()
        col_std = X[col].std()
        if col_std != 0:
            X[col] = (X[col] - col_mean) / col_std
        else:
            X[col] = 0  

    return X.values


def evaluate_and_log(model, X_test, y_test, dataset_name, model_name, task_type, use_outliers, outlier_method, log_dir="."):
    """
    Evaluates a model and logs results.

    Parameters
    ----------
    model : trained model
    X_test, y_test : test data
    dataset_name : str
    model_name : str
    task_type : str ("classification" or "regression")
    use_outliers : bool (True if outliers were removed before training)
    outlier_method : str or None
    log_dir : str (folder for results files)
    """
    
    if task_type == "classification":
        predictions = model.predict(X_test)
        metric_value = accuracy_score(y_test, predictions)
    elif task_type == "regression":
        predictions = model.predict(X_test)
        metric_value = mean_squared_error(y_test, predictions)
    else:
        raise ValueError(f"Unknown task type: {task_type}")

    filename = "results.outliers.txt" if use_outliers else "results.txt"
    filepath = os.path.join(log_dir, filename)
    os.makedirs(log_dir, exist_ok=True)

    if os.path.exists(filepath):
        with open(filepath, "r") as f:
            if use_outliers:
                search_str = f"{dataset_name},{model_name},{outlier_method}"
            else:
                search_str = f"{dataset_name},{model_name}"
            if any(search_str in line for line in f):
                print(f"[SKIP] Experiment already exists in {filename}: {search_str}")
                return metric_value

    with open(filepath, "a") as f:
        if use_outliers:
            f.write(f"{dataset_name},{model_name},{metric_value:.6f},{outlier_method}\n")
        else:
            f.write(f"{dataset_name},{model_name},{metric_value:.6f}\n")

    print(f"[LOGGED] {dataset_name}, {model_name}, {metric_value:.6f}" + (f", {outlier_method}" if use_outliers else ""))
    return metric_value

   
def run_experiment(dataset = "cmc", dataset_path="data", model_type = "xgboost", remove_outliers = "", outlier_method = "ZScore"):
   
    
    if remove_outliers:
        remove_outliers = "--remove_outliers"
    args = Namespace(
        dataset=dataset,
        dataset_path=dataset_path,
        model_type=model_type,
        remove_outliers=remove_outliers,
        outlier_method=outlier_method
    )


    
    N, C, y, info = dataname_to_numpy(args.dataset, args.dataset_path)
    def split_dict(d): return ({k: d[k] for k in ("train", "val") if k in d}, { "test": d["test"] }) if d else (None, None)
    N_trval, N_test = split_dict(N)
    C_trval, C_test = split_dict(C)
    y_trval, y_test = split_dict(y)

    if args.remove_outliers:
        def comb(N_arr, C_arr):
            return np.hstack([N_arr, C_arr]) if C_arr is not None else N_arr
        for split, N_arr, C_arr in [("train", N_trval["train"], C_trval["train"] if C_trval else None),
                                   ("val", N_trval["val"], C_trval["val"] if C_trval else None),
                                   ("test", N_test["test"], C_test["test"] if C_test else None)]:
            data = comb(N_arr, C_arr)
            mask = detect_and_log_outliers(split, data, args.dataset, args.outlier_method)
            if split == "train":
                N_trval["train"] = N_arr[mask]
                if C_trval: C_trval["train"] = C_arr[mask]
                y_trval["train"] = y_trval["train"][mask]

    X_train = np.hstack((N_trval["train"],)) 
    y_train = y_trval["train"]
    X_test = np.hstack((N_test["test"],))
    y_t = y_test["test"]

    X_train = simple_preprocess(X_train)
    X_test = simple_preprocess(X_test)

    task_type=info['task_type']
    if task_type!="regression":
        task_type = "classification"
    model = make_model(model_type, task_type)
    if task_type=="classification":
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_t = le.fit_transform(y_t)

    if model is None:
        raise ValueError(f"Model {args.model_type} not in models")

    if remove_outliers:
        mask_train = detect_and_log_outliers("train", X_train, dataset, outlier_method)
        X_train = X_train[mask_train]
        y_train = y_train[mask_train]

    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(len(X_test))


    evaluate_and_log(model, X_test, y_t, dataset, model_type, task_type, remove_outliers, outlier_method, log_dir=".")
if __name__ == "__main__":
    #if not already_logged()
    run_experiment(remove_outliers = "yes", outlier_method = "ZScore")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


295
[LOGGED] cmc, xgboost, 0.505085, ZScore


In [8]:
import sys

!{sys.executable} -m pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Using cached xgboost-3.0.3-py3-none-manylinux_2_28_x86_64.whl (253.8 MB)
Collecting nvidia-nccl-cu12
  Using cached nvidia_nccl_cu12-2.27.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.5 MB)
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.27.7 xgboost-3.0.3


In [60]:
import sys

!{sys.executable} -m pip install pyod

Defaulting to user installation because normal site-packages is not writeable
Collecting pyod
  Using cached pyod-2.0.5-py3-none-any.whl (200 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.7 MB)
Collecting numba>=0.51
  Using cached numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.8 MB)
Collecting llvmlite<0.45,>=0.44.0dev0
  Using cached llvmlite-0.44.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.4 MB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.59.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (4.8 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (325 kB)
Collecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Installing