In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from src.model.dkdn import *
from src.model.instance_hardness import *
from src.model.support_subset import *
from src.utils import *

In [20]:
# --- Experiment setup: load the dataset, preprocess it, hold out a test set ---
experiment = 'australian'
print(f'Experiment: {experiment}\n')

# Output folder for this notebook's results (created if it does not exist).
results_folder = '../results/incremental'
os.makedirs(results_folder, exist_ok=True)

data = pd.read_parquet(f'../data/{experiment}.parquet')

# Results container, keyed by experiment name.
exp_info = {experiment: {}}

# Preprocessing: standardize the features and recode the label -1 as 0.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test-set statistics leak into the features. Left unchanged here
# because the stored reference scores may rely on the same preprocessing.
scaler = StandardScaler()
X = scaler.fit_transform(data.drop(columns=['y']))
# np.where builds a fresh array instead of assigning into `data.y.values`:
# the in-place write silently modified `data` and fails on the read-only
# arrays pandas returns under Copy-on-Write.
labels_raw = data.y.values
y = np.where(labels_raw == -1, 0, labels_raw).astype(int)

# Stratified 20% hold-out used only to evaluate the final models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

# --- Initial / incremental split and heuristic thresholds ---
# Fraction of the training data held back as the incremental batch.
# This is a plain assignment, not a loop: a `for` header with no indented
# body is a syntax error, and its loop variable would be overwritten here.
incr = 0.2

# Stratified division of the training data into initial set and incremental set.
X_ini, X_incr, y_ini, y_incr = train_test_split(X_train, y_train, test_size=incr, stratify=y_train, random_state=42)

# Seed passed to the sampling heuristic so its random choices are repeatable.
rng_seed = 1234

# Heuristic thresholds computed from the complexity of the initial set.
print('Heuristic thresholds computation ... ')
complexity_ini, higher_complexity_ini = complexity_high_class(X_ini, y_ini)
thresholds_ini = expected_performance_thresholds(higher_complexity_ini)

# Per-increment results container.
exp_info[experiment][incr] = {}

# Summary of the earlier sampling experiment; read further down for each
# method's 'best_params' and 'test_score'.
with open(f'../results/sampling/{experiment}.json', 'r') as fin:
    exp_summary = json.load(fin)

# Classifier classes evaluated in turn.
methods = [SVC, KNeighborsClassifier, RandomForestClassifier]

# --- Per-classifier incremental experiment ---
for method in methods:
    # Method setup. The repr of a default-constructed estimator minus its
    # trailing "()" is the key used in `exp_summary`.
    str_method = str(method())[:-2]
    params = exp_summary[str_method]['best_params']
    clf = method(**params)

    # Support subset estimation on the initial set.
    print('Support subset estimation ... ')
    ss_idx, ini_performance = sampling_heuristic(complexity_ini, X_ini, y_ini, clf, thresholds_ini, random_state=rng_seed, verbose=True)

    # Incremental data evaluation. `ss_idx` indexes the initial set, so the
    # labels must come from `y_ini` (not the full label vector `y`) to stay
    # aligned with `X_ini[ss_idx]`.
    clf.fit(X_ini[ss_idx], y_ini[ss_idx])
    incr_score = scaled_mcc(y_incr, clf.predict(X_incr))

    # Reset on every iteration so the report never reads an unbound name or a
    # value left over from the previous classifier.
    thresholds_incr = None

    # Incremental data sampling: only when the current model beats neither its
    # own initial performance nor the first heuristic threshold on the new batch.
    if not (incr_score > ini_performance or incr_score > thresholds_ini[0]):
        print('Incremental data thresholds computation ...')
        complexity_incr, higher_complexity_incr = complexity_high_class(X_incr, y_incr)
        thresholds_incr = expected_performance_thresholds(higher_complexity_incr)
        print('Incremental sampling ...')
        # The score returned here is not reported; it gets its own name so it
        # is not confused with the final `incr_performance` computed below.
        incr_idx, _incr_sampling_performance = sampling_heuristic(complexity_incr, X_incr, y_incr, clf, thresholds_incr, random_state=rng_seed, verbose=True)
        # New data to train model: support subset plus sampled incremental points.
        X_new = np.append(X_ini[ss_idx], X_incr[incr_idx], axis=0)
        y_new = np.append(y_ini[ss_idx], y_incr[incr_idx], axis=0)
    else:
        X_new = X_ini[ss_idx]
        y_new = y_ini[ss_idx]

    # Train new model
    clf.fit(X_new, y_new)

    # Performances computation on the retained data, the incremental batch,
    # and the held-out test set.
    new_performance = scaled_mcc(y_new, clf.predict(X_new))
    incr_performance = scaled_mcc(y_incr, clf.predict(X_incr))
    test_performance = scaled_mcc(y_test, clf.predict(X_test))

    method_info = {'proporcion': len(X_new)/len(X_train),
        'test goal': exp_summary[str_method]['test_score'],
        'test performance': test_performance,
        'new performance': new_performance,
        'ini performance': ini_performance,
        'incr performance': incr_performance,
        'ini thresholds': thresholds_ini,
        'incr thresholds': thresholds_incr}  # None when no incremental sampling ran
    print(f'{str_method}: {method_info}')
    # Store under the per-increment entry so results for different increment
    # sizes do not overwrite each other.
    exp_info[experiment].setdefault(incr, {})[str_method] = method_info

Experiment: australian

Heuristic thresholds computation ... 
Support subset estimation ... 
0.1
0.2


KeyboardInterrupt: 

0.5
0.4
0.3
0.2
0.1
