In [1]:
import pandas as pd
import numpy as np

from execution_schemes import *
from main import *

from src import *
datasets = load_datasets()

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from pymoo.operators.sampling.rnd import BinaryRandomSampling
from pymoo.operators.crossover.hux import HUX
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.indicators.hv import Hypervolume
from pymoo.core.mutation import Mutation
from pymoo.core.sampling import Sampling
from pymoo.core.problem import Problem
from pymoo.optimize import minimize

from sklearn.model_selection import StratifiedShuffleSplit

from joblib import Parallel, delayed

from scipy.stats import ranksums

import pickle
import os
import re

import pandas as pd
import numpy as np




# Executions

In [3]:
class GenericOptimizer(Problem):
	population_size = 100
	n_neighbours = 5
	sequential = False
	def __init__(self, X_train, y_train, X_val, y_val, objectives, exec_mode):
		self.mutation_history = {}
		self.generation_number = 0

		self.exec_mode = exec_mode

		self.X_train = X_train
		self.y_train = y_train

		self.X_val = X_val
		self.y_val = y_val

		self.training_data = X_train
		self.n_instances = X_train.shape[0]
		
		self.objectives = objectives

		super().__init__(
			n_var=self.n_instances,
			n_obj=len(objectives),               
			n_constr=0,            
			xl=0,                  
			xu=1,                  
			type_var=np.bool_,     
		)

	def _evaluate(self, x, out, *args, **kwargs):
		
		if self.exec_mode == "sequential":
			metrics = []
			for objective in self.objectives:
				metrics.append(self.eval_objective((objective, x)))
		else:
			metrics = Parallel(n_jobs=-1)(delayed(self.eval_objective)((objective, x)) for objective in self.objectives)
		
		self.generation_number += 1

		out["F"] = np.column_stack(metrics)

	def eval_objective(self, pack):
		objective, x = pack
			
		if "calculate_num_examples" in repr(objective):
			return GenericOptimizer.calculate_num_examples(x)

		elif "calculate_IR" in repr(objective):
			vals = []
			for instance in x:
				vals.append(GenericOptimizer.calculate_IR(self.y_train[instance]))
			return vals
		
		else:
			vals = []
			for instance in x:
				vals.append(objective(
					self.X_train[instance],
					self.y_train[instance],
					self.X_val,
					self.y_val,
					GenericOptimizer.n_neighbours
				))
			return vals

	@classmethod
	def calculate_IR(cls, y):
		df = pd.DataFrame(y).value_counts()
		return (df[1]/df[0]) if df.min() == 0 else (df[0]/df[1])
	
	@classmethod
	def filter_by_class(cls, x, y, label):
		indices = np.where(y==label)
		return x[indices], y[indices]
	
	@classmethod
	def calculate_overall_error(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]

		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			acc = accuracy_score(y_val, y_pred)
			return 1-acc
		else:
			return 1

	@classmethod
	def calculate_class0_error(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 0)
		err = cls.calculate_overall_error(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return err

	@classmethod
	def calculate_class1_error(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 1)
		err = cls.calculate_overall_error(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return err

	@classmethod
	def calculate_overall_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]
		counts = pd.DataFrame(y_train).value_counts()
		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			f1 = f1_score(y_val, y_pred, average='weighted')
			return 1-f1
		else:
			return 1

	@classmethod
	def calculate_class0_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 0)
		inv_f1 = cls.calculate_overall_inverse_f1(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_f1

	@classmethod
	def calculate_class1_inverse_f1(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 1)
		inv_f1 = cls.calculate_overall_inverse_f1(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_f1
	
	@classmethod
	def calculate_overall_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]

		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			prec = precision_score(y_val, y_pred)
			return 1-prec
		else:
			return 1

	@classmethod
	def calculate_class0_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 0)
		inv_prec = cls.calculate_overall_inverse_precision(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_prec

	@classmethod
	def calculate_class1_inverse_precision(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 1)
		inv_prec = cls.calculate_overall_inverse_precision(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_prec
		
	@classmethod
	def calculate_overall_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
				
		num_included_instances = x_train.shape[0]

		if num_included_instances >= n:
			optimization_knn = KNeighborsClassifier(n_neighbors=n)
			optimization_knn.fit(x_train, y_train)

			y_pred = optimization_knn.predict(x_val)
			recall = recall_score(y_val, y_pred)
			return 1-recall
		else:
			return 1

	@classmethod
	def calculate_class0_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 0)
		inv_recall = cls.calculate_overall_inverse_recall(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_recall

	@classmethod
	def calculate_class1_inverse_recall(cls, x_train, y_train, x_val, y_val, n):
		class0_x_train, class0_y_train = cls.filter_by_class(x_train, y_train, 1)
		inv_recall = cls.calculate_overall_inverse_recall(
			class0_x_train,
			class0_y_train,
			x_val,
			y_val,
			n
		)
		return inv_recall
	
	@classmethod
	def calculate_num_examples(cls, instances):
		return np.sum(instances, axis=1)

	@classmethod
	def quantify_performance(cls, population, objectives, x_train, y_train, x_validation, y_validation, x_test, y_test):
		pass

	@classmethod
	def unbound_eval_objectives(cls, objective, instances, x_train, y_train, x_validation, y_validation):
		if "calculate_num_examples" in repr(objective):
			return GenericOptimizer.calculate_num_examples(instances)

		elif "calculate_IR" in repr(objective):
			vals = []
			for instance in instances:
				vals.append(GenericOptimizer.calculate_IR(y_train[instance]))
			return vals
		
		else:
			vals = []
			for instance in instances:
				vals.append(objective(
					x_train[instance],
					y_train[instance],
					x_validation,
					y_validation,
					GenericOptimizer.n_neighbours
				))
			return vals
		
	@classmethod
	def calculate_optimal_instance(cls, x_train, y_train, x_val, y_val, metrics, population, n):

		fronts = NonDominatedSorting().do(metrics, only_non_dominated_front=True)
		_, pareto_indicies = np.unique(metrics[fronts], axis=0, return_index=True)

		best_acc = 0
		best_instance = None
		for idx, instance in enumerate(population[pareto_indicies]):
			x_filtered, y_filtered = x_train[instance], y_train[instance]
			if x_filtered.shape[0] < n: 
				acc = 0
			else:
				knn = KNeighborsClassifier(n_neighbors=n)
				knn.fit(x_filtered, y_filtered)
				y_pred = knn.predict(x_val)
				acc = accuracy_score(y_val, y_pred)
			
				if acc > best_acc:
					best_acc = acc
					best_instance = instance
				
		return pareto_indicies, x_train[best_instance], y_train[best_instance]
	  
class BiasedBinarySampling(Sampling):
	def __init__(self, labels, major_prob, minor_prob):
		
		self.labels = labels
		counts = pd.DataFrame(labels).value_counts()
		if counts[0] > counts[1]:
			self.c0_thresh = major_prob
			self.c1_thresh = minor_prob
		else:
			self.c0_thresh = minor_prob
			self.c1_thresh = major_prob

		super().__init__()

	def _do(self, problem, n_samples, **kwargs):

		rands = np.random.random((n_samples, problem.n_var))
		init_pops = np.zeros((n_samples, problem.n_var), dtype=bool)
		for idx, label in enumerate(self.labels):
			if label == 0:
				init_pops[:, idx] = (rands[:, idx] < self.c0_thresh).astype(bool)
			if label == 1:
				init_pops[:, idx] = (rands[:, idx] < self.c1_thresh).astype(bool)


		return init_pops
	
def prepare_splits(x, y):
	train_split = StratifiedShuffleSplit(
		n_splits=31, 
		test_size=0.5
	)
	splits = []
	for train_idx, temp_idx in train_split.split(x, y):
		test_split = StratifiedShuffleSplit(
			n_splits=1, 
			test_size=0.5
		)
		test_idx, validation_idx = next(test_split.split(x[temp_idx], y[temp_idx]))

		validation_idx = temp_idx[validation_idx]
		test_idx = temp_idx[test_idx]
		
		splits.append((train_idx, validation_idx, test_idx))
	return splits

def create_preprocessor_pipeline(variables):
	
	type_mappings = {}
	for variable_idx, variable_name in enumerate(variables['name']):
		variable_type = variables['type'][variable_idx]
		if variable_type not in type_mappings:
			type_mappings[variable_type] = []

		if variables['role'][variable_idx] == 'Feature':
			type_mappings[variable_type].append(variable_name)

	categorical_transformer = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='most_frequent')),
		('onehot', OneHotEncoder(handle_unknown='ignore'))
	])
	numerical_transformer = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='mean')),
		('scaler', StandardScaler())
	])

	numerical_features = []
	if 'Continuous' in type_mappings:
		for feature in type_mappings['Continuous']:
			numerical_features.append(feature)
	if 'Integer' in type_mappings:
		for feature in type_mappings['Integer']:
			numerical_features.append(feature)
			
	transformer_steps = []
	if numerical_features != []:
		transformer_steps.append(
			('num', numerical_transformer, numerical_features)
		)
	if 'Categorical' in type_mappings:
		transformer_steps.append(
			('cat', categorical_transformer, type_mappings['Categorical'])
		)
	preprocessor = ColumnTransformer(
		transformers=transformer_steps
	)
	pipeline = Pipeline(steps=[
		('preprocessor', preprocessor)
	])
	
	return pipeline

def over_sample(x, y):
	counts = pd.DataFrame(y).value_counts()
	minority_class_label = counts.index[np.argmin(counts)][0]
	minority_class_indicies = np.where(y == minority_class_label)[0]
	# y = y.reshape(-1, 1)
	over_sampled_x = np.concatenate((x, x[minority_class_indicies]), axis=0)
	over_sampled_y = np.concatenate((y, y[minority_class_indicies]), axis=0)
	# over_sampled_y = over_sampled_y.reshape(-1)    
	return over_sampled_x, over_sampled_y

def overSample_randPop_execute(package):
	x, y, train_idx, validation_idx, test_idx, objectives, run_name = package

	if os.path.exists(os.path.join("results", f"{run_name}.pickle")):
		return run_name
	
	x_train, y_train = x[train_idx], y[train_idx]
	x_validation, y_validation = x[validation_idx], y[validation_idx]
	x_train, y_train = over_sample(
		x_train, 
		y_train
	)
	problem = GenericOptimizer(
		x_train, 
		y_train, 
		x_validation, 
		y_validation,
		objectives,
		"sequential"
	)	# BiasedBinarySampling(y_train, 0.5, 0.7)
	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size, 
		sampling=BinaryRandomSampling(), 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True,
	)
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', GenericOptimizer.population_size),
		save_history=False
	)
	package = {
		"name": run_name,
		"train": train_idx,
		"validation": validation_idx,
		"test": test_idx,
		"result": result
	}
	return package

def regularSample_randPop_execute(package):
	x, y, train_idx, validation_idx, test_idx, objectives, run_name = package

	if os.path.exists(os.path.join("results", f"{run_name}.pickle")):
		return run_name

	x_train, y_train = x[train_idx], y[train_idx]
	x_validation, y_validation = x[validation_idx], y[validation_idx]
	problem = GenericOptimizer(
		x_train, 
		y_train, 
		x_validation, 
		y_validation,
		objectives,
		"sequential"
	)	# BiasedBinarySampling(y_train, 0.5, 0.7)
	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size, 
		sampling=BinaryRandomSampling(), 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True,
	)
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', GenericOptimizer.population_size),
		save_history=False
	)
	package = {
		"name": run_name,
		"train": train_idx,
		"validation": validation_idx,
		"test": test_idx,
		"result": result
	}
	return package

def overSample_biasPop_execute(package):
	x, y, train_idx, validation_idx, test_idx, objectives, run_name = package

	if os.path.exists(os.path.join("results", f"{run_name}.pickle")):
		return run_name

	x_train, y_train = x[train_idx], y[train_idx]
	x_validation, y_validation = x[validation_idx], y[validation_idx]
	x_train, y_train = over_sample(
		x_train, 
		y_train
	)
	problem = GenericOptimizer(
		x_train, 
		y_train, 
		x_validation, 
		y_validation,
		objectives,
		"sequential"
	)
	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size, 
		sampling=BiasedBinarySampling(y_train, 0.5, 0.7), 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True,
	)
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', GenericOptimizer.population_size),
		save_history=False
	)
	package = {
		"name": run_name,
		"train": train_idx,
		"validation": validation_idx,
		"test": test_idx,
		"result": result
	}
	return package

def regularSample_biasPop_execute(package):
	x, y, train_idx, validation_idx, test_idx, objectives, run_name = package

	if os.path.exists(os.path.join("results", f"{run_name}.pickle")):
		return run_name
	
	x_train, y_train = x[train_idx], y[train_idx]
	x_validation, y_validation = x[validation_idx], y[validation_idx]
	problem = GenericOptimizer(
		x_train, 
		y_train, 
		x_validation, 
		y_validation,
		objectives,
		"sequential"
	)	# BiasedBinarySampling(y_train, 0.5, 0.7)
	algorithm = NSGA2(
		pop_size=GenericOptimizer.population_size, 
		sampling=BiasedBinarySampling(y_train, 0.5, 0.7), 
		crossover=HUX(), 
		mutation=BitflipMutation(), 
		eliminate_duplicates=True,
	)
	result = minimize(
		problem, 
		algorithm, 
		('n_gen', GenericOptimizer.population_size),
		save_history=False
	)
	package = {
		"name": run_name,
		"train": train_idx,
		"validation": validation_idx,
		"test": test_idx,
		"result": result
	}
	return package

objectives_sets = evaluation_schemes = [
	[GenericOptimizer.calculate_overall_error, GenericOptimizer.calculate_num_examples],
	[GenericOptimizer.calculate_overall_error, GenericOptimizer.calculate_overall_inverse_f1, GenericOptimizer.calculate_num_examples],
	[GenericOptimizer.calculate_class0_error, GenericOptimizer.calculate_class1_error],
]

In [21]:
for dataset, name in datasets:
	
	raw_X, y = dataset.data.features, dataset.data.targets
	pipeline = create_preprocessor_pipeline(dataset.variables)
	pipeline.fit(raw_X, y)
	X = pipeline.transform(raw_X)
	label_encoder = LabelEncoder()
	y = label_encoder.fit_transform(y)
	
	packages = []
	for c, (train_idx, validation_idx, test_idx) in enumerate(prepare_splits(X, y)):
		for objectives in objectives_sets:
			objectives_names = [re.search(r'\.([a-zA-Z_][a-zA-Z0-9_]*)\s', str(objective_name)).group(1) for objective_name in objectives]
			objectives_names = '_'.join(objectives_names)			
			packages.append((X, y, train_idx, validation_idx, test_idx, objectives, f"{c}_{name} {objectives_names} overSample_randPop"))
			
	# for package in packages:
	# 	if os.path.exists(os.path.join("results", f"{package[-1]}.pickle")):
	# 		continue
	# 	print(f"Executing {package[-1]}")
	# 	overSample_randPop_execute(package)

	results = Parallel(n_jobs=-1)(delayed(overSample_randPop_execute)(package) for package in packages)
	for result in results:
		if isinstance(result, str):
			print(f"Passed on: {result}")
			continue
		
		with open(os.path.join("results", f"{result['name']}.pickle"), 'wb') as fh:
			pickle.dump(result, fh)	

  y = column_or_1d(y, warn=True)


Passed on: 0_ionosphere calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_ionosphere calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_ionosphere calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_ionosphere calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_ionosphere calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_ionosphere calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_ionosphere calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_ion

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_iris0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_iris0 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_iris0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_iris0 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_iris0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_iris0 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_iris0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_iris0 calculate_overall_error_calculate_overall_inver

  y = column_or_1d(y, warn=True)


Passed on: 0_glass1 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_glass1 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_glass1 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_glass1 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_glass1 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_glass1 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_glass1 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_glass1 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_glass1 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_glass1 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_glass1 calculate_overall_error_calculate_ov

  y = column_or_1d(y, warn=True)


Passed on: 0_australia calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_australia calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_australia calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_australia calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_australia calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_australia calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_australia calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_australia cal

  y = column_or_1d(y, warn=True)


Passed on: 0_thoracic calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_thoracic calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_thoracic calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_thoracic calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_thoracic calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_thoracic calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_thoracic calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_thoracic calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_thoracic calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_thoracic calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_thoracic calculate_over

  y = column_or_1d(y, warn=True)


Passed on: 0_segment0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_segment0 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_segment0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_segment0 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_segment0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_segment0 calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_segment0 calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_segment0 calculate_over

  y = column_or_1d(y, warn=True)


Passed on: 0_wine calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_wine calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_wine calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_wine calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_wine calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_wine calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_wine calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_wine calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_wine calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_wine calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_wine calculate_overall_error_calculate_overall_inverse_f1_calcu

  y = column_or_1d(y, warn=True)


Passed on: 0_ILPD calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_ILPD calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_ILPD calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_ILPD calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_ILPD calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_ILPD calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_ILPD calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_ILPD calculate_overall_error_calculate_overall_inverse_f1_calcu

  y = column_or_1d(y, warn=True)


Passed on: 0_heart_disease calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_heart_disease calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_heart_disease calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_heart_disease calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_heart_disease calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_heart_disease calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_heart_disease calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_heart_disease calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_heart_disease calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_heart_disease calculate_overall_error_calculate_num_examples overS

  y = column_or_1d(y, warn=True)


Passed on: 0_wisconsin calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 0_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 0_wisconsin calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 1_wisconsin calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 1_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 1_wisconsin calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 2_wisconsin calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 2_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_randPop
Passed on: 2_wisconsin calculate_class0_error_calculate_class1_error overSample_randPop
Passed on: 3_wisconsin calculate_overall_error_calculate_num_examples overSample_randPop
Passed on: 3_wisconsin cal

  y = column_or_1d(y, warn=True)


Passed on: 0_spambase calculate_overall_error_calculate_num_examples overSample_randPop


  y = column_or_1d(y, warn=True)


Passed on: 0_mammographic calculate_overall_error_calculate_num_examples overSample_randPop


  y = column_or_1d(y, warn=True)


Passed on: 0_yeast1 calculate_overall_error_calculate_num_examples overSample_randPop


  y = column_or_1d(y, warn=True)


Passed on: 0_coimbra calculate_overall_error_calculate_num_examples overSample_randPop


  y = column_or_1d(y, warn=True)


Passed on: 0_fertility calculate_overall_error_calculate_num_examples overSample_randPop


In [23]:
for dataset, name in datasets:
	
	raw_X, y = dataset.data.features, dataset.data.targets
	pipeline = create_preprocessor_pipeline(dataset.variables)
	pipeline.fit(raw_X, y)
	X = pipeline.transform(raw_X)
	label_encoder = LabelEncoder()
	y = label_encoder.fit_transform(y)
	
	packages = []
	for c, (train_idx, validation_idx, test_idx) in enumerate(prepare_splits(X, y)):
		for objectives in objectives_sets:
			objectives_names = [re.search(r'\.([a-zA-Z_][a-zA-Z0-9_]*)\s', str(objective_name)).group(1) for objective_name in objectives]
			objectives_names = '_'.join(objectives_names)			
			packages.append((X, y, train_idx, validation_idx, test_idx, objectives, f"{c}_{name} {objectives_names} regSample_randPop"))

	# for package in packages:
	# 	if os.path.exists(os.path.join("results", f"{package[-1]}.pickle")):
	# 		continue
	# 	print(f"Executing {package[-1]}")
	# 	regularSample_randPop_execute(package)

	results = Parallel(n_jobs=-1)(delayed(regularSample_randPop_execute)(package) for package in packages)
	for result in results:
		if isinstance(result, str):
			print(f"Passed on: {result}")
			continue

		with open(os.path.join("results", f"{result['name']}.pickle"), 'wb') as fh:
			pickle.dump(result, fh)	

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_ionosphere calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 0_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 0_ionosphere calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 1_ionosphere calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 1_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 1_ionosphere calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 2_ionosphere calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 2_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 2_ionosphere calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 3_ionosphere calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 3_ionosphere ca

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_iris0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 0_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 0_iris0 calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 1_iris0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 1_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 1_iris0 calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 2_iris0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 2_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 2_iris0 calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 3_iris0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 3_iris0 calculate_overall_error_calculate_overall_inverse_f1_calc

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_australia calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 0_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 0_australia calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 1_australia calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 1_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 1_australia calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 2_australia calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 2_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 2_australia calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 3_australia calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 3_australia calculate_ove

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_segment0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 0_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 0_segment0 calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 1_segment0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 1_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 1_segment0 calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 2_segment0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 2_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 2_segment0 calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 3_segment0 calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 3_segment0 calculate_overall_error_

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_ILPD calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 0_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 0_ILPD calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 1_ILPD calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 1_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 1_ILPD calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 2_ILPD calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 2_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 2_ILPD calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 3_ILPD calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 3_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_e

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_wisconsin calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 0_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 0_wisconsin calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 1_wisconsin calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 1_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 1_wisconsin calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 2_wisconsin calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 2_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_randPop
Passed on: 2_wisconsin calculate_class0_error_calculate_class1_error regSample_randPop
Passed on: 3_wisconsin calculate_overall_error_calculate_num_examples regSample_randPop
Passed on: 3_wisconsin calculate_ove

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [25]:
for dataset, name in datasets:
	
	raw_X, y = dataset.data.features, dataset.data.targets
	pipeline = create_preprocessor_pipeline(dataset.variables)
	pipeline.fit(raw_X, y)
	X = pipeline.transform(raw_X)
	label_encoder = LabelEncoder()
	y = label_encoder.fit_transform(y)
	
	packages = []
	for c, (train_idx, validation_idx, test_idx) in enumerate(prepare_splits(X, y)):
		for objectives in objectives_sets:
			objectives_names = [re.search(r'\.([a-zA-Z_][a-zA-Z0-9_]*)\s', str(objective_name)).group(1) for objective_name in objectives]
			objectives_names = '_'.join(objectives_names)			
			packages.append((X, y, train_idx, validation_idx, test_idx, objectives, f"{c}_{name} {objectives_names} overSample_biasPop"))

	# for package in packages:
	# 	if os.path.exists(os.path.join("results", f"{package[-1]}.pickle")):
	# 		continue
	# 	print(f"Executing {package[-1]}")
		# overSample_biasPop_execute(package)

	results = Parallel(n_jobs=-1)(delayed(overSample_biasPop_execute)(package) for package in packages)
	for result in results:
		if isinstance(result, str):
			print(f"Passed on: {result}")
			continue

		with open(os.path.join("results", f"{result['name']}.pickle"), 'wb') as fh:
			pickle.dump(result, fh)	

  y = column_or_1d(y, warn=True)


Passed on: 0_ionosphere calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_ionosphere calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_ionosphere calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_ionosphere calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_ionosphere calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_ionosphere calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_ionosphere calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 3_ion

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_iris0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_iris0 calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_iris0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_iris0 calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_iris0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_iris0 calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_iris0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 3_iris0 calculate_overall_error_calculate_overall_inver

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_australia calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_australia calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_australia calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_australia calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_australia calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_australia calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_australia calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 3_australia cal

  y = column_or_1d(y, warn=True)


Passed on: 0_thoracic calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_thoracic calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_thoracic calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_thoracic calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_thoracic calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_thoracic calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_thoracic calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_thoracic calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_thoracic calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_thoracic calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 3_thoracic calculate_over

  y = column_or_1d(y, warn=True)


Passed on: 0_segment0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_segment0 calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_segment0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_segment0 calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_segment0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_segment0 calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_segment0 calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 3_segment0 calculate_over

  y = column_or_1d(y, warn=True)


Passed on: 0_wine calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_wine calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_wine calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_wine calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_wine calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_wine calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_wine calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_wine calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_wine calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_wine calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 3_wine calculate_overall_error_calculate_overall_inverse_f1_calcu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_heart_disease calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_heart_disease calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_heart_disease calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_heart_disease calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_heart_disease calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_heart_disease calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_heart_disease calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_heart_disease calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_heart_disease calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_heart_disease calculate_overall_error_calculate_num_examples overS

  y = column_or_1d(y, warn=True)


Passed on: 0_wisconsin calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 0_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 0_wisconsin calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 1_wisconsin calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 1_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 1_wisconsin calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 2_wisconsin calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 2_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples overSample_biasPop
Passed on: 2_wisconsin calculate_class0_error_calculate_class1_error overSample_biasPop
Passed on: 3_wisconsin calculate_overall_error_calculate_num_examples overSample_biasPop
Passed on: 3_wisconsin cal

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [26]:
for dataset, name in datasets:
	
	raw_X, y = dataset.data.features, dataset.data.targets
	pipeline = create_preprocessor_pipeline(dataset.variables)
	pipeline.fit(raw_X, y)
	X = pipeline.transform(raw_X)
	label_encoder = LabelEncoder()
	y = label_encoder.fit_transform(y)
	
	packages = []
	for c, (train_idx, validation_idx, test_idx) in enumerate(prepare_splits(X, y)):
		for objectives in objectives_sets:
			objectives_names = [re.search(r'\.([a-zA-Z_][a-zA-Z0-9_]*)\s', str(objective_name)).group(1) for objective_name in objectives]
			objectives_names = '_'.join(objectives_names)			
			packages.append((X, y, train_idx, validation_idx, test_idx, objectives, f"{c}_{name} {objectives_names} regSample_biasPop"))

	# for package in packages:
	# 	if os.path.exists(os.path.join("results", f"{package[-1]}.pickle")):
	# 		continue
	# 	print(f"Executing {package[-1]}")
	# 	overSample_biasPop_execute(package)

	results = Parallel(n_jobs=-1)(delayed(regularSample_biasPop_execute)(package) for package in packages)
	for result in results:
		if isinstance(result, str):
			print(f"Passed on: {result}")
			continue

		with open(os.path.join("results", f"{result['name']}.pickle"), 'wb') as fh:
			pickle.dump(result, fh)	


  y = column_or_1d(y, warn=True)


Passed on: 0_ionosphere calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 0_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 0_ionosphere calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 1_ionosphere calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 1_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 1_ionosphere calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 2_ionosphere calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 2_ionosphere calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 2_ionosphere calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 3_ionosphere calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 3_ionosphere ca

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_iris0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 0_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 0_iris0 calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 1_iris0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 1_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 1_iris0 calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 2_iris0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 2_iris0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 2_iris0 calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 3_iris0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 3_iris0 calculate_overall_error_calculate_overall_inverse_f1_calc

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_australia calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 0_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 0_australia calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 1_australia calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 1_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 1_australia calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 2_australia calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 2_australia calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 2_australia calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 3_australia calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 3_australia calculate_ove

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_segment0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 0_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 0_segment0 calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 1_segment0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 1_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 1_segment0 calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 2_segment0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 2_segment0 calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 2_segment0 calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 3_segment0 calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 3_segment0 calculate_overall_error_

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_ILPD calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 0_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 0_ILPD calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 1_ILPD calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 1_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 1_ILPD calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 2_ILPD calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 2_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 2_ILPD calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 3_ILPD calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 3_ILPD calculate_overall_error_calculate_overall_inverse_f1_calculate_num_e

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Passed on: 0_wisconsin calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 0_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 0_wisconsin calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 1_wisconsin calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 1_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 1_wisconsin calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 2_wisconsin calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 2_wisconsin calculate_overall_error_calculate_overall_inverse_f1_calculate_num_examples regSample_biasPop
Passed on: 2_wisconsin calculate_class0_error_calculate_class1_error regSample_biasPop
Passed on: 3_wisconsin calculate_overall_error_calculate_num_examples regSample_biasPop
Passed on: 3_wisconsin calculate_ove

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# Calculations

In [4]:
def parallel_error(instance, x_train, y_train, x_compare, y_compare):
	x_filtered, y_filtered = x_train[instance], y_train[instance]
	if x_filtered.shape[0] < GenericOptimizer.n_neighbours: 
		error = 1
	else:
		knn = KNeighborsClassifier(n_neighbors=GenericOptimizer.n_neighbours)
		knn.fit(x_filtered, y_filtered)
		y_pred = knn.predict(x_compare)
		error = 1 - accuracy_score(y_compare, y_pred)
	return error

def calculate_metrics(x_train, y_train, x_validation, y_validation, x_test, y_test, result):
	baseline_validation_err = GenericOptimizer.calculate_overall_error(
		x_train, y_train,
		x_validation, y_validation,
		GenericOptimizer.n_neighbours
	)
	baseline_test_err = GenericOptimizer.calculate_overall_error(
		x_train, y_train,
		x_test, y_test,
		GenericOptimizer.n_neighbours
	)

	validation_F = Parallel(n_jobs=-1)(delayed(parallel_error)(instance, x_train, y_train, x_validation, y_validation) for instance in result.X)
	test_F = Parallel(n_jobs=-1)(delayed(parallel_error)(instance, x_train, y_train, x_test, y_test) for instance in result.X)
	ideal_validation = result.X[np.argmin(validation_F)]
	ideal_test = result.X[np.argmin(test_F)]
	
	validation_inclusions = np.sum(ideal_validation)
	test_inclusions = np.sum(ideal_test)

	# reduction_rate = 1 - (best_x_train.shape[0] / x_train.shape[0])
	# optimized_ir = GenericOptimizer.calculate_IR(best_y_train)
	optimized_validation_err = GenericOptimizer.calculate_overall_error(
		x_train[ideal_validation],
		y_train[ideal_validation],
		x_validation,
		y_validation,
		GenericOptimizer.n_neighbours
	)
	optimized_test_err = GenericOptimizer.calculate_overall_error(
		x_train[ideal_validation],
		y_train[ideal_validation],
		x_test,
		y_test,
		GenericOptimizer.n_neighbours
	)
	ideal_optimized_test_err = GenericOptimizer.calculate_overall_error(
		x_train[ideal_test],
		y_train[ideal_test],
		x_test,
		y_test,
		GenericOptimizer.n_neighbours
	)
	return validation_inclusions, test_inclusions, baseline_validation_err, baseline_test_err, optimized_validation_err, optimized_test_err, ideal_optimized_test_err


In [5]:
save_path = "results"

iteration_mappings = {}
for file in os.listdir("results"):
	execution_name = "_".join(file.replace(".pickle", "").split("_")[1:])
	if execution_name not in iteration_mappings:
		iteration_mappings[execution_name] = []
	iteration_mappings[execution_name].append(file)

In [17]:
rows = []

for execution_name in iteration_mappings:

	optimized_validation_acc = []
	optimized_test_acc = []
	optimized_ideal_test_acc = []

	baseline_test_acc = []
	baseline_validaion_acc = []

	validation_inclusions = []
	test_inclusions = []
	curr_dataset = execution_name.split(" ")[0].strip()
	for dataset, name in datasets:
		if name == curr_dataset:
			raw_X, y = dataset.data.features, dataset.data.targets
			pipeline = create_preprocessor_pipeline(dataset.variables)
			pipeline.fit(raw_X, y)
			X = pipeline.transform(raw_X)
			label_encoder = LabelEncoder()
			y = label_encoder.fit_transform(y)
			break
	
	for filename in iteration_mappings[execution_name]:
		with open(os.path.join(save_path, filename), 'rb') as fh:
			result_dict = pickle.load(fh)

		train_idx = result_dict['train']
		validation_idx = result_dict['validation']
		test_idx = result_dict['test']
		result = result_dict['result']
		run_name = result_dict['name']

		x_train, y_train = X[train_idx], y[train_idx]
		x_validation, y_validation = X[validation_idx], y[validation_idx]
		x_test, y_test = X[test_idx], y[test_idx]

		if "overSample" in run_name:
			x_train, y_train = over_sample(
				x_train, 
				y_train
			)
		
		num_validation, num_test, baseline_validation_err, baseline_test_err, optimized_validation_err, optimized_test_err, ideal_optimized_test_err = calculate_metrics(
			x_train, 
			y_train, 
			x_validation, 
			y_validation, 
			x_test, 
			y_test, 
			result
		)

		validation_inclusions.append(num_validation)
		test_inclusions.append(num_test)

		baseline_validaion_acc.append(1-baseline_validation_err)
		baseline_test_acc.append(1-baseline_test_err)
		optimized_validation_acc.append(1-optimized_validation_err)
		optimized_test_acc.append(1-optimized_test_err)
		optimized_ideal_test_acc.append(1-ideal_optimized_test_err)

	val_pval = ranksums(baseline_validaion_acc, optimized_validation_acc).pvalue
	test_pval = ranksums(baseline_test_acc, optimized_test_acc).pvalue
	ideal_test_pval = ranksums(baseline_test_acc, optimized_ideal_test_acc).pvalue
	print(X.shape[0], num_test)
	row = {
		"Dataset": curr_dataset,
		"Sampling": "over sampling" if "overSample" in execution_name else "regular sampling",
		"Population": "random population" if "randPop" in execution_name else "biased population",
		"Total Size": X.shape[0],
		"Optimized Validation Size": validation_inclusions,
		"Optimized Test Size": test_inclusions,
		"Validation Baseline Acc": baseline_validaion_acc,
		"Test Baseline Acc": baseline_test_acc,
		"Optimized Validation Acc": optimized_validation_acc,
		"Optimized Test Acc": optimized_test_acc,
		"Ideal Test Acc": optimized_ideal_test_acc,
		"Validation P-value": val_pval,
		"Test P-value": test_pval,
		"Ideal Test P-value": ideal_test_pval
	}
	rows.append(row)
		



  y = column_or_1d(y, warn=True)


690 267


  y = column_or_1d(y, warn=True)


690 269


  y = column_or_1d(y, warn=True)


690 198


  y = column_or_1d(y, warn=True)


690 172


  y = column_or_1d(y, warn=True)


690 68


  y = column_or_1d(y, warn=True)


690 51


  y = column_or_1d(y, warn=True)


690 46


  y = column_or_1d(y, warn=True)


690 37


  y = column_or_1d(y, warn=True)


690 111


  y = column_or_1d(y, warn=True)


690 117


  y = column_or_1d(y, warn=True)


690 52


  y = column_or_1d(y, warn=True)


690 48


  y = column_or_1d(y, warn=True)


116 45


  y = column_or_1d(y, warn=True)


116 38


  y = column_or_1d(y, warn=True)


116 31


  y = column_or_1d(y, warn=True)


116 31


  y = column_or_1d(y, warn=True)


116 6


  y = column_or_1d(y, warn=True)


116 13


  y = column_or_1d(y, warn=True)


116 7


  y = column_or_1d(y, warn=True)


116 6


  y = column_or_1d(y, warn=True)


116 14


  y = column_or_1d(y, warn=True)


116 14


  y = column_or_1d(y, warn=True)


116 8


  y = column_or_1d(y, warn=True)


116 7


  y = column_or_1d(y, warn=True)


100 32


  y = column_or_1d(y, warn=True)


100 28


  y = column_or_1d(y, warn=True)


100 28


  y = column_or_1d(y, warn=True)


100 27


  y = column_or_1d(y, warn=True)


100 7


  y = column_or_1d(y, warn=True)


100 5


  y = column_or_1d(y, warn=True)


100 5


  y = column_or_1d(y, warn=True)


100 5


  y = column_or_1d(y, warn=True)


100 5


  y = column_or_1d(y, warn=True)


100 5


  y = column_or_1d(y, warn=True)


100 5


  y = column_or_1d(y, warn=True)


100 5


  y = column_or_1d(y, warn=True)


214 52


  y = column_or_1d(y, warn=True)


214 60


  y = column_or_1d(y, warn=True)


214 53


  y = column_or_1d(y, warn=True)


214 61


  y = column_or_1d(y, warn=True)


214 20


  y = column_or_1d(y, warn=True)


214 26


  y = column_or_1d(y, warn=True)


214 14


  y = column_or_1d(y, warn=True)


214 35


  y = column_or_1d(y, warn=True)


214 21


  y = column_or_1d(y, warn=True)


214 12


  y = column_or_1d(y, warn=True)


214 12


  y = column_or_1d(y, warn=True)


214 28


  y = column_or_1d(y, warn=True)


306 103


  y = column_or_1d(y, warn=True)


306 99


  y = column_or_1d(y, warn=True)


306 75


  y = column_or_1d(y, warn=True)


306 65


  y = column_or_1d(y, warn=True)


306 5


  y = column_or_1d(y, warn=True)


306 6


  y = column_or_1d(y, warn=True)


306 10


  y = column_or_1d(y, warn=True)


306 6


  y = column_or_1d(y, warn=True)


306 6


  y = column_or_1d(y, warn=True)


306 7


  y = column_or_1d(y, warn=True)


306 44


  y = column_or_1d(y, warn=True)


306 8


  y = column_or_1d(y, warn=True)


303 89


  y = column_or_1d(y, warn=True)


303 78


  y = column_or_1d(y, warn=True)


303 70


  y = column_or_1d(y, warn=True)


303 74


  y = column_or_1d(y, warn=True)


303 6


  y = column_or_1d(y, warn=True)


303 8


  y = column_or_1d(y, warn=True)


303 26


  y = column_or_1d(y, warn=True)


303 19


  y = column_or_1d(y, warn=True)


303 17


  y = column_or_1d(y, warn=True)


303 8


  y = column_or_1d(y, warn=True)


303 40


  y = column_or_1d(y, warn=True)


303 30


  y = column_or_1d(y, warn=True)


583 212


  y = column_or_1d(y, warn=True)


583 195


  y = column_or_1d(y, warn=True)


583 143


  y = column_or_1d(y, warn=True)


583 147


  y = column_or_1d(y, warn=True)


583 59


  y = column_or_1d(y, warn=True)


583 60


  y = column_or_1d(y, warn=True)


583 16


  y = column_or_1d(y, warn=True)


583 26


  y = column_or_1d(y, warn=True)


583 99


  y = column_or_1d(y, warn=True)


583 77


  y = column_or_1d(y, warn=True)


583 51


  y = column_or_1d(y, warn=True)


583 8


  y = column_or_1d(y, warn=True)


351 141


  y = column_or_1d(y, warn=True)


351 117


  y = column_or_1d(y, warn=True)


351 95


  y = column_or_1d(y, warn=True)


351 86


  y = column_or_1d(y, warn=True)


351 35


  y = column_or_1d(y, warn=True)


351 31


  y = column_or_1d(y, warn=True)


351 23


  y = column_or_1d(y, warn=True)


351 11


  y = column_or_1d(y, warn=True)


351 13


  y = column_or_1d(y, warn=True)


351 23


  y = column_or_1d(y, warn=True)


351 10


  y = column_or_1d(y, warn=True)


351 11


  y = column_or_1d(y, warn=True)


150 52


  y = column_or_1d(y, warn=True)


150 55


  y = column_or_1d(y, warn=True)


150 34


  y = column_or_1d(y, warn=True)


150 36


  y = column_or_1d(y, warn=True)


150 10


  y = column_or_1d(y, warn=True)


150 8


  y = column_or_1d(y, warn=True)


150 10


  y = column_or_1d(y, warn=True)


150 10


  y = column_or_1d(y, warn=True)


150 9


  y = column_or_1d(y, warn=True)


150 9


  y = column_or_1d(y, warn=True)


150 9


  y = column_or_1d(y, warn=True)


150 9


  y = column_or_1d(y, warn=True)


961 418


  y = column_or_1d(y, warn=True)


961 334


  y = column_or_1d(y, warn=True)


961 283


  y = column_or_1d(y, warn=True)


961 240


  y = column_or_1d(y, warn=True)


961 176


  y = column_or_1d(y, warn=True)


961 112


  y = column_or_1d(y, warn=True)


961 95


  y = column_or_1d(y, warn=True)


961 76


  y = column_or_1d(y, warn=True)


961 193


  y = column_or_1d(y, warn=True)


961 104


  y = column_or_1d(y, warn=True)


961 95


  y = column_or_1d(y, warn=True)


961 49


  y = column_or_1d(y, warn=True)


210 42


  y = column_or_1d(y, warn=True)


210 61


  y = column_or_1d(y, warn=True)


210 48


  y = column_or_1d(y, warn=True)


210 55


  y = column_or_1d(y, warn=True)


210 12


  y = column_or_1d(y, warn=True)


210 27


  y = column_or_1d(y, warn=True)


210 13


  y = column_or_1d(y, warn=True)


210 23


  y = column_or_1d(y, warn=True)


210 23


  y = column_or_1d(y, warn=True)


210 35


  y = column_or_1d(y, warn=True)


210 21


  y = column_or_1d(y, warn=True)


210 32


  y = column_or_1d(y, warn=True)


4601 1847


  y = column_or_1d(y, warn=True)


4601 1594


  y = column_or_1d(y, warn=True)


4601 1325


  y = column_or_1d(y, warn=True)


4601 1132


  y = column_or_1d(y, warn=True)


4601 1362


  y = column_or_1d(y, warn=True)


4601 1077


  y = column_or_1d(y, warn=True)


4601 941


  y = column_or_1d(y, warn=True)


4601 766


  y = column_or_1d(y, warn=True)


4601 1342


  y = column_or_1d(y, warn=True)


4601 995


  y = column_or_1d(y, warn=True)


4601 901


  y = column_or_1d(y, warn=True)


4601 851


  y = column_or_1d(y, warn=True)


470 137


  y = column_or_1d(y, warn=True)


470 122


  y = column_or_1d(y, warn=True)


470 123


  y = column_or_1d(y, warn=True)


470 123


  y = column_or_1d(y, warn=True)


470 26


  y = column_or_1d(y, warn=True)


470 5


  y = column_or_1d(y, warn=True)


470 5


  y = column_or_1d(y, warn=True)


470 5


  y = column_or_1d(y, warn=True)


470 13


  y = column_or_1d(y, warn=True)


470 10


  y = column_or_1d(y, warn=True)


470 20


  y = column_or_1d(y, warn=True)


470 12


  y = column_or_1d(y, warn=True)


178 45


  y = column_or_1d(y, warn=True)


178 53


  y = column_or_1d(y, warn=True)


178 34


  y = column_or_1d(y, warn=True)


178 48


  y = column_or_1d(y, warn=True)


178 12


  y = column_or_1d(y, warn=True)


178 9


  y = column_or_1d(y, warn=True)


178 10


  y = column_or_1d(y, warn=True)


178 9


  y = column_or_1d(y, warn=True)


178 9


  y = column_or_1d(y, warn=True)


178 9


  y = column_or_1d(y, warn=True)


178 10


  y = column_or_1d(y, warn=True)


178 9


  y = column_or_1d(y, warn=True)


569 226


  y = column_or_1d(y, warn=True)


569 199


  y = column_or_1d(y, warn=True)


569 167


  y = column_or_1d(y, warn=True)


569 147


  y = column_or_1d(y, warn=True)


569 7


  y = column_or_1d(y, warn=True)


569 7


  y = column_or_1d(y, warn=True)


569 17


  y = column_or_1d(y, warn=True)


569 6


  y = column_or_1d(y, warn=True)


569 17


  y = column_or_1d(y, warn=True)


569 12


  y = column_or_1d(y, warn=True)


569 8


  y = column_or_1d(y, warn=True)


569 10


  y = column_or_1d(y, warn=True)


1484 158


  y = column_or_1d(y, warn=True)


1484 380


  y = column_or_1d(y, warn=True)


1484 155


  y = column_or_1d(y, warn=True)


1484 377


  y = column_or_1d(y, warn=True)


1484 36


  y = column_or_1d(y, warn=True)


1484 157


  y = column_or_1d(y, warn=True)


1484 25


  y = column_or_1d(y, warn=True)


1484 165


  y = column_or_1d(y, warn=True)


1484 19


  y = column_or_1d(y, warn=True)


1484 209


  y = column_or_1d(y, warn=True)


1484 49


  y = column_or_1d(y, warn=True)


1484 191


In [55]:

for dataset, name in datasets:
	
	raw_X, y = dataset.data.features, dataset.data.targets
	pipeline = create_preprocessor_pipeline(dataset.variables)
	pipeline.fit(raw_X, y)
	X = pipeline.transform(raw_X)
	label_encoder = LabelEncoder()
	y = label_encoder.fit_transform(y)

	print(name)
	print(pd.DataFrame(y).value_counts())
	print("\n\n")

ionosphere
0
1    225
0    126
Name: count, dtype: int64



haberman
0
0    225
1     81
Name: count, dtype: int64



iris0
0
0    50
1    50
2    50
Name: count, dtype: int64



glass1
0
1    76
0    70
5    29
2    17
3    13
4     9
Name: count, dtype: int64



australia
0
0    383
1    307
Name: count, dtype: int64



thoracic
0
0    400
1     70
Name: count, dtype: int64



segment0
0
0    30
1    30
2    30
3    30
4    30
5    30
6    30
Name: count, dtype: int64



wine
0
1    71
0    59
2    48
Name: count, dtype: int64



ILPD
0
0    416
1    167
Name: count, dtype: int64



heart_disease
0
0    164
1     55
2     36
3     35
4     13
Name: count, dtype: int64



wisconsin
0
0    357
1    212
Name: count, dtype: int64



spambase
0
0    2788
1    1813
Name: count, dtype: int64



mammographic
0
0    516
1    445
Name: count, dtype: int64



yeast1
0
0    463
7    429
6    244
5    163
4     51
3     44
2     35
9     30
8     20
1      5
Name: count, dtype: int64



coimbra
0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [1]:
obj_by_idx = []
for name in iteration_mappings:
	obj_by_idx.append(name.split(' ')[1])

records = []
for idx, title in enumerate(iteration_mappings):
	record = rows[idx]
	dataset = None

	for folder in os.listdir("Datasets"):
		if folder in title:
			dataset = folder
			break

	records.append({
		"Dataset": record['Dataset'],
		"Sampling": record['Sampling'],
		"Population": record['Population'],
		"Objectives": obj_by_idx[idx],
		"Mean Validation Set": np.mean(record['Optimized Validation Size']),
		"Total Size": record['Total Size'],
		"Mean Test Set": np.mean(record['Optimized Test Size']),
		"Baseline validation acc": np.mean(record['Validation Baseline Acc']),
		"Baseline test acc": np.mean(record['Test Baseline Acc']),
		"Optimized validation acc": np.mean(record['Optimized Validation Acc']),
		"Optimized test acc": np.mean(record['Optimized Test Acc']),
		"Optimized ideal test acc": np.mean(record['Ideal Test Acc']),
		"Validation PVal": record['Validation P-value'],
		"Test PVal": record['Test P-value'],
		"Ideal Test PVal": record['Ideal Test P-value'],
		"Validation Diff": np.mean(np.subtract(record['Optimized Validation Acc'], record['Validation Baseline Acc'])),
		"Test Diff": np.mean(np.subtract(record['Optimized Test Acc'], record['Test Baseline Acc'])),
		"Ideal Diff": np.mean(np.subtract(record['Ideal Test Acc'], record['Test Baseline Acc']))
	})
pd.DataFrame.from_records(records).to_excel("ALL_METRICS.xlsx", index=False)
metrics = pd.DataFrame.from_records(records)

NameError: name 'iteration_mappings' is not defined

In [6]:
metrics = pd.read_excel('ALL_METRICS.xlsx')

In [7]:
dataset_names = []
for _, name in datasets:
    dataset_names.append(name)
len(dataset_names)

16

In [15]:
keep_idx = np.where(metrics['Ideal Test PVal'] <= 0.05)
sig_df = metrics.iloc[keep_idx]

winners = []
for name in dataset_names:
	dataset_idx = np.where(sig_df['Dataset'] == name)
	dataset_df = sig_df.iloc[dataset_idx]
	bias_idx = np.where(dataset_df['Population'] == 'biased population')
	dataset_df = dataset_df.iloc[bias_idx]
	dataset_df = dataset_df.sort_values('Ideal Diff')

	reduction_rate = (dataset_df['Total Size'] - dataset_df['Mean Test Set']) / dataset_df['Total Size']
	dataset_df['Reduction Rate'] = reduction_rate
	winners.append(dataset_df.iloc[-1]	)


df = pd.DataFrame(winners).to_excel("FINAL_WINNERS.xlsx", index=False)
df
# print(df['Sampling'].value_counts(), "\n")
# print(df['Population'].value_counts(), '\n')
# print(df['Objectives'].value_counts(), '\n')