In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
import pandas as pd
import numpy as np
import pickle
import os
import re

# Create data package

In [2]:
def load_KEEL_dataset(path):
	"""Parse a KEEL-format ``.dat`` file into a DataFrame and its attribute types.

	Every line is stripped and lower-cased before it is interpreted, so
	attribute names (``Class`` becomes ``class``) and all data values are
	returned in lower case.

	Args:
		path: Location of the KEEL file to read.

	Returns:
		A ``(df, attribute_types)`` tuple. ``df`` has one column per
		``@attribute`` declaration, in declaration order. ``attribute_types``
		maps each attribute name to its declared type token (e.g. ``'real'``),
		or to ``'categorical'`` for ``{...}`` enumerations.

	Raises:
		ValueError: If an ``@attribute`` line matches neither supported form.
			Skipping it silently would shift every column name.
	"""
	# `name type [range]` form; the range is optional.
	typed_attribute = re.compile(r'@attribute\s+([^\s{]+)\s+(\w+)(?:\s+\[.*?\])?')
	# `name {a, b, ...}` form; whitespace before the brace is optional so that
	# declarations such as `class{positive,negative}` are not lost.
	enum_attribute = re.compile(r'@attribute\s+([^\s{]+)\s*\{.*?\}')

	attributes = []
	attribute_types = {}
	data_lines = []
	in_data_section = False

	with open(path, 'r') as fh:
		for raw_line in fh:
			line = raw_line.strip().lower()
			if line.startswith('@attribute'):
				typed = typed_attribute.match(line)
				if typed:
					attr_name, attr_type = typed.groups()
				else:
					enum = enum_attribute.match(line)
					if enum is None:
						raise ValueError(
							f"Cannot parse attribute declaration: {line!r}"
						)
					attr_name, attr_type = enum.group(1), 'categorical'
				attributes.append(attr_name)
				attribute_types[attr_name] = attr_type
			elif line.startswith('@data'):
				in_data_section = True
			elif in_data_section and line and not line.startswith('@'):
				# Other header directives (@relation, @inputs, @outputs, ...)
				# carry nothing this loader needs and are ignored.
				data_lines.append(line)

	if not data_lines:
		# read_csv raises on an empty buffer; return an empty frame that
		# still exposes the declared columns.
		return pd.DataFrame(columns=attributes), attribute_types

	df = pd.read_csv(
		StringIO('\n'.join(data_lines)),
		header=None,
		names=attributes,
		# Values are separated by ", ", so drop the blank after each comma
		# instead of keeping it as part of string values.
		skipinitialspace=True,
		# Assumed missing-value markers ('?' ARFF-style, '<null>' KEEL-style);
		# TODO confirm against the datasets in use.
		na_values=['?', '<null>'],
	)

	return df, attribute_types

def create_KEEL_preprocessor_pipeline(attributes):
	"""Build the (unfitted) preprocessing pipeline for one KEEL dataset.

	Args:
		attributes: Mapping of attribute name to declared type, as returned by
			``load_KEEL_dataset``. The type ``'categorical'`` selects
			most-frequent imputation plus one-hot encoding; every other type
			gets mean imputation plus standard scaling. The target attribute
			(``class``, matched case-insensitively) is never used as a feature.

	Returns:
		A ``Pipeline`` with a single ``'preprocessor'`` step holding a
		``ColumnTransformer`` over the numerical and categorical feature
		columns.
	"""
	# load_KEEL_dataset lower-cases attribute names, so the target has to be
	# excluded case-insensitively or it would be treated as a feature.
	feature_types = {
		name: kind
		for name, kind in attributes.items()
		if name.lower() != 'class'
	}
	categorical_features = [
		name for name, kind in feature_types.items() if kind == 'categorical'
	]
	numerical_features = [
		name for name, kind in feature_types.items() if kind != 'categorical'
	]

	categorical_transformer = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='most_frequent')),
		('onehot', OneHotEncoder(handle_unknown='ignore'))
	])
	numerical_transformer = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='mean')),
		('scaler', StandardScaler())
	])

	transformer_steps = []
	if numerical_features:
		transformer_steps.append(
			('num', numerical_transformer, numerical_features)
		)
	# Register the categorical branch whenever categorical columns exist.
	# The previous guard tested an always-empty dict, so these columns were
	# silently discarded.
	if categorical_features:
		transformer_steps.append(
			('cat', categorical_transformer, categorical_features)
		)

	preprocessor = ColumnTransformer(
		transformers=transformer_steps,
		# Always emit a dense array so callers get the same output type
		# whether or not one-hot columns are present.
		sparse_threshold=0
	)
	pipeline = Pipeline(steps=[
		('preprocessor', preprocessor)
	])

	return pipeline

def prepare_splits(x, y, n_splits=31, random_state=42):
	"""Generate stratified train / validation / test index triples.

	Each repetition holds out a stratified 50% of the samples, then divides
	that held-out half 50/50 (again stratified) into a test part and a
	validation part.

	Args:
		x: Feature array; must support indexing with an integer index array.
		y: Label array aligned with ``x``, used for stratification; must
			support indexing with an integer index array.
		n_splits: Number of independent repetitions to generate.
		random_state: Seed, ``RandomState`` instance or ``None``. The default
			is a fixed seed so that repeated runs produce the same splits;
			pass ``None`` for non-deterministic splits.

	Returns:
		A list of ``(train_idx, validation_idx, test_idx)`` tuples of index
		arrays into ``x`` / ``y``.
	"""
	# Function-scope import so this block does not depend on edits elsewhere.
	from sklearn.utils import check_random_state

	# One shared generator drives both splitters: the whole sequence is
	# reproducible, yet the inner split differs between repetitions.
	rng = check_random_state(random_state)
	outer_splitter = StratifiedShuffleSplit(
		n_splits=n_splits,
		test_size=0.5,
		random_state=rng
	)
	inner_splitter = StratifiedShuffleSplit(
		n_splits=1,
		test_size=0.5,
		random_state=rng
	)

	splits = []
	for train_idx, temp_idx in outer_splitter.split(x, y):
		# The inner split yields positions within the held-out half.
		test_pos, validation_pos = next(
			inner_splitter.split(x[temp_idx], y[temp_idx])
		)
		# Map those positions back to indices of the full dataset.
		splits.append((train_idx, temp_idx[validation_pos], temp_idx[test_pos]))
	return splits


In [3]:
# Build one entry per (split number, dataset) pair and pickle the whole package.
data_mapper = {}
# Sorted so the traversal (and therefore key order) does not depend on the
# order in which the filesystem lists files.
for dat_file in sorted(Path('datasets').rglob('*.dat')):

	# Presumably filters out the "__MACOSX" folders left by macOS zip archives.
	if "MACOSX" in str(dat_file):
		continue

	# File name without its final suffix; a plain str.replace would also
	# remove any ".dat" occurring in the middle of the name.
	name = dat_file.stem

	dataset, attributes = load_KEEL_dataset(dat_file)

	raw_X = dataset.drop(columns=['class'])

	# Encode the target once as integer labels.
	label_encoder = LabelEncoder()
	y = label_encoder.fit_transform(dataset['class'])

	pipeline = create_KEEL_preprocessor_pipeline(attributes)

	# NOTE(review): the preprocessing is fitted on every row before splitting,
	# so imputation/scaling statistics include validation and test rows —
	# confirm this is intended.
	X = pipeline.fit_transform(raw_X, y)

	for c, (train_idx, validation_idx, test_idx) in enumerate(prepare_splits(X, y)):
		# Key layout "<split>_<dataset>".
		data_mapper[f"{c}_{name}"] = {
			'Dataset': name,
			'Version': c,
			'x_train': X[train_idx],
			'y_train': y[train_idx],
			'x_validation': X[validation_idx],
			'y_validation': y[validation_idx],
			'x_test': X[test_idx],
			'y_test': y[test_idx],
		}

with open('data.pickle', 'wb') as fh:
	pickle.dump(data_mapper, fh)

# Inspect data package

In [2]:
# Read the pickled data package back from disk. Only unpickle files you
# produced yourself: loading a pickle can execute arbitrary code.
data_mapper = pickle.loads(Path('data.pickle').read_bytes())

In [4]:
# Group the data_mapper keys (laid out as "<split>_<dataset>") by dataset name
# and write the grouping to data_splits.csv.
split_by_dataset_name = {}
for data_key in data_mapper:
	# Text before the first underscore is the split number; the remainder,
	# which may itself contain underscores, is the dataset name.
	split_num, _sep, dataset_name = data_key.partition('_')
	split_by_dataset_name.setdefault(dataset_name, []).append(data_key)

splits_table = pd.DataFrame.from_records(split_by_dataset_name)
splits_table.to_csv('data_splits.csv', index=False)

In [5]:
# Load the per-dataset table of split keys from data_splits.csv.
splits = pd.read_csv(filepath_or_buffer='data_splits.csv')