## Imports

In [1]:
from sklearn.impute import SimpleImputer
import numpy as np
import re
import datetime as dt

## Functions for preprocessing

In [2]:
def convert_to_numeric(vocabulary, line_arr, ignore=(), nanval=""):
	"""Convert the string tokens of one record into numbers.

	A token that consists only of numeric characters, optionally with "."
	in it, is parsed as float when it contains a "." and as int otherwise.
	Tokens with a sign or an exponent do not pass that test and are handled
	as categorical values. Every categorical token is replaced by an integer
	id from a per-column vocabulary that is extended on the fly.

	Args:
		vocabulary: dict mapping column index -> {token: id}. It is mutated
			in place, so passing the same dict for every record of a dataset
			keeps the ids consistent across records.
		line_arr: sequence of string tokens making up one record.
		ignore: column indices whose tokens are copied through unchanged.
		nanval: token that is registered with id -1 whenever a new column
			vocabulary is created; the empty string disables this.

	Returns:
		A new list with one entry per token: int, float, an integer id, or
		the untouched token for ignored columns.
	"""
	ret = []

	for i, attr in enumerate(line_arr):
		if i in ignore:
			ret.append(attr)
			continue

		if attr.replace(".", "").isnumeric():
			try:
				number = float(attr) if "." in attr else int(attr)
			except ValueError:
				# Looks numeric but is not parseable (e.g. "1.2.3"):
				# fall through and encode it as a categorical token.
				pass
			else:
				ret.append(number)
				continue

		if i not in vocabulary:
			vocabulary[i] = {} if nanval == "" else {nanval: -1}
		column_vocab = vocabulary[i]

		if attr not in column_vocab:
			# max() rather than "last inserted value" so a new id can never
			# collide with an existing one; an empty vocabulary starts at 0.
			column_vocab[attr] = max(column_vocab.values(), default=-1) + 1

		ret.append(column_vocab[attr])

	return ret

def one_to_n(data, columns):
	"""Replace integer category columns by one-to-n (one-hot) groups.

	For each listed column the width of its group is the largest id found in
	that column plus one. In every output row the column's single value is
	replaced by a group of that width holding a 1 at the position of the id
	and 0 elsewhere. All other values keep their relative order.

	Args:
		data: sequence of rows; each row is an indexable, sliceable sequence.
			Values in the encoded columns must be ints.
		columns: non-negative indices of the columns to encode. They may be
			given in any order; duplicates are ignored.

	Returns:
		A new list of new row lists; `data` itself is not modified. A negative
		id (such as the -1 that convert_to_numeric assigns to its `nanval`)
		is encoded as an all-zero group instead of wrapping around onto the
		last category.
	"""
	# The row is rebuilt left to right, so the columns have to be visited in
	# ascending order no matter how the caller listed them.
	cols = sorted(set(columns))

	# find number of elements per column
	widths = {col: 0 for col in cols}
	for line in data:
		for col in cols:
			widths[col] = max(line[col] + 1, widths[col])

	# convert columns
	ret = []
	for line in data:
		prev = 0
		arr = []
		for col in cols:
			conv = [0] * widths[col]
			if line[col] >= 0:
				conv[line[col]] = 1

			arr += list(line[prev:col]) + conv
			prev = col + 1
		arr += list(line[prev:])
		ret.append(arr)

	return ret

def convert_date_to_unix_time(data, columns, date_format="%Y-%m-%d"):
	"""Replace date strings by their offset from 1970-01-01 in milliseconds.

	Args:
		data: sequence of rows (indexable sequences).
		columns: indices of the columns holding date strings; any order.
		date_format: strptime format of the date strings.

	Returns:
		A new list of new row lists in which every listed column holds an int
		number of milliseconds since 1970-01-01 00:00 (parsed dates are naive,
		no time zone conversion is applied). All other values are copied
		unchanged; `data` itself is not modified.

	Raises:
		ValueError: if a value does not match `date_format`.
	"""
	# Same naive value that dt.datetime.utcfromtimestamp(0) returns, without
	# calling that function, which is deprecated since Python 3.12.
	epoch = dt.datetime(1970, 1, 1)

	ret = []
	for line in data:
		# Each date column maps to exactly one value, so the row can be
		# copied and patched in place; this is independent of column order.
		arr = list(line)
		for col in set(columns):
			parsed = dt.datetime.strptime(line[col], date_format)
			arr[col] = int((parsed - epoch).total_seconds() * 1000.0)
		ret.append(arr)

	return ret

def join_array(array, seperator=";"):
	"""Join the string forms of all elements of `array` into one string.

	Args:
		array: iterable of values; each one is converted with str().
		seperator: string placed between consecutive elements.

	Returns:
		The joined string, or "" when `array` is empty.
	"""
	# The separator argument is honoured here; it used to be ignored in
	# favour of a hard-coded ";".
	return seperator.join(str(item) for item in array)

## Solar flares

For this dataset, the values were converted to numeric form and a one-to-n encoding was applied.

In [3]:
def convert_solar_flairs():
	"""Encode the solar flare dataset and write three result files.

	The first line of flare.data is skipped. Every remaining line is split on
	single spaces and passed through convert_to_numeric, after which columns
	0, 1 and 2 are one-to-n encoded. Three files are written, one line per
	record with values joined by join_array: the full record, the record
	without its last three values, and the last three values alone.
	"""
	token_ids = {}

	# read the records, dropping the first line of the file
	with open("../datasets/solarflares/flare.data", "r") as src:
		next(src, None)
		rows = [
			convert_to_numeric(token_ids, raw.strip().split(" "), nanval="")
			for raw in src
		]

	rows = one_to_n(rows, [0,1,2])

	# complete records
	with open("../datasets/solarflares/flare_conv.data", "w") as dst:
		dst.writelines(join_array(row) + "\n" for row in rows)

	# inputs: everything except the trailing three values
	with open("../datasets/solarflares/flare_input.data", "w") as dst:
		dst.writelines(join_array(row[0:-3]) + "\n" for row in rows)

	# classes: only the trailing three values
	with open("../datasets/solarflares/flare_classes.data", "w") as dst:
		dst.writelines(join_array(row[-3:]) + "\n" for row in rows)

## Wine

Since the wine dataset consists only of numeric values, no preprocessing was needed apart from splitting it into an input file and an output (class) file.

In [4]:
def convert_wine():
	"""Split the red and the white wine CSV files into result files.

	For each of "red" and "white" the first line of the CSV is skipped and
	every remaining line is split on ";" and passed through
	convert_to_numeric. Three files are written per variant, one line per
	record: the full record, the record without its last value, and the last
	value alone.
	"""
	for colour in ["red", "white"]:
		token_ids = {}

		# read the records, dropping the first line of the file
		with open("../datasets/wine/winequality-" + colour + ".csv", "r") as src:
			next(src, None)
			rows = [
				convert_to_numeric(token_ids, raw.strip().split(";"), nanval="")
				for raw in src
			]

		# complete records
		with open("../datasets/wine/wine_" + colour + "_conv.data", "w") as dst:
			dst.writelines(join_array(row) + "\n" for row in rows)

		# inputs: everything except the last value
		with open("../datasets/wine/wine_" + colour + "_input.data", "w") as dst:
			dst.writelines(join_array(row[0:-1]) + "\n" for row in rows)

		# classes: only the last value
		with open("../datasets/wine/wine_" + colour + "_classes.data", "w") as dst:
			dst.writelines(str(row[-1]) + "\n" for row in rows)

## Covid

The covid dataset consists of numeric, date and categorical values. The categorical values were one-to-n encoded, the date values were converted to Unix time (in milliseconds), and the numerical values were normalized using the population of the respective country.

In [5]:
def convert_covid():
	"""Encode the covid dataset and write three result files.

	The first line of the CSV is skipped. In every remaining line ", " is
	replaced by " " before splitting on ",", and the first field is dropped.
	The rest goes through convert_to_numeric with column 2 left untouched;
	column 2 is then converted by convert_date_to_unix_time. Columns 3 to 6
	are divided by column 7, and columns 0 and 1 are one-to-n encoded.
	Three files are written, one line per record: the full record, the
	record without its last value, and the last value alone.
	"""
	token_ids = {}

	# read the records, dropping the first line of the file
	with open("../datasets/covid/covid-vaccination-vs-death_ratio.csv", "r") as src:
		next(src, None)
		rows = [
			convert_to_numeric(
				token_ids,
				raw.strip().replace(", ", " ").split(",")[1:],
				ignore=[2],
				nanval="",
			)
			for raw in src
		]

	rows = convert_date_to_unix_time(rows, [2])

	# scale columns 3..6 by column 7
	# NOTE(review): column 7 is presumably the population - confirm against the CSV header
	for row in rows:
		divisor = row[7]
		for col in (3, 4, 5, 6):
			row[col] = row[col] / divisor

	rows = one_to_n(rows, [0,1])

	# complete records
	with open("../datasets/covid/covid-vaccination-vs-death_ratio_conv.data", "w") as dst:
		dst.writelines(join_array(row) + "\n" for row in rows)

	# inputs: everything except the last value
	with open("../datasets/covid/covid-vaccination-vs-death_ratio_input.data", "w") as dst:
		dst.writelines(join_array(row[0:-1]) + "\n" for row in rows)

	# classes: only the last value
	with open("../datasets/covid/covid-vaccination-vs-death_ratio_classes.data", "w") as dst:
		dst.writelines(str(row[-1]) + "\n" for row in rows)

In [6]:
# Run every dataset conversion in turn and report each one once it has finished.
for run_conversion, done_message in (
	(convert_solar_flairs, "converted solar flares dataset"),
	(convert_wine, "converted wine dataset"),
	(convert_covid, "converted covid dataset"),
):
	run_conversion()
	print(done_message)

converted solar flares dataset
converted wine dataset
converted covid dataset
