We begin by cleaning the datasets: dropping null values, selecting our features, and encoding them as required. Further, to maintain a consistent structure across all datasets, we move the label (the target to be predicted) to the last column.

Prior to this, I manually cleaned the NAs out of the aus_rain dataset (replacing them with nulls) and stripped the double quotes (") from around each value.

In [69]:
# Start from an empty output directory. Done with the standard library rather
# than `rmdir /S /Q` + `mkdir` so the cell also runs outside Windows and does
# not report an error when the directory does not exist yet.
import os
import shutil

shutil.rmtree("datasets", ignore_errors=True)
os.makedirs("datasets", exist_ok=True)

The system cannot find the file specified.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Per-column treatment codes consumed by process_dataset.
F = {
	"STD": 0,     # standardise: (x - mean) / std
	"DROP": 1,    # remove the column
	"LABEL": 2,   # target column: label-encoded and moved to the end
	"ONEHOT": 3,  # expanded with pd.get_dummies
	"DUMMY": 4,   # expanded with pd.get_dummies (handled the same as ONEHOT)
	"RAW": 5,     # left untouched
}

# For every dataset: one treatment code per raw CSV column, in column order.
sets = {
	"anemia": [
		F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["RAW"], F["RAW"], F["STD"], F["ONEHOT"], F["DUMMY"], F["DUMMY"], F["ONEHOT"], F["ONEHOT"], F["DROP"], F["DUMMY"], F["STD"], F["LABEL"], F["DUMMY"]
	],
	"aus_rain": [
		F["DROP"], F["ONEHOT"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["DUMMY"], F["LABEL"]
	],
	"campusrecruitment": [
		F["DROP"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["DUMMY"], F["STD"], F["ONEHOT"], F["STD"], F["LABEL"], F["DROP"]
	],
	"employability": [F["DUMMY"]] * 5 + [F["LABEL"]],
	"fraud": [F["DROP"]] + [F["STD"]] * 29 + [F["LABEL"]],
	"loan": [
		F["DROP"], F["STD"], F["STD"], F["STD"], F["DROP"], F["STD"], F["STD"], F["RAW"], F["STD"], F["DUMMY"], F["DUMMY"], F["DUMMY"], F["DUMMY"], F["LABEL"]
	],
	"mobile_price": [
		F["STD"], F["DUMMY"], F["STD"], F["DUMMY"], F["STD"], F["DUMMY"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["DUMMY"], F["DUMMY"], F["DUMMY"], F["LABEL"]
	],
	"stress": [
		F["STD"], F["STD"], F["DUMMY"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["LABEL"]
	],
	"student_testprep": [
		F["DUMMY"], F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["LABEL"], F["STD"], F["STD"], F["STD"]
	],
	"titanic": [
		F["DROP"], F["LABEL"], F["ONEHOT"], F["DUMMY"], F["STD"], F["RAW"], F["RAW"], F["STD"], F["ONEHOT"]
	],
	"wine": [F["LABEL"]] + [F["STD"]] * 13,
	"drug": [
		F["STD"], F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["LABEL"]
	],
	"shipping": [
		F["DROP"], F["ONEHOT"], F["ONEHOT"], F["RAW"], F["RAW"], F["STD"], F["RAW"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["STD"], F["LABEL"]
	]
}

# Guard against miscounted mappings: each list must have exactly one code per
# raw column of its dataset.
expected_columns = {
	"anemia": 17,
	"aus_rain": 23,
	"drug": 6,
	"campusrecruitment": 15,
	"employability": 6,
	"fraud": 31,
	"loan": 14,
	"mobile_price": 21,
	"shipping": 12,
	"stress": 21,
	"student_testprep": 8,
	"titanic": 9,
	"wine": 14,
}
for name, n_cols in expected_columns.items():
	assert len(sets[name]) == n_cols, f"{name}: expected {n_cols} codes, got {len(sets[name])}"

assert len(sets) == 13

# Every dataset must mark exactly one column as its label.
for k in sets:
	print(k)
	assert sets[k].count(F["LABEL"]) == 1, f"{k}: expected exactly one LABEL column"


anemia
aus_rain
campusrecruitment
employability
fraud
loan
mobile_price
stress
student_testprep
titanic
wine
drug
shipping


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

def process_dataset(dataset):
	"""Clean one raw dataset and write the model-ready CSV.

	Reads ``datasets_src/{dataset}/{dataset}.csv``, applies the per-column
	treatment listed in ``sets[dataset]`` (one code from ``F`` per raw
	column, in column order) and writes the result to
	``datasets/{dataset}.csv`` with the encoded label as the last column.

	Steps, in order: remove ``DROP`` columns, drop rows containing NAs,
	standardise ``STD`` columns, expand ``ONEHOT``/``DUMMY`` columns with
	``pd.get_dummies``, integer-encode the ``LABEL`` column and move it to
	the end, drop any rows that picked up NAs along the way.

	Args:
		dataset: Key into ``sets``; also the source folder and file name.

	Returns:
		The processed ``pandas.DataFrame`` (the same data written to disk).

	Raises:
		ValueError: If no surviving column is marked ``LABEL``.
	"""
	print(f"Processing {dataset}")
	df = pd.read_csv(f"datasets_src/{dataset}/{dataset}.csv")
	codes = sets[dataset]

	print(f"Before dropping columns: {df.shape}")
	# Drop columns
	to_drop = []
	for i, col in enumerate(df.columns):
		if codes[i] == F["DROP"]:
			print(f"Dropping {i}: {col}")
			to_drop.append(col)
	df = df.drop(columns=to_drop)

	print(f"After dropping columns: {df.shape}")

	# Treatment codes of the surviving columns, aligned with df.columns.
	# (Not called `set`, which would shadow the builtin.)
	modes = [code for code in codes if code != F["DROP"]]

	# Drop NA
	print(f"Before dropping NAs: {df.shape}")
	df = df.dropna()
	print(f"After dropping NAs: {df.shape}")

	# Apply transformations
	for i, (col, mode) in enumerate(zip(df.columns, modes)):
		if mode == F["STD"]:
			print(f"Standardising {i}: {col}")
			centred = df[col] - df[col].mean()
			spread = df[col].std()
			# A constant column (or a single remaining row) has a zero/NaN
			# standard deviation. Dividing by it would turn the column into
			# NaNs and the final dropna() would then discard every row, so
			# such a column is only centred.
			if spread == 0 or pd.isna(spread):
				df[col] = centred
			else:
				df[col] = centred / spread
		elif mode == F["RAW"]:
			print(f"Leaving {i}: {col} raw")

	label_col = next(
		(col for col, mode in zip(df.columns, modes) if mode == F["LABEL"]),
		None,
	)
	if label_col is None:
		raise ValueError(f"No LABEL column configured for dataset {dataset!r}")

	# Replace each categorical column by its indicator columns, which are
	# appended on the right in the order the columns are visited.
	# NOTE(review): the indicator prefix is only the first five characters of
	# the column name, so two encoded columns sharing that prefix and a
	# category value would produce duplicate names - confirm this cannot
	# happen for the configured datasets.
	df_caten = df.copy(deep=True)
	for i, (col, mode) in enumerate(zip(df.columns, modes)):
		if mode in (F["ONEHOT"], F["DUMMY"]):
			print(f"Categorizing {i}: {col}")
			print(f"Before: {df_caten.shape}")
			encoded = pd.get_dummies(df[col], prefix=col[:5])
			df_caten = pd.concat([df_caten.drop(columns=[col]), encoded], axis=1)
			print(f"After: {df_caten.shape}")
			print(f"Categories: {encoded.columns} ({len(encoded.columns)})")

	df = df_caten

	# Encode label & move it down to the end
	print(f"Encoding label {label_col}")
	df[label_col] = LabelEncoder().fit_transform(df[label_col])
	df = df[[c for c in df if c != label_col] + [label_col]]

	print(f"Before dropping incidental NAs: {df.shape}")
	df = df.dropna()
	print(f"After dropping incidental NAs: {df.shape}")
	df.to_csv(f"datasets/{dataset}.csv", index=False)

	return df

In [184]:
# Run the preprocessing over every configured dataset, numbering the log
# output from 1.
for position, name in enumerate(sets, start=1):
	print(f"Set {position}: {name}")
	process_dataset(name)
	print()

Set 1: anemia
Processing anemia
Before dropping columns: (33924, 17)
Dropping 12: When child put to breast
After dropping columns: (33924, 16)
Before dropping NAs: (33924, 16)
After dropping NAs: (9546, 16)
Leaving 4: Births in last five years raw
Leaving 5: Age of respondent at 1st birth raw
Standardising 6: Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)
Standardising 13: Hemoglobin level adjusted for altitude (g/dl - 1 decimal)
Categorizing 0: Age in 5-year groups
Before: (9546, 16)
After: (9546, 22)
Categories: Index(['Age i_15-19', 'Age i_20-24', 'Age i_25-29', 'Age i_30-34',
       'Age i_35-39', 'Age i_40-44', 'Age i_45-49'],
      dtype='object') (7)
Categorizing 1: Type of place of residence
Before: (9546, 22)
After: (9546, 23)
Categories: Index(['Type _Rural', 'Type _Urban'], dtype='object') (2)
Categorizing 2: Highest educational level
Before: (9546, 23)
After: (9546, 26)
Categories: Index(['Highe_Higher', 'Highe_No education', 'Highe_Primary',
       '

### Get Meta-Features for each Dataset (as defined in $\verb|metafeatures.txt|$)

In [80]:
# One row per dataset, one column per meta-feature; every cell starts at 0.0
# and is overwritten by the cells further down.
meta_feats = pd.DataFrame(
	data=0.0,
	index=sets.keys(),
	columns=[
		"datapoints",
		"feats",
		"numeric",
		"avg_corr",
		"n_binary",
		"num_to_cat_ratio",
		"classes",
		"majority_fraction",
		"best_classifier",
	],
)

meta_feats

Unnamed: 0,datapoints,feats,numeric,avg_corr,n_binary,num_to_cat_ratio,classes,majority_fraction,best_classifier
anemia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aus_rain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
campusrecruitment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
employability,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fraud,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
loan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mobile_price,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
stress,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
student_testprep,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titanic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# load datasets, the first few cell blocks reload these, wasting time

def load(dataset):
	"""Read the processed CSV ``datasets/{dataset}.csv`` into a DataFrame."""
	return pd.read_csv(f"datasets/{dataset}.csv")

# Load every processed dataset once, keyed by dataset name.
datasets = {name: load(name) for name in sets}

In [179]:

for name, data in datasets.items():
	# The label is the last column of every processed dataset; everything
	# before it is a feature.
	X = data.iloc[:, :-1]
	y = data.iloc[:, -1]

	meta_feats.loc[name, "datapoints"] = len(data)
	meta_feats.loc[name, "feats"] = X.shape[1]

	# Every feature is expected to be either numeric or boolean.
	numeric = X.select_dtypes(include=np.number)
	num = numeric.shape[1]
	cat = X.select_dtypes(include=np.bool_).shape[1]

	meta_feats.loc[name, "numeric"] = num
	meta_feats.loc[name, "n_binary"] = cat
	assert num + cat == X.shape[1]

	# +1 in the denominator keeps the ratio finite when there are no boolean
	# columns.
	meta_feats.loc[name, "num_to_cat_ratio"] = num / (cat + 1)

	# Mean absolute pairwise correlation between the numeric features (the
	# sign of a linear correlation is irrelevant here). It is only defined
	# for two or more columns: with a single column np.corrcoef returns a
	# 0-d array, and with none there is nothing to correlate.
	if num > 1:
		C = np.corrcoef(numeric.to_numpy(), rowvar=False)
		# C is symmetric with a unit diagonal, so the strictly-lower triangle
		# holds each of the n(n-1)/2 unordered pairs exactly once.
		pairs = np.abs(C[np.tril_indices(num, k=-1)])
		avg = float(pairs.sum() / (num * (num - 1) / 2))
	else:
		avg = 0.0

	meta_feats.loc[name, "avg_corr"] = avg

	labels, counts = np.unique(y, return_counts=True)
	meta_feats.loc[name, "classes"] = len(labels)
	# Share of rows that belong to the most frequent class.
	meta_feats.loc[name, "majority_fraction"] = counts.max() / len(y)



In [180]:
meta_feats

Unnamed: 0,datapoints,feats,numeric,avg_corr,n_binary,num_to_cat_ratio,classes,majority_fraction,best_classifier
anemia,9546.0,40.0,4.0,0.081748,36.0,0.108108,4.0,0.386235,0.0
aus_rain,56420.0,124.0,16.0,0.286451,108.0,0.146789,2.0,0.779741,0.0
campusrecruitment,215.0,21.0,5.0,0.357938,16.0,0.294118,2.0,0.688372,0.0
employability,829.0,10.0,0.0,0.0,10.0,0.0,2.0,0.687575,0.0
fraud,568630.0,29.0,29.0,0.236221,0.0,29.0,2.0,0.5,0.0
loan,5000.0,15.0,7.0,0.145074,8.0,0.777778,2.0,0.706,0.0
mobile_price,2000.0,26.0,14.0,0.034483,12.0,1.076923,4.0,0.25,0.0
stress,1100.0,21.0,19.0,0.568837,2.0,6.333333,3.0,0.339091,0.0
student_testprep,1000.0,18.0,3.0,0.850244,15.0,0.1875,2.0,0.656,0.0
titanic,889.0,12.0,4.0,0.194706,8.0,0.444444,2.0,0.617548,0.0


### Preprocess this data

I normalise datapoints, feats, numeric, and n_binary to the [0, 1] range by min-max rescaling. I prefer normalisation over standardisation here because these features are not normally distributed (their distribution is irregular and unpredictable) and because they share a similar unit/scale (counts).

I leave classes raw, since the range is small.

In [183]:
from sklearn.preprocessing import minmax_scale

# Rescale the count-like meta-features to [0, 1]. They are selected by name;
# these are the columns at positions 0, 1, 2 and 4 of the frame.
count_cols = ["datapoints", "feats", "numeric", "n_binary"]
meta_feats.loc[:, count_cols] = minmax_scale(meta_feats.loc[:, count_cols])

In [184]:
meta_feats

Unnamed: 0,datapoints,feats,numeric,avg_corr,n_binary,num_to_cat_ratio,classes,majority_fraction,best_classifier
anemia,0.01648,0.269565,0.137931,0.081748,0.333333,0.108108,4.0,0.386235,0.0
aus_rain,0.098939,1.0,0.551724,0.286451,1.0,0.146789,2.0,0.779741,0.0
campusrecruitment,6.5e-05,0.104348,0.172414,0.357938,0.148148,0.294118,2.0,0.688372,0.0
employability,0.001145,0.008696,0.0,0.0,0.092593,0.0,2.0,0.687575,0.0
fraud,1.0,0.173913,1.0,0.236221,0.0,29.0,2.0,0.5,0.0
loan,0.008483,0.052174,0.241379,0.145074,0.074074,0.777778,2.0,0.706,0.0
mobile_price,0.003205,0.147826,0.482759,0.034483,0.111111,1.076923,4.0,0.25,0.0
stress,0.001622,0.104348,0.655172,0.568837,0.018519,6.333333,3.0,0.339091,0.0
student_testprep,0.001446,0.078261,0.103448,0.850244,0.138889,0.1875,2.0,0.656,0.0
titanic,0.001251,0.026087,0.137931,0.194706,0.074074,0.444444,2.0,0.617548,0.0


In [185]:
# Save the scaled meta-features. best_classifier still holds its initial 0.0
# here; it is filled in further down once models.csv is available.
meta_feats.to_csv("metaset.csv", index_label="dataset")

The best_classifier field is filled in after getting the model results, using the following encoding:

0 -> logi

1 -> knn

2 -> ffn

3 -> dt

4 -> rf

In [202]:
from os.path import exists

# Fail early with a pointer to the notebook that is expected to produce
# models.csv, rather than with pandas' generic missing-file error.
if not exists("models.csv"):
	raise FileNotFoundError("Please run the models notebook first")

# One row per model (indexed by the "model" column), one column per dataset.
models = pd.read_csv("models.csv", index_col="model")
models

Unnamed: 0_level_0,anemia,aus_rain,campusrecruitment,employability,fraud,loan,mobile_price,stress,student_testprep,titanic,wine,drug,shipping
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
logi,0.998428,0.844825,0.860465,0.916787,0.962645,0.7422,0.9735,0.881818,0.762,0.797556,0.988889,0.96,0.594951
knn,0.740413,0.819213,0.827907,0.933662,0.997031,0.7082,0.633,0.87,0.681,0.808786,0.960794,0.9,0.607861
ffn,0.996019,0.77781,0.860394,0.940865,0.999068,0.743601,0.93349,0.882712,0.749015,0.820051,0.977589,0.974898,0.592951
dt,1.0,0.817671,0.813953,0.937276,0.993899,0.745,0.8455,0.889091,0.68,0.814429,0.904762,0.985,0.61786
rf,0.981667,0.802942,0.809302,0.927638,0.999745,0.695,0.793,0.878182,0.663,0.78971,0.955397,0.98,0.620675


In [205]:
# For each dataset, store the row position of the top-scoring model in
# models.csv. With the rows ordered logi, knn, ffn, dt, rf, that position is
# the classifier code 0-4.
for col in models.columns:
	best = np.argmax(models[col])
	meta_feats.loc[col, "best_classifier"] = best

In [207]:
# Save again now that best_classifier has been filled in; this overwrites the
# metaset.csv written earlier.
meta_feats.to_csv("metaset.csv", index_label="dataset")

In [208]:
meta_feats

Unnamed: 0,datapoints,feats,numeric,avg_corr,n_binary,num_to_cat_ratio,classes,majority_fraction,best_classifier
anemia,0.01648,0.269565,0.137931,0.081748,0.333333,0.108108,4.0,0.386235,3.0
aus_rain,0.098939,1.0,0.551724,0.286451,1.0,0.146789,2.0,0.779741,0.0
campusrecruitment,6.5e-05,0.104348,0.172414,0.357938,0.148148,0.294118,2.0,0.688372,0.0
employability,0.001145,0.008696,0.0,0.0,0.092593,0.0,2.0,0.687575,2.0
fraud,1.0,0.173913,1.0,0.236221,0.0,29.0,2.0,0.5,4.0
loan,0.008483,0.052174,0.241379,0.145074,0.074074,0.777778,2.0,0.706,3.0
mobile_price,0.003205,0.147826,0.482759,0.034483,0.111111,1.076923,4.0,0.25,0.0
stress,0.001622,0.104348,0.655172,0.568837,0.018519,6.333333,3.0,0.339091,3.0
student_testprep,0.001446,0.078261,0.103448,0.850244,0.138889,0.1875,2.0,0.656,0.0
titanic,0.001251,0.026087,0.137931,0.194706,0.074074,0.444444,2.0,0.617548,2.0


A couple of notes: this is an extremely small dataset and, as mentioned in the proposal, I will most likely not be able to train a robust metaclassifier on it. Nonetheless, we now move on to attempting this in $\verb|meta.ipynb|$.

Also, kNN is not the best classifier for any dataset here. This is expected, since the main draw of kNN is not its ability to produce highly accurate models but its non-existent training cost. Ideally, I would have liked at least one example in the dataset where kNN is the best classifier, but I was not able to find such a set. It is fairly trivial to generate a set that kNN performs well on, but I chose not to pursue this, as I thought it would bias the metaclassifier even further.