We begin by cleaning the datasets by dropping null values, selecting our features, and encoding them as required. Further, to maintain a similar structure in all datasets, we move the label (predictor) to the last column.

Prior to this, I've manually cleaned out the NAs from the aus_rain dataset (replacing them with nulls) and unwrapped the quotes(") from around each value.

In [69]:
! rmdir /S /Q datasets
! mkdir datasets

The system cannot find the file specified.


In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [174]:
F = {
	"STD": 0,
	"DROP": 1,
	"LABEL": 2,
	"ONEHOT": 3,
	"DUMMY": 4,
	"RAW": 5,
}

sets = {
	"anemia": [
		F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["RAW"], F["RAW"], F["STD"], F["ONEHOT"], F["DUMMY"], F["DUMMY"], F["ONEHOT"], F["ONEHOT"], F["DROP"], F["DUMMY"], F["STD"], F["LABEL"], F["DUMMY"]
	],
	"aus_rain": [
		F["DROP"], F["ONEHOT"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["DUMMY"], F["LABEL"]
	],
	"campusrecruitment": [
		F["DROP"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["ONEHOT"], F["DUMMY"], F["STD"], F["ONEHOT"], F["STD"], F["LABEL"], F["DROP"]
	],
	"employability": [F["DUMMY"] for i in range(5)] + [F["LABEL"]],
	"fraud": [F["DROP"]] + [F["STD"] for i in range(28)] + [F["STD"], F["LABEL"]],
	"loan": [
		F["DROP"], F["STD"], F["STD"], F["STD"], F["DROP"], F["STD"], F["STD"], F["RAW"], F["STD"], F["DUMMY"], F["DUMMY"], F["DUMMY"], F["DUMMY"], F["LABEL"]
	],
	"mobile_price": [
		F["STD"], F["DUMMY"], F["STD"], F["DUMMY"], F["STD"], F["DUMMY"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["DUMMY"], F["DUMMY"], F["DUMMY"], F["LABEL"]
	],
	"stress": [
		F["STD"], F["STD"], F["DUMMY"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["STD"], F["LABEL"]
	],
	"student_testprep": [
		F["DUMMY"], F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["LABEL"], F["STD"], F["STD"], F["STD"]
	],
	"titanic": [
		F["DROP"], F["LABEL"], F["ONEHOT"], F["DUMMY"], F["STD"], F["RAW"], F["RAW"], F["STD"], F["ONEHOT"]
	],
	"wine": [F["LABEL"]] + [F["STD"] for i in range(13)],
	"drug": [
		F["STD"], F["ONEHOT"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["LABEL"]
	],
	"shipping": [
		F["DROP"], F["ONEHOT"], F["ONEHOT"], F["RAW"], F["RAW"], F["STD"], F["RAW"], F["ONEHOT"], F["ONEHOT"], F["STD"], F["STD"], F["LABEL"]
	]
}

assert(len(sets["anemia"]) == 17)
assert(len(sets["drug"]) == 6)
assert(len(sets["campusrecruitment"]) == 15)
assert(len(sets["employability"]) == 6)
assert(len(sets["fraud"]) == 31)
assert(len(sets["loan"]) == 14)
assert(len(sets["mobile_price"]) == 21)
assert(len(sets["shipping"]) == 12)
assert(len(sets["stress"]) == 21)
assert(len(sets["student_testprep"]) == 8)
assert(len(sets["titanic"]) == 9)
assert(len(sets["wine"]) == 14)

assert(len(sets) == 13)

for k in sets:
	print(k)
	c = 0
	for i in sets[k]:
		if i == F["LABEL"]:
			c += 1
	assert(c == 1)


anemia
aus_rain
campusrecruitment
employability
fraud
loan
mobile_price
stress
student_testprep
titanic
wine
drug
shipping


In [176]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

def process_dataset(dataset):
	print(f"Processing {dataset}")
	df = pd.read_csv(f"datasets_src/{dataset}/{dataset}.csv")

	print(f"Before dropping columns: {df.shape}")
	# Drop columns
	for i, col in enumerate(df.columns):
		if sets[dataset][i] == F["DROP"]:
			print(f"Dropping {i}: {col}")
			df = df.drop(columns=[col])
	
	print(f"After dropping columns: {df.shape}")

	set = [i for i in sets[dataset] if i != F["DROP"]]

	# Drop NA
	print(f"Before dropping NAs: {df.shape}")
	df = df.dropna()
	print(f"After dropping NAs: {df.shape}")

	# Apply transformations
	for i, col in enumerate(df.columns):
		mode = set[i]

		if mode == F["STD"]:
			print(f"Standardising {i}: {col}")
			df[col] = (df[col] - df[col].mean()) / df[col].std()
		elif mode == F["RAW"]:
			print(f"Leaving {i}: {col} raw")

	label_col = ""
	for i, col in enumerate(df.columns):
		if set[i] == F["LABEL"]:
			label_col = col
			break

	df_caten = df.copy(deep = True)
	for i, col in enumerate(df.columns):
		if set[i] == F["ONEHOT"] or set[i] == F["DUMMY"]:
			print(f"Categorizing {i}: {col}")
			print(f"Before: {df_caten.shape}")
			df_caten = df_caten.drop(columns=[col])
			encoded = pd.get_dummies(df[col], prefix=col[:5])
			df_caten = pd.concat([df_caten, encoded], axis = 1)
			print(f"After: {df_caten.shape}")
			print(f"Categories: {encoded.columns} ({len(encoded.columns)})")


	df = df_caten

	# Encode label & move it down to the end
	print(f"Encoding label {label_col}")
	le = LabelEncoder()
	df[label_col] = le.fit_transform(df[label_col])
	df = df[[c for c in df if c != label_col] + [label_col]]
	

	print(f"Before dropping incidental NAs: {df.shape}")
	df = df.dropna()
	print(f"After dropping incidental NAs: {df.shape}")
	df.to_csv(f"datasets/{dataset}.csv", index=False)

	return df

In [184]:
for i, name in enumerate(sets):
	print(f"Set {i + 1}: {name}")
	process_dataset(name)
	print()

Set 1: anemia
Processing anemia
Before dropping columns: (33924, 17)
Dropping 12: When child put to breast
After dropping columns: (33924, 16)
Before dropping NAs: (33924, 16)
After dropping NAs: (9546, 16)
Leaving 4: Births in last five years raw
Leaving 5: Age of respondent at 1st birth raw
Standardising 6: Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)
Standardising 13: Hemoglobin level adjusted for altitude (g/dl - 1 decimal)
Categorizing 0: Age in 5-year groups
Before: (9546, 16)
After: (9546, 22)
Categories: Index(['Age i_15-19', 'Age i_20-24', 'Age i_25-29', 'Age i_30-34',
       'Age i_35-39', 'Age i_40-44', 'Age i_45-49'],
      dtype='object') (7)
Categorizing 1: Type of place of residence
Before: (9546, 22)
After: (9546, 23)
Categories: Index(['Type _Rural', 'Type _Urban'], dtype='object') (2)
Categorizing 2: Highest educational level
Before: (9546, 23)
After: (9546, 26)
Categories: Index(['Highe_Higher', 'Highe_No education', 'Highe_Primary',
       '