In [115]:
import pandas as pd
import numpy as np

pd.options.display.width = 200

# Load the raw almond table and give its columns short names.
# NOTE: the assignment below requires the CSV to have exactly 14 columns.
almond: pd.DataFrame = pd.read_csv('Almond.csv')
almond.columns = ['idx', 'len', 'wid', 'thk', 'area', 'perim', 'round', 'solid', 'compact', 'aspect', 'eccent', 'extent', 'convex', 'type']

#=============== PREPROCESSING ===============#

# Remove useless/redundant cols
almond = almond.drop(columns=['idx', 'round', 'aspect', 'eccent'])

# Isolate len, wid, thk cols + convert to numeric (unparseable cells become NaN)
dims: pd.DataFrame = almond[['len', 'wid', 'thk']].apply(pd.to_numeric, errors='coerce')

# Missing-value indicators: 1.0 where the dimension is missing, 0.0 otherwise.
# Taken from the NaN mask *before* the sentinel is filled in, so a genuine
# measurement can never be mistaken for a missing one.
dimsOneHot: pd.DataFrame = dims.isna().astype(float)
dimsOneHot.columns = ['len1H', 'wid1H', 'thk1H']

# Replace missing values with -1
dims = dims.fillna(-1)

print("-"*50 + " dims " + "-"*50)
print(dims.head(n=20))

print("-"*50 + " dimsOneHot " + "-"*50)
print(dimsOneHot.head(n=20))

# The sort-and-drop step below is only valid when every row has exactly one
# missing dimension: with none, the smallest real measurement would be thrown
# away; with two or more, a -1 sentinel would leak into the 'len' column.
missingPerRow = dimsOneHot.sum(axis=1)
assert (missingPerRow == 1).all(), (
    "expected exactly one missing dimension per row; "
    f"{int((missingPerRow != 1).sum())} row(s) violate this"
)

# Sort within row (ascending) so the -1 sentinel lands in the first position,
# then keep the remaining two values. After this step 'len' holds the smaller
# and 'wid' the larger of the two known dimensions, whichever columns they
# originally came from.
dimsSorted = np.sort(dims.to_numpy(), axis=1)
dims = pd.DataFrame(dimsSorted[:, 1:], index=dims.index, columns=['len', 'wid'])

# Replace len, wid cols in almond with the sorted values and drop thk
almond[['len', 'wid']] = dims
almond = almond.drop(columns='thk')

# Insert the missing-value indicators right after 'len' and 'wid'
# (resulting order: len, wid, len1H, wid1H, thk1H, ...)
for offset, indicatorCol in enumerate(dimsOneHot.columns):
    almond.insert(2 + offset, indicatorCol, dimsOneHot[indicatorCol])

print("-"*50 + " almond - fully preprocessed " + "-"*50)
print(almond.head(n=20))

-------------------------------------------------- dims --------------------------------------------------
           len         wid         thk
0    -1.000000  227.940628  127.759132
1    -1.000000  234.188126  128.199509
2    -1.000000  229.418610  125.796547
3    -1.000000  232.763153  125.918808
4    -1.000000  230.150742  107.253448
5    -1.000000  231.914429  107.759789
6    -1.000000  226.371048  106.479408
7    -1.000000  226.186142  102.623077
8   413.477173   -1.000000  138.190536
9   418.210327   -1.000000  129.659897
10  423.226959   -1.000000  123.572952
11  416.757172   -1.000000  131.968613
12  372.898560   -1.000000  100.815842
13  364.393768   -1.000000   98.433388
14  360.413147   -1.000000   99.454498
15  362.050964   -1.000000   95.933342
16  315.512085  169.067093   -1.000000
17  318.522736  168.491837   -1.000000
18  320.810455  168.278076   -1.000000
19  317.151794  167.854141   -1.000000
-------------------------------------------------- dimsOneHot ------------