In [2]:
import os 
import pickle
import numpy as np
from os.path import join  

# Directory layout: everything lives under ./datasets relative to the kernel's
# working directory.
datasets = join(os.getcwd(), "datasets")
raw_datasets = join(datasets, "raw")
preprocessed_datasets = join(datasets, "preprocessed")

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# Context managers make sure the file handles are closed right after loading.
with open(join(raw_datasets, "all_merged_df.pickle"), "rb") as fh:
    X_train, y_train, X_test = pickle.load(fh)

# The ndarray dump is written further down in this notebook, so it does not
# exist on a first run. Load it only when present (lets later cells be run
# without redoing the cleaning); otherwise the cleaning cells create it.
cleaned_nd_path = join(preprocessed_datasets, "cleaned_nd.pickle")
if os.path.exists(cleaned_nd_path):
    with open(cleaned_nd_path, "rb") as fh:
        X_train_nd, y_train_nd, X_test_nd = pickle.load(fh)

Remove every *v_* occurrence from the values in the last columns
---

In [2]:
# Drop every 'v_' substring (regex match) from both feature frames, in place.
for frame in (X_train, X_test):
    frame.replace(to_replace='v_', value='', regex=True, inplace=True)

Force all features and labels values to *float32*
---

In [3]:
# Cast features and labels to float32 in one pass.
X_train, y_train, X_test = (
    frame.astype(np.float32) for frame in (X_train, y_train, X_test)
)

# Sanity check: each frame should report a single dtype.
for frame in (X_train, y_train, X_test):
    print(np.unique(frame.dtypes.values))

[dtype('float32')]
[dtype('float32')]
[dtype('float32')]


Replace all *NaN* values with the median of their column
---

In [4]:
# Frames to impute, keyed by the label used in the report lines.
frames = {"X_train": X_train, "X_test": X_test}

print("> Before")
for label, frame in frames.items():
    print(f"{label} any isna ? ", frame.isna().values.any())

# Fill each frame's missing values with that frame's own per-column median (in place).
# NOTE(review): the test set is imputed with test medians, not train medians — confirm this is intended.
for frame in frames.values():
    frame.fillna(frame.median(), inplace=True)

print("> After")
for label, frame in frames.items():
    print(f"{label} any isna ? ", frame.isna().values.any())

> Before
X_train any isna ?  True
X_test any isna ?  True
> After
X_train any isna ?  False
X_test any isna ?  False


Dump everything as a first version of the cleaned data
---

In [5]:
# Create the output directory from Python rather than through a shell escape:
# this works on every platform and uses the same path the dumps are written to.
os.makedirs(preprocessed_datasets, exist_ok=True)

# Persist the cleaned DataFrames; the context manager flushes and closes the file.
with open(join(preprocessed_datasets, "cleaned_df.pickle"), "wb") as fh:
    pickle.dump((X_train, y_train, X_test), fh)

Dump as *ndarray*s
---

In [6]:
# Convert the cleaned DataFrames to plain NumPy arrays.
X_train_nd, y_train_nd, X_test_nd = X_train.values, y_train.values, X_test.values

# Persist the arrays; the context manager flushes and closes the file.
with open(join(preprocessed_datasets, "cleaned_nd.pickle"), "wb") as fh:
    pickle.dump((X_train_nd, y_train_nd, X_test_nd), fh)

Transform the viewpoint index of the 6 images as categorical data (one-hot encoding)
---

In [3]:
from sklearn.preprocessing import OneHotEncoder

# The last `n_viewpoints` columns hold the viewpoint index of each image.
n_viewpoints = 6
train_viewpoints = X_train_nd[:, -n_viewpoints:]
test_viewpoints = X_test_nd[:, -n_viewpoints:]

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4; try the new name first and fall back for older releases so
# the encoder returns dense arrays on every version.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Fit on train + test viewpoints stacked as a single column so that all six
# viewpoint columns share one category set.
ohe.fit(np.r_[train_viewpoints, test_viewpoints].reshape(-1, 1))

m_train, m_test, n = len(train_viewpoints), len(test_viewpoints), len(ohe.categories_[0])

# Encode all viewpoint columns in one call: flattening row-major keeps each
# sample's six viewpoints adjacent, so reshaping back to (m, n * n_viewpoints)
# places viewpoint `vp` in columns [vp*n, (vp+1)*n). The cast keeps the
# viewpoint dtype instead of the encoder's default output dtype.
train_viewpoints_ohe = (
    ohe.transform(train_viewpoints.reshape(-1, 1))
    .reshape(m_train, n * n_viewpoints)
    .astype(train_viewpoints.dtype)
)
test_viewpoints_ohe = (
    ohe.transform(test_viewpoints.reshape(-1, 1))
    .reshape(m_test, n * n_viewpoints)
    .astype(test_viewpoints.dtype)
)

# Replace the raw viewpoint columns with their one-hot encoding.
X_train_ohe = np.c_[X_train_nd[:, :-n_viewpoints], train_viewpoints_ohe]
X_test_ohe = np.c_[X_test_nd[:, :-n_viewpoints], test_viewpoints_ohe]

Dump the *onehot encoded* array
---

In [11]:
# Persist the one-hot encoded feature matrices with the labels;
# the context manager flushes and closes the file.
with open(join(preprocessed_datasets, "cleaned_ohe.pickle"), "wb") as fh:
    pickle.dump((X_train_ohe, y_train_nd, X_test_ohe), fh)

Split into separate *conv output* and *viewpoints* arrays
---

In [16]:
# Split each feature matrix into (everything but the last n_viewpoints columns,
# the last n_viewpoints columns).
X_train_sep = (X_train_nd[:, :-n_viewpoints], X_train_nd[:, -n_viewpoints:])
X_test_sep = (X_test_nd[:, :-n_viewpoints], X_test_nd[:, -n_viewpoints:])

# Persist the split arrays; the context manager flushes and closes the file.
with open(join(preprocessed_datasets, "cleaned_separated.pickle"), "wb") as fh:
    pickle.dump((X_train_sep, y_train_nd, X_test_sep), fh)

Apply t-SNE to the *conv output* and to the *onehot encoded viewpoints*
---

In [7]:
from sklearn.manifold import TSNE

# method='exact' is used because these embeddings have more than 3 components.
# NOTE(review): 5000 output dimensions is unusual for t-SNE — confirm intended.
# A fixed random_state makes the embeddings reproducible across runs.
tsne_conv = TSNE(n_components=5000, method='exact', random_state=0)
tsne_vp = TSNE(n_components=5, method='exact', random_state=0)

# Conv features are every column except the raw viewpoint columns. Slicing the
# one-hot matrix with `:-n_viewpoints` would only strip 6 of its
# n * n_viewpoints one-hot columns, so slice the un-encoded arrays instead.
conv_all = np.r_[X_train_nd[:, :-n_viewpoints], X_test_nd[:, :-n_viewpoints]]
vp_all = np.r_[train_viewpoints_ohe, test_viewpoints_ohe]
n_train_rows = len(X_train_nd)

# TSNE has no `transform`; two independent fit_transform calls would place
# train and test in unrelated spaces. Embed them jointly, then split.
conv_embedded = tsne_conv.fit_transform(conv_all)
vp_embedded = tsne_vp.fit_transform(vp_all)

X_train_tsne_conv, X_test_tsne_conv = conv_embedded[:n_train_rows], conv_embedded[n_train_rows:]
X_train_tsne_vp, X_test_tsne_vp = vp_embedded[:n_train_rows], vp_embedded[n_train_rows:]

In [8]:
# Concatenate the conv and viewpoint embeddings column-wise.
X_train_tsne = np.c_[X_train_tsne_conv, X_train_tsne_vp]
X_test_tsne = np.c_[X_test_tsne_conv, X_test_tsne_vp]

# Persist the embeddings with the labels; the context manager flushes and closes the file.
with open(join(preprocessed_datasets, "cleaned_tsne.pickle"), "wb") as fh:
    pickle.dump((X_train_tsne, y_train_nd, X_test_tsne), fh)

Apply mutual information selection
---

In [None]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest

# scikit-learn expects a 1-D target. `y_train_nd` comes from `DataFrame.values`
# and is therefore a column vector, which appears to be what triggers the
# column-vector warning seen in this cell's output. Flatten a single-column
# target; leave anything else untouched.
if getattr(y_train_nd, "ndim", 1) == 2 and y_train_nd.shape[1] == 1:
    y_train_flat = y_train_nd.ravel()
else:
    y_train_flat = y_train_nd

# NOTE(review): k='all' keeps every feature, so this only computes the scores
# (available in `fs.scores_`) — pick a numeric k to actually select features.
fs = SelectKBest(score_func=mutual_info_classif, k='all')
X_train_mi = fs.fit_transform(X_train_ohe, y_train_flat)
X_test_mi = fs.transform(X_test_ohe)

  return f(*args, **kwargs)


In [None]:
# Display the (rows, columns) of the selected training matrix.
np.shape(X_train_mi)