In [16]:
import numpy as np
from matplotlib.image import imread


class ImageDatasetBuilder:
    """
    Given a DataFrame whose index is a set of image IDs (as with {train, test}.csv), returns featurized images.

    Each ID is resolved to the file ``{source}/{img_id}.png``, cropped to at most ``x_dim`` by ``y_dim``
    pixels, and flattened into a single feature row.
    """
    def __init__(self, x_dim=100, y_dim=100, source='../data/train/images/', mask=False):
        """
        Builds the featurized image transform.

        x_dim: int
            The X dimension to crop the images to. Applied to the first axis of the loaded array
            (``img[:x_dim, :y_dim]``); images smaller than this are not padded.
        y_dim: int
            The Y dimension to crop the images to. Applied to the second axis of the loaded array.
        source: str
            Path to the folder containing the image files.
        mask: boolean
            If true, the underlying data is a mask and is flattened as loaded. If false, the underlying
            data is RGB and we take just the R component and skip the rest, because the images are
            grayscale anyway.
        """
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.source = source
        self.mask = mask

    def fit(self, X, y=None):
        """
        No-op, present for scikit-learn-style chaining; there is nothing to learn.

        ``y`` is optional so that the usual transformer call ``fit(X)`` works as well as ``fit(X, y)``.
        Returns ``self``.
        """
        return self

    def _load_image(self, img_id):
        """Reads one image, crops it, and returns it as a flat 1-D array."""
        pixels = imread(f'{self.source}/{img_id}.png')[:self.x_dim, :self.y_dim]
        if not self.mask and pixels.ndim == 3:
            # Pick the first (R) channel explicitly instead of striding the flattened array by 3:
            # the result is identical for 3-channel input, and stays correct when the file carries
            # an alpha channel. Arrays without a channel axis are kept whole.
            pixels = pixels[:, :, 0]
        return np.ravel(pixels)

    def transform(self, X):
        """
        Loads and featurizes the image for every ID in ``X.index``.

        X: DataFrame
            Only ``X.index.values`` is read; each value is used as an image file stem.

        Returns an array with one flattened image per ID, in index order. An empty index yields an
        empty array.
        """
        return np.asarray([self._load_image(img_id) for img_id in X.index.values])
    
    
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier


def buildClassifier(dim=50, hidden_nodes=100):
    """
    Returns a zero-argument factory that builds and compiles the Keras model.

    dim: int
        Side length of the square input; the network takes ``dim**2`` input features and emits
        ``dim**2`` outputs.
    hidden_nodes: int
        Number of units in the single hidden ReLU layer.

    The returned callable creates a fresh ``Sequential`` model on every call, which is the form
    wrappers such as ``KerasClassifier`` expect for their build function.
    """
    n_pixels = dim ** 2

    def ret():
        clf = Sequential()
        clf.add(Dense(hidden_nodes, activation='relu', input_dim=n_pixels))
        clf.add(Dense(n_pixels, activation='sigmoid'))
        # Every output unit is an independent sigmoid probability, so the matching loss is binary
        # cross-entropy; categorical cross-entropy would treat the whole output vector as a single
        # class distribution.
        clf.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.1), metrics=['accuracy'])
        return clf
    return ret

**TODO:** the model currently outputs to a hugely-dimensioned target (`dim**2` output units, one per flattened pixel); this still needs to be addressed.

In [24]:
# import pandas as pd
# train = pd.read_csv("../data/train.csv", index_col="id", usecols=[0])
# _depths = pd.read_csv("../data/depths.csv", index_col="id")
# train = train.join(_depths)
# test = _depths[~_depths.index.isin(train.index)]
# del _depths

# X = ImageDatasetBuilder(source='../data/train/images/').transform(train)
# y = ImageDatasetBuilder(source='../data/train/masks/', mask=True).transform(train)

In [25]:
# from sklearn.pipeline import make_pipeline

# clf = make_pipeline(
#     KerasClassifier(buildClassifier(dim=100, hidden_nodes=100), epochs=2, batch_size=20)
# )

In [26]:
# clf.fit(X, y)

In [27]:
# from sklearn.model_selection import KFold
# from sklearn.metrics import accuracy_score

# kf = KFold(n_splits=4)

# scores = []
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
#     scores.append(
#         accuracy_score(clf.fit(X_train, y_train).predict(X_test), y_test)
#     )
    
# np.mean(scores)