In [1]:
import os
import glob
import numpy as np
import mahotas as mh
from mahotas.features import surf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.cluster import MiniBatchKMeans

In [45]:
all_instance_filenames = []
all_instance_targets = []

for f in glob.glob('cats-and-dogs-img/*.jpg'):
    target = 1 if 'cat' in os.path.split(f)[1] else 0
    all_instance_filenames.append(f)
    all_instance_targets.append(target)

surf_features = []
for f in all_instance_filenames:
    image = mh.imread(f, as_grey=True)
    surf_features.append(surf.surf(image)[:, 5:])

train_len = int(len(all_instance_filenames) * .60)
X_train_surf_features = np.concatenate(surf_features[:train_len])
X_test_surf_feautres = np.concatenate(surf_features[train_len:])
y_train = all_instance_targets[:train_len]
y_test = all_instance_targets[train_len:]

In [46]:
n_clusters = 300
estimator = MiniBatchKMeans(n_clusters=n_clusters)
estimator.fit_transform(X_train_surf_features)

array([[ 0.6056733 ,  2.70938102,  1.22470857, ...,  0.40240388,
         1.36376676,  0.91444056],
       [ 1.17256268,  2.15959095,  1.80512123, ...,  1.25544983,
         2.14938607,  0.92937648],
       [ 4.05884662,  1.87604644,  5.28951557, ...,  4.32944494,
         5.41296044,  3.89081466],
       ..., 
       [ 0.6193189 ,  2.92864247,  1.1535589 , ...,  0.36941273,
         1.18161751,  1.09170526],
       [ 1.68619226,  3.95702531,  0.93771461, ...,  1.37208184,
         0.80844426,  2.08232525],
       [ 1.09366926,  1.87174791,  1.99117652, ...,  1.12510896,
         2.15558684,  1.0511277 ]])

In [47]:
X_train = []
for instance in surf_features[:train_len]:
    clusters = estimator.predict(instance)
    features = np.bincount(clusters)
    if len(features) < n_clusters:
        features = np.append(features, np.zeros((1, n_clusters-len(features))))
    X_train.append(features)

X_test = []
for instance in surf_features[train_len:]:
    clusters = estimator.predict(instance)
    features = np.bincount(clusters)
    if len(features) < n_clusters:
        features = np.append(features, np.zeros((1, n_clusters-len(features))))
    X_test.append(features)

In [50]:
clf = LogisticRegression(C=0.001, penalty='l2')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.69      0.77      0.73       378
          1       0.77      0.69      0.72       420

avg / total       0.73      0.72      0.72       798

