# Machine Learning: Cats vs. Dogs

In [1]:
import os

import numpy as np
from scipy.misc import imread, imshow, imresize

## Prepare the datasets

In [2]:
def load_dataset(max_files=float("inf")):
    X = []
    y = []

    # Load images
    counter = [0, 0]
    for _file in os.listdir("data/train"):
        category = 1 if "cat" in _file else 0
        # Leading all 20k images takes a long time. For testing purposes, we want to reduce the number of examples.
        if counter[category] < max_files:
            img = imread(os.path.join("data/train", _file), flatten=True)
            img = imresize(img, size=(128,128)).flatten()
            X.append(img)
            y.append(category)
            counter[category] += 1
    return np.stack(X), np.array(y)


def split_dataset(X, y):
    X_num = X.shape[0]
    indicies = np.random.permutation(X_num)
    split_idx = int(X_num * 0.8)
    X_data, y_data = X[indicies[:split_idx]], y[indicies[:split_idx]]
    X_test, y_test = X[indicies[split_idx:]], y[indicies[split_idx:]]

    return X_data, y_data, X_test, y_test

In [3]:
# Create the dataset. This may take a while...
X, y = load_dataset(max_files=300)

# Split the dataset into training and test
X_train, y_train, X_test, y_test = split_dataset(X, y)

## Training

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [5]:
classifiers = [KNeighborsClassifier, SVC, DecisionTreeClassifier]

for clf_class in classifiers:
    clf = clf_class()
    clf.fit(X_train, y_train)
    result = clf.score(X_test, y_test)
    print "Classifier: {cls:25}Score: {score}".format(cls=clf.__class__.__name__, score=result)

Classifier: KNeighborsClassifier     Score: 0.575
Classifier: SVC                      Score: 0.483333333333
Classifier: DecisionTreeClassifier   Score: 0.45
