## Using pre-trained NN

In [None]:
# !conda install -y nomkl > tmp.log

In [1]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import _pickle as pickle
import os
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
from scipy.misc import imread, imsave, imresize
from lasagne.utils import floatX

# Model Zoo
* https://github.com/Lasagne/Recipes/tree/master/modelzoo
* More models within the community
* Pick model, copy init, download weights
* Here we proceed with vgg16

In [None]:
# !wget https://s3.amazonaws.com/lasagne/recipes/pretrained/imagenet/vgg16.pkl

In [None]:
# copyright: see http://www.robots.ox.ac.uk/~vgg/research/very_deep/


from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import NonlinearityLayer
from lasagne.layers import DropoutLayer
from lasagne.layers import Pool2DLayer as PoolLayer
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.nonlinearities import softmax


def build_model():
    """Build the VGG-16 layer graph and return it as a dict of name -> layer.

    The network takes input of shape (None, 3, 224, 224). It consists of five
    convolutional stages (3x3 filters, pad=1, flip_filters=False), each
    followed by a 2x2 pooling layer, then two 4096-unit dense layers with
    dropout (p=0.5), a 1000-unit linear layer 'fc8' and a softmax 'prob'.
    Layers are inserted in the dict in bottom-to-top order.
    """
    # (pool layer name, number of filters, conv layer names) for each stage.
    conv_stages = (
        ('pool1', 64, ('conv1_1', 'conv1_2')),
        ('pool2', 128, ('conv2_1', 'conv2_2')),
        ('pool3', 256, ('conv3_1', 'conv3_2', 'conv3_3')),
        ('pool4', 512, ('conv4_1', 'conv4_2', 'conv4_3')),
        ('pool5', 512, ('conv5_1', 'conv5_2', 'conv5_3')),
    )
    # (dense layer name, dropout layer name) for the two hidden dense layers.
    dense_stages = (
        ('fc6', 'fc6_dropout'),
        ('fc7', 'fc7_dropout'),
    )

    net = {}
    top = net['input'] = InputLayer((None, 3, 224, 224))

    for pool_name, n_filters, conv_names in conv_stages:
        for conv_name in conv_names:
            top = net[conv_name] = ConvLayer(
                top, n_filters, 3, pad=1, flip_filters=False)
        top = net[pool_name] = PoolLayer(top, 2)

    for dense_name, dropout_name in dense_stages:
        top = net[dense_name] = DenseLayer(top, num_units=4096)
        top = net[dropout_name] = DropoutLayer(top, p=0.5)

    # fc8 is kept linear; the softmax is a separate layer so both the
    # pre-softmax scores and the probabilities are addressable by name.
    net['fc8'] = DenseLayer(top, num_units=1000, nonlinearity=None)
    net['prob'] = NonlinearityLayer(net['fc8'], softmax)

    return net

In [None]:
# Class names are stored here, indexed by class id.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
# The `with` block closes the file handle once loading is done.
with open('classes.pkl', 'rb') as classes_file:
    classes = pickle.load(classes_file)
# For example, the 10th class (index 9) is ostrich:
print(classes[9])

You have to implement two functions in the cell below.

The preprocess function should take an image with shape (w, h, 3) and transform it into a tensor with shape (1, 3, 224, 224). Without this transformation, vgg16 won't be able to digest the input image. 
Additionally, your preprocessing function has to rearrange the channels RGB -> BGR and subtract the mean value from every channel.

In [None]:
# Per-channel means, subtracted from the channels *after* the RGB -> BGR flip.
MEAN_VALUES = np.array([104, 117, 123])
IMAGE_W = 224

def preprocess(img):
    """Turn an (h, w, 3) RGB image into a (1, 3, h, w) mean-centred BGR tensor.

    The image is not resized here; callers must pass a 224x224 image when the
    result is fed to the network.

    Returns a float32 array. Float32 is used (rather than the int64 that
    ``uint8 - int64`` would give) because a float32 Theano input rejects
    int64 data; a float64 input accepts float32 by upcasting.
    """
    # Reverse the channel axis: RGB -> BGR.
    bgr = img[:, :, ::-1]

    centered = bgr - MEAN_VALUES

    # Convert from [h, w, 3] to [1, 3, h, w].
    chw = np.transpose(centered, (2, 0, 1))[None]
    return chw.astype(np.float32)

def deprocess(img):
    """Invert `preprocess`: (1, 3, h, w) centred BGR -> (h, w, 3) uint8 RGB.

    The input array is left untouched (the original implementation added the
    means in place through a view, silently modifying the caller's tensor).
    Values are clipped to [0, 255] before the uint8 cast so out-of-range
    pixels saturate instead of wrapping around.
    """
    hwc = img.reshape(img.shape[1:]).transpose((1, 2, 0))
    # Broadcasting over the last (channel) axis creates a new array.
    restored = np.clip(hwc + MEAN_VALUES, 0, 255)
    return restored[:, :, ::-1].astype(np.uint8)

img = (np.random.rand(IMAGE_W, IMAGE_W, 3) * 256).astype(np.uint8)

# Cast to int before subtracting so a mismatch cannot wrap around in uint8.
print(np.linalg.norm(deprocess(preprocess(img)).astype(int) - img))

If your implementation is correct, the number above will be small, because deprocess function is the inverse of preprocess function

### Deploy the network

In [None]:
# Build the layer dictionary; layers are looked up by name (e.g. net['prob']).
net = build_model()

In [None]:
import pickle

# encoding='latin1' lets Python 3 decode the byte strings stored in the file
# (presumably it was pickled under Python 2 — TODO confirm). The public
# pickle.load() accepts the encoding directly, so the private pure-Python
# pickle._Unpickler is not needed.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open('vgg16.pkl', 'rb') as f:
    weights = pickle.load(f, encoding='latin1')

# Copy the stored parameter values into every layer below (and including) 'prob'.
lasagne.layers.set_all_param_values(net['prob'], weights['param values'])

In [None]:
input_image = T.tensor4('input')
# deterministic=True switches off the two DropoutLayers (fc6_dropout,
# fc7_dropout) on the path to 'prob'; without it every prediction would be
# computed with a random dropout mask applied.
output = lasagne.layers.get_output(net['prob'], input_image, deterministic=True)
# Compiled function: image tensor -> class probabilities.
prob = theano.function([input_image], output)

### Sanity check
Let's check that the loaded network works. To do so, we feed it a picture of an albatross and check that it recognizes it correctly.

In [None]:
# Load the sample image and keep only the first 224 entries along axis 1.
# NOTE(review): axis 0 is not cropped — presumably the file already has 224
# rows; confirm, since the network input is declared as 224x224.
img = imread('sample_images/albatross.jpg')
img = img[:, :224, :]
plt.imshow(img)
plt.show()

In [None]:
# Round-trip the image through preprocess/deprocess and display it; it should
# look the same as the picture shown above.
w = deprocess(preprocess(img))
plt.imshow(w)
plt.show()

In [None]:
p = prob(preprocess(img))

# Flatten once; argsort is ascending, so walk it backwards for the five
# highest-probability class indices.
scores = p.ravel()
labels = scores.argsort()[-1:-6:-1]
print('top-5 classes are:')
for label in labels:
    # '%.3f' prints three decimals ('%3f' only set a minimum width of 3,
    # which never had any effect). Only the first comma-separated name of
    # each class is shown.
    print('%.3f\t%s' % (scores[label], classes[label].split(',')[0]))

# Grand-quest: Dogs Vs Cats
* original competition
* https://www.kaggle.com/c/dogs-vs-cats
* 25k JPEG images of various size, 2 classes (guess what)

### Your main objective
* In this seminar your goal is to fine-tune a pre-trained model to distinguish between the two rivaling animals
* The first step is to just reuse some network layer as features

In [None]:
# !wget https://www.dropbox.com/s/d61lupw909hc785/dogs_vs_cats.train.zip?dl=1 -O data.zip
# !unzip data.zip

# for starters
* Train an sklearn model and evaluate its validation accuracy (it should be >80%)

In [None]:
# output_features = lasagne.layers.get_output(net['fc6'], input_image)
# extract_features = theano.function([input_image], output_features) 

# #extract features from images
# from tqdm import tqdm
# from scipy.misc import imresize
# import os
# X = []
# Y = []

# #this may be a tedious process. If so, store the results in some pickle and re-use them.
# for fname in tqdm(os.listdir('train/')):
#     y = fname.startswith("cat")
#     img = imread("train/"+fname)
#     img = preprocess(imresize(img,(IMAGE_W,IMAGE_W)))
#     features = extract_features(img)
#     Y.append(y)
#     X.append(features)

In [None]:
def save_result(res, file):
    """Pickle `res` into the file at path `file`, using the highest protocol.

    Args:
        res: any picklable object.
        file: destination path; an existing file is overwritten.
    """
    # Imported locally on purpose: the notebook's first cell binds the name
    # `pickle` to the C accelerator module `_pickle`, which provides dump/load
    # but not the HIGHEST_PROTOCOL constant.
    import pickle
    with open(file, 'wb') as f:
        pickle.dump(obj=res, file=f, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
def load_data(file):
    """Return the object unpickled from the file at path `file`."""
    # NOTE: unpickling can execute arbitrary code; use on trusted files only.
    with open(file, 'rb') as stream:
        payload = pickle.load(stream)
    return payload

In [None]:
# X = np.concatenate(X) #stack all [1xfeature] matrices into one. 
# assert X.ndim==2
# #WARNING! the concatenate works for [1xN] matrices. If you have other format, stack them yourself.

# #crop if we ended prematurely
# Y = Y[:len(X)]

In [None]:
# save_result(X, 'X.pkl')
# save_result(Y, 'Y.pkl')

In [3]:
# Load the cached features and labels instead of re-running the extraction.
# NOTE(review): the commented-out save cell writes 'Y.pkl' while this reads
# 'y.pkl'; on a case-sensitive filesystem those are different files — confirm
# which one is intended.
X = load_data('X.pkl')
Y = load_data('y.pkl')
X.shape, Y.shape

((25000, 4096), (25000,))

In [4]:
# Shuffle features and labels with one shared permutation so rows stay aligned.
# A dedicated, seeded generator makes the train/validation/test split the same
# after every kernel restart, so scores of different models are measured on
# the same rows. (Run this cell once per kernel: re-running it permutes the
# already permuted arrays again.)
split_rng = np.random.RandomState(42)
idx = split_rng.permutation(len(X))
X = X[idx]
Y = Y[idx]

In [5]:
# Row ranges of the 15k / 5k / 5k train / validation / test partition.
train_rows = slice(None, 15000)
val_rows = slice(15000, 20000)
test_rows = slice(20000, None)

X_train, Y_train = X[train_rows], Y[train_rows]
X_val, Y_val = X[val_rows], Y[val_rows]
X_test, Y_test = X[test_rows], Y[test_rows]
X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape, Y_test.shape

((15000, 4096), (15000,), (5000, 4096), (5000,), (5000, 4096), (5000,))

__load our dakka__
![img](https://s-media-cache-ak0.pinimg.com/564x/80/a1/81/80a1817a928744a934a7d32e7c03b242.jpg)

In [6]:
import sklearn
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [7]:
# Random forest on the extracted features.
# random_state fixes the forest's internal randomness so the validation
# accuracy below is reproducible from run to run.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=0)
clf_rf.fit(X_train, Y_train)
pred_rf = clf_rf.predict(X_val)
acc_rf = accuracy_score(Y_val, pred_rf)
acc_rf

0.97999999999999998

In [8]:
# Extremely randomized trees on the extracted features.
# random_state fixes the ensemble's internal randomness so the validation
# accuracy below is reproducible from run to run.
clf_et = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf_et.fit(X_train, Y_train)
pred_et = clf_et.predict(X_val)
acc_et = accuracy_score(Y_val, pred_et)
acc_et

0.98040000000000005

In [None]:
# Gradient boosting on the extracted features.
# random_state fixes the estimator's internal randomness so the validation
# accuracy below is reproducible from run to run.
clf_gb = GradientBoostingClassifier(random_state=0)
clf_gb.fit(X_train, Y_train)
pred_gb = clf_gb.predict(X_val)
acc_gb = accuracy_score(Y_val, pred_gb)
acc_gb

0.97919999999999996

In [9]:
# AdaBoost on the extracted features.
# random_state fixes the estimator's internal randomness so the validation
# accuracy below is reproducible from run to run.
clf_ab = AdaBoostClassifier(n_estimators=100, random_state=0)
clf_ab.fit(X_train, Y_train)
pred_ab = clf_ab.predict(X_val)
acc_ab = accuracy_score(Y_val, pred_ab)
acc_ab

0.97940000000000005

In [7]:
# Logistic regression with default settings, scored on the validation split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf_lr = LogisticRegression().fit(X_train, Y_train)
pred_lr = clf_lr.predict(X_val)
acc_lr = accuracy_score(y_true=Y_val, y_pred=pred_lr)
acc_lr

0.98560000000000003

In [8]:
# Ridge classifier with default settings, scored on the validation split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf_rc = RidgeClassifier().fit(X_train, Y_train)
pred_rc = clf_rc.predict(X_val)
acc_rc = accuracy_score(y_true=Y_val, y_pred=pred_rc)
acc_rc

0.98119999999999996

In [None]:
# Support vector classifier with default settings, scored on the validation split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf_svm = SVC().fit(X_train, Y_train)
pred_svm = clf_svm.predict(X_val)
acc_svm = accuracy_score(y_true=Y_val, y_pred=pred_svm)
acc_svm

After trying different techniques, I'll use logistic regression, because it has the highest validation accuracy. Now let's see what accuracy will be on test data 

In [10]:
# Score the chosen model once on the held-out test split.
# Note: this rebinds pred_lr, which previously held the validation predictions.
pred_lr = clf_lr.predict(X_test)
accuracy_score(Y_test, pred_lr)

0.98280000000000001

Now let's try neural network

In [6]:
# Symbolic inputs of the classifier: a 2-D matrix of features (one row per
# sample) and a 1-D vector of target labels.
input_X = T.matrix('input features')
target_Y = T.vector('output')

In [89]:
# Classifier head on top of the 4096-dimensional feature vectors:
# 4096 -> 128 -> 128 -> 64 -> 1, with dropout after the first two dense layers.
layer = lasagne.layers.InputLayer([None, 4096], input_var=input_X)
layer = lasagne.layers.DenseLayer(layer, 128, nonlinearity=lasagne.nonlinearities.elu)
layer = lasagne.layers.DropoutLayer(layer)
# NOTE(review): the identity nonlinearity makes this dense layer purely
# linear — confirm that this is intended.
layer = lasagne.layers.DenseLayer(layer, 128, nonlinearity=lasagne.nonlinearities.identity)
layer = lasagne.layers.DropoutLayer(layer)
layer = lasagne.layers.DenseLayer(layer, 64, nonlinearity=lasagne.nonlinearities.tanh)
# Single sigmoid unit: the output is one value in (0, 1) per sample.
layer = lasagne.layers.DenseLayer(layer, 1, nonlinearity=lasagne.nonlinearities.sigmoid)
# Parameters of every layer in the head; these are what the optimizer updates.
all_weights = lasagne.layers.get_all_params(layer)
print(all_weights)

[W, b, W, b, W, b, W, b]


In [90]:
def iterate_minibatches(inputs, targets, batchsize):
    """Yield randomly ordered (inputs, targets) batches of exactly `batchsize` rows.

    The rows are visited in a fresh random order on every call (drawn from
    NumPy's global random state). Only full batches are produced: if
    len(inputs) is not a multiple of `batchsize`, the leftover rows of that
    pass are skipped, and nothing is yielded when len(inputs) < batchsize.

    `inputs` and `targets` must have equal length and support indexing with
    an integer index array.
    """
    assert len(inputs) == len(targets)
    total = len(inputs)
    order = np.arange(total)
    np.random.shuffle(order)
    # Walk over the end position of each full batch.
    for stop in range(batchsize, total + 1, batchsize):
        picked = order[stop - batchsize:stop]
        yield inputs[picked], targets[picked]

In [91]:
# Two symbolic outputs of the same network: deterministic=False keeps the
# dropout layers active (used for the training loss), deterministic=True
# disables them (used for accuracy and prediction).
y_predicted = lasagne.layers.get_output(layer, deterministic = False)
y_testing = lasagne.layers.get_output(layer, deterministic = True)

In [92]:
# Training loss: mean squared error between the dropout-enabled output and the targets.
# NOTE(review): target_Y is a vector while the network output has one column
# per sample — presumably Lasagne aligns the two shapes inside the objective; confirm.
loss = lasagne.objectives.squared_error(y_predicted, target_Y).mean()

# Accuracy is computed from the deterministic (dropout-free) output.
accuracy = lasagne.objectives.binary_accuracy(y_testing, target_Y).mean()
# Adadelta update rules for every parameter collected in all_weights.
updates = lasagne.updates.adadelta(loss, all_weights, learning_rate=1.0)

In [93]:
# train_fun performs one optimization step on a batch and returns [loss, accuracy];
# accuracy_fun only evaluates accuracy and does not change any parameter.
train_fun = theano.function([input_X, target_Y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_Y], accuracy)

In [94]:
import time

num_epochs = 50 #amount of passes through the data
batch_size = 1000 #number of samples processed at each function call

# Per-epoch history. NOTE(review): the names are misleading —
# accuracy_vals receives the mean training *loss* and test_vals receives the
# mean *training* accuracy (see the appends at the end of the loop body).
accuracy_vals = []
test_vals = []
validation_vals = []

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, Y_train, batch_size):
        inputs, targets = batch
        # One optimization step; returns this batch's loss and accuracy.
        train_err_batch, train_acc_batch= train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in iterate_minibatches(X_val, Y_val, batch_size):
        inputs, targets = batch
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    
    # Then we print the results for this epoch:
    # (the reported time covers both the training and the validation pass)
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))

    # All reported numbers are means over the batches of this epoch.
    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{:.2f} %".format(
        train_acc / train_batches * 100))
    print("  validation accuracy:\t\t{:.2f} %".format(
        val_acc / val_batches * 100))
    
    accuracy_vals.append(train_err / train_batches)
    test_vals.append(train_acc / train_batches * 100)
    validation_vals.append(val_acc / val_batches * 100)

Epoch 1 of 50 took 3.857s
  training loss (in-iteration):		0.055508
  train accuracy:		93.87 %
  validation accuracy:		98.48 %
Epoch 2 of 50 took 3.520s
  training loss (in-iteration):		0.016462
  train accuracy:		98.51 %
  validation accuracy:		98.58 %
Epoch 3 of 50 took 3.338s
  training loss (in-iteration):		0.013160
  train accuracy:		98.63 %
  validation accuracy:		98.68 %
Epoch 4 of 50 took 3.507s
  training loss (in-iteration):		0.012828
  train accuracy:		98.90 %
  validation accuracy:		98.74 %
Epoch 5 of 50 took 3.675s
  training loss (in-iteration):		0.011579
  train accuracy:		98.97 %
  validation accuracy:		98.60 %
Epoch 6 of 50 took 3.842s
  training loss (in-iteration):		0.009981
  train accuracy:		99.10 %
  validation accuracy:		98.80 %
Epoch 7 of 50 took 3.253s
  training loss (in-iteration):		0.009455
  train accuracy:		99.25 %
  validation accuracy:		98.70 %
Epoch 8 of 50 took 3.473s
  training loss (in-iteration):		0.008246
  train accuracy:		99.29 %
  validation acc

In [95]:
# Accuracy on the held-out test split, evaluated in a single call.
accuracy_fun(X_test, Y_test)

array(0.9864)

So we've got 98.64% accuracy. Let's try Cats vs dogs competition's data

In [96]:
# Load the pickled competition test set.
# NOTE(review): presumably these are features extracted the same way as X — confirm.
test_data = load_data('test.pkl')

In [100]:
# Compiled prediction function for the deterministic (dropout-free) output.
# It is bound to the y_testing expression that exists when this cell runs.
predict = theano.function([input_X], y_testing)

In [109]:
# Network outputs for the test set, flattened into a 1-D array.
pred = predict(test_data).flatten()

In [153]:
import csv
def save(pred, fname):
    """Write predictions to `fname` as a CSV file with the columns id,label.

    Args:
        pred: sequence of network outputs, one per test sample, in order.
        fname: destination path; an existing file is overwritten.

    Row ids start at 1 and the written label is ``1 - pred[i]``.
    """
    # The csv module expects files opened with newline=''; otherwise the
    # writer's '\r\n' line endings can be translated a second time and
    # produce blank lines between rows on some platforms.
    with open(fname, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('id', 'label'))
        for row_id, value in enumerate(pred, start=1):
            writer.writerow((row_id, 1 - value))

In [154]:
# Write the submission file from the current predictions.
save(pred, 'predictions.csv')

#### After submitting to kaggle we've got 0.09768 score

Now let's try to train our network on all of our data and then try again. Maybe with more training examples our network will predict more accurately.

In [168]:
num_epochs = 50 #amount of passes through the data
batch_size = 1000 #number of samples processed at each function call

def train(X, Y, epochs=None, batchsize=None):
    """Train the current network on (X, Y) and print per-epoch statistics.

    Uses whatever `train_fun` is bound to at call time, so recompiling
    `train_fun` for a new network makes this function train that network.

    Args:
        X: feature matrix, one row per sample.
        Y: targets, same length as X.
        epochs: number of passes over the data; defaults to the module-level
            `num_epochs`.
        batchsize: samples per optimization step; defaults to the
            module-level `batch_size`.

    Raises:
        ValueError: if X holds fewer rows than one batch, in which case no
            step could be taken and the epoch means would be undefined.
    """
    # Local import so the function does not depend on another cell having
    # imported `time` first.
    import time

    n_epochs = num_epochs if epochs is None else epochs
    size = batch_size if batchsize is None else batchsize
    if len(X) < size:
        raise ValueError(
            "need at least one full batch: got {} samples for batch size {}".format(
                len(X), size))

    for epoch in range(n_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in iterate_minibatches(X, Y, size):
            train_err_batch, train_acc_batch = train_fun(inputs, targets)
            train_err += train_err_batch
            train_acc += train_acc_batch
            train_batches += 1

        # Then we print the results for this epoch (means over its batches):
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, n_epochs, time.time() - start_time))

        print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
        print("  train accuracy:\t\t{:.2f} %".format(
            train_acc / train_batches * 100))

In [None]:
# Train on the full labelled set (X contains the train, validation and test splits).
# NOTE(review): the parameters are not re-initialised here, so this continues
# from the already trained network, and X_test/Y_test are no longer held out afterwards.
train(X, Y)

In [113]:
# Network outputs for the test set after the additional training, flattened to 1-D.
pred = predict(test_data).flatten()

In [118]:
# Overwrite the submission file with the new predictions.
save(pred, 'predictions.csv')

#### We've got a score of 0.10422. So maybe we've overfitted the data

In [172]:
# Second, more heavily regularised head: dropout with p=0.7 on the input
# features and after the first dense layer, ELU activations throughout.
layer = lasagne.layers.InputLayer([None, 4096], input_var=input_X)
layer = lasagne.layers.DropoutLayer(layer, p=0.7)
layer = lasagne.layers.DenseLayer(layer, 128, nonlinearity=lasagne.nonlinearities.elu)
layer = lasagne.layers.DropoutLayer(layer, p=0.7)
layer = lasagne.layers.DenseLayer(layer, 128, nonlinearity=lasagne.nonlinearities.elu)
# layer = lasagne.layers.DropoutLayer(layer)
layer = lasagne.layers.DenseLayer(layer, 64, nonlinearity=lasagne.nonlinearities.elu)
# layer = lasagne.layers.DropoutLayer(layer)
layer = lasagne.layers.DenseLayer(layer, 1, nonlinearity=lasagne.nonlinearities.sigmoid)
all_weights = lasagne.layers.get_all_params(layer)
print(all_weights)

# Dropout-enabled output for the loss, dropout-free output for evaluation.
y_predicted = lasagne.layers.get_output(layer, deterministic = False)
y_testing = lasagne.layers.get_output(layer, deterministic = True)

loss = lasagne.objectives.squared_error(y_predicted, target_Y).mean()

accuracy = lasagne.objectives.binary_accuracy(y_testing, target_Y).mean()
updates = lasagne.updates.adadelta(loss, all_weights, learning_rate=0.5)

# Recompile the training and accuracy functions for the new network.
# NOTE(review): `predict` is not recompiled in this cell; a `predict` function
# compiled earlier still refers to the previous network's y_testing.
train_fun = theano.function([input_X, target_Y], [loss, accuracy], updates=updates)
accuracy_fun = theano.function([input_X, target_Y], accuracy)

[W, b, W, b, W, b, W, b]


In [173]:
# Train on all labelled data with the features divided by 100.
# NOTE(review): any input given to this network at prediction time needs the
# same /100 scaling to match what it was trained on.
train(X / 100, Y)

Epoch 1 of 50 took 14.895s
  training loss (in-iteration):		0.110726
  train accuracy:		93.30 %
Epoch 2 of 50 took 15.093s
  training loss (in-iteration):		0.033144
  train accuracy:		97.74 %
Epoch 3 of 50 took 15.062s
  training loss (in-iteration):		0.025214
  train accuracy:		98.12 %
Epoch 4 of 50 took 15.066s
  training loss (in-iteration):		0.021070
  train accuracy:		98.32 %
Epoch 5 of 50 took 15.340s
  training loss (in-iteration):		0.019751
  train accuracy:		98.41 %
Epoch 6 of 50 took 14.555s
  training loss (in-iteration):		0.019007
  train accuracy:		98.43 %
Epoch 7 of 50 took 14.578s
  training loss (in-iteration):		0.017879
  train accuracy:		98.48 %
Epoch 8 of 50 took 14.627s
  training loss (in-iteration):		0.017614
  train accuracy:		98.55 %
Epoch 9 of 50 took 14.870s
  training loss (in-iteration):		0.016691
  train accuracy:		98.63 %
Epoch 10 of 50 took 14.577s
  training loss (in-iteration):		0.016184
  train accuracy:		98.60 %
Epoch 11 of 50 took 14.765s
  training 

In [174]:
# Recompile the prediction function from the current y_testing: the `predict`
# compiled earlier in the notebook is bound to the first network, so calling
# it here would not use the network that was just trained.
predict = theano.function([input_X], y_testing)
# That network was trained on X / 100, so the test features get the same scaling.
pred = predict(test_data / 100)
pred = pred.flatten()
save(pred, 'full_4.csv')

# Main quest

* Get the score improved!

No methods are illegal: ensembling, data augmentation, NN hacks. 
Just don't let test data slip into training.

The main requirement is that you implement the NN fine-tuning recipe:
### Split the raw image data
  * please do train/validation/test instead of just train/test
  * reasonable but not optimal split is 20k/2.5k/2.5k or 15k/5k/5k
### Choose which vgg layers you are going to use
  * Anything except prob is okay
  * Do not forget that vgg16 uses dropout
### Build a few layers on top of chosen "neck" layers.
  * a good idea is to just stack more layers inside the same network
  * alternative: stack on top of get_output
### Train the newly added layers for some iterations
  * you can selectively train some weights by only sending them to your optimizer
      * `lasagne.updates.mysupermegaoptimizer(loss, only_those_weights_i_wanna_train)`
  * selecting all weights from the head but not below the neck:
      * `all_params = lasagne.layers.get_all_params(new_output_layer_or_layers,trainable=True)`
      * `old_params= lasagne.layers.get_all_params(neck_layers,trainable=True)`
      * `new_params = [w for w in all_params if w not in old_params]`
  * it's crucial to monitor the network performance at this and the following steps
### Fine-tune the network body
  * probably a good idea to SAVE your new network weights now 'cuz it's easy to mess things up.
  * Moreover, saving weights periodically is a no-nonsense idea
  * even more crucial to monitor validation performance
  * main network body may need a separate, much lower learning rate
      * since updates are dictionaries, one can just compute union
      * `updates = {}`
      * `updates.update(lasagne.updates.how_i_optimize_old_weights())`
      * `updates.update(lasagne.updates.how_i_optimize_new_weights())`
      * make sure they do not have overlapping keys. Otherwise, the earlier one will be overwritten.
      * `assert len(updates) == len(old_updates) + len(new_updates)`
### PROFIT!!!
  * Evaluate the final score
  * Submit to kaggle
      * competition page https://www.kaggle.com/c/dogs-vs-cats
      * get test data https://www.kaggle.com/c/dogs-vs-cats/data
  
## Some ways to get bonus points
* explore other networks from the model zoo
* play with architecture
* 85%/90%/93%/95%/97% kaggle score (screen pls).
* data augmentation, prediction-time data augmentation
* use any more advanced fine-tuning technique you know/read anywhere
* ml hacks that benefit the final score
