In [1]:
%%bash
# Download the pre-trained ResNet50 Caffe2 model files (uncomment the wget lines below to fetch them)
#wget https://github.com/leonardvandriel/caffe2_models/raw/master/model/resnet50_init_net.pb
#wget https://github.com/leonardvandriel/caffe2_models/raw/master/model/resnet50_predict_net.pb

In [2]:
# Locations of the serialized ResNet50 init-net and predict-net protobufs,
# given relative to the notebook's working directory.
init_net_loc, predict_net_loc = "resnet50_init_net.pb", "resnet50_predict_net.pb"

In [3]:
import os
import sys
import numpy as np
import caffe2
from caffe2.proto import caffe2_pb2
from caffe2.python import model_helper, core, workspace, models
from common.params_inf import *
from common.utils import *

In [4]:
# Expose only GPU 0 to CUDA so the benchmark runs on a single device.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [5]:
# Report the platform, interpreter, NumPy and GPU/CUDA/cuDNN details for this run.
# get_gpu_name, get_cuda_version and get_cudnn_version are not defined in this
# notebook; presumably they come from the star-import of common.utils — TODO confirm.
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Numpy:  1.14.1
GPU:  ['Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB']
CUDA Version 8.0.61
CuDNN Version  6.0.21


In [6]:
# Generate BATCH_SIZE * BATCHES_GPU synthetic samples. give_fake_data returns
# two arrays; per the printed shapes below they are the channels-last ("cl")
# and channels-first ("cf") layouts.
fake_input_data_cl, fake_input_data_cf = give_fake_data(BATCH_SIZE * BATCHES_GPU)
print(*(arr.shape for arr in (fake_input_data_cl, fake_input_data_cf)))

(1280, 224, 224, 3) (1280, 3, 224, 224)


In [7]:
def load_net(INIT_NET, PREDICT_NET, device_opts):
    """Load a serialized Caffe2 init/predict net pair into the workspace.

    Args:
        INIT_NET: Path to the serialized init NetDef protobuf. The net is
            executed once via ``workspace.RunNetOnce``.
        PREDICT_NET: Path to the serialized predict NetDef protobuf. The net
            is registered via ``workspace.CreateNet`` with ``overwrite=True``.
        device_opts: Device option copied onto both nets as their net-level
            ``device_option``.

    Returns:
        The ``name`` field of the predict NetDef, usable to run the net later.
    """
    def _parse(path):
        # Deserialize a NetDef from disk and pin it to the requested device.
        proto = caffe2_pb2.NetDef()
        with open(path, 'rb') as fh:
            proto.ParseFromString(fh.read())
        proto.device_option.CopyFrom(device_opts)
        return proto

    # The init net is run before the predict net file is read, as before.
    workspace.RunNetOnce(_parse(INIT_NET).SerializeToString())

    predict_def = _parse(PREDICT_NET)
    workspace.CreateNet(predict_def.SerializeToString(), overwrite=True)
    return predict_def.name

In [8]:
def predict_fn(classifier, data, batchsize, device_opts):
    """Run ``classifier`` over ``data`` in mini-batches and collect 'pool5' outputs.

    Args:
        classifier: Name of a net already created in the Caffe2 workspace.
        data: Input samples; ``len(data)`` sets the number of output rows.
        batchsize: Mini-batch size passed to ``yield_mb_X`` and used to compute
            each batch's row offset in the result.
        device_opts: Device option used when feeding the "data" blob.

    Returns:
        A float32 array of shape ``(len(data), RESNET_FEATURES)``.
    """
    features = np.zeros((len(data), RESNET_FEATURES), np.float32)
    for batch_no, batch in yield_mb_X(data, batchsize):
        workspace.FeedBlob("data", batch, device_option=device_opts)
        workspace.RunNet(classifier, 1)
        # Row window of the result that this mini-batch fills.
        start = batch_no * batchsize
        features[start:start + batchsize] = workspace.FetchBlob('pool5').squeeze()
    return features

In [9]:
# Build a CUDA device option for device id 0, then load both nets with it.
device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)
test_net = load_net(
    INIT_NET=init_net_loc,
    PREDICT_NET=predict_net_loc,
    device_opts=device_opts,
)

In [10]:
# First, untimed pass over the data; its result is kept separate from the
# timed run that follows (presumably a warm-up so one-time start-up cost is
# excluded from the measurement — TODO confirm).
cold_start = predict_fn(
    classifier=test_net,
    data=fake_input_data_cf,
    batchsize=BATCH_SIZE,
    device_opts=device_opts,
)

In [11]:
%%time
# Timed feature-extraction pass over the channels-first fake data.
features = predict_fn(
    classifier=test_net,
    data=fake_input_data_cf,
    batchsize=BATCH_SIZE,
    device_opts=device_opts,
)

CPU times: user 8.28 s, sys: 1.84 s, total: 10.1 s
Wall time: 10.1 s


In [13]:
# Throughput = number of images processed / elapsed seconds.
# NOTE(review): the elapsed time is hard-coded; 10.1 matches the "Wall time"
# printed by the %%time cell above and must be updated by hand after a re-run.
elapsed_s = 10.1
print("Images per second {}".format((BATCH_SIZE * BATCHES_GPU) / elapsed_s))

Images per second 126.73267326732673
