Large diffs are not rendered by default.

@@ -0,0 +1,56 @@
# Stupid python path shit.
# Instead just add darknet.py to somewhere in your python path
# OK actually that might not be a great idea, idk, work in progress
# Use at your own risk. or don't, i don't care

from scipy.misc import imread
import cv2

def array_to_image(arr):
    """Convert an HxWxC numpy array into a darknet IMAGE (CHW floats in [0,1])."""
    # Reorder from HWC to CHW, the layout darknet's IMAGE struct expects.
    chw = arr.transpose(2, 0, 1)
    c = chw.shape[0]
    h = chw.shape[1]
    w = chw.shape[2]
    # Scale pixels into [0, 1] and flatten for the ctypes float array.
    flat = (chw / 255.0).flatten()
    data = dn.c_array(dn.c_float, flat)
    return dn.IMAGE(w, h, c, data)

def detect2(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
    """Run detection on an already-loaded darknet IMAGE.

    Returns a list of (class_name, probability, (x, y, w, h)) tuples,
    sorted by descending probability.  Frees the C-side probability
    buffers before returning.
    """
    boxes = dn.make_boxes(net)
    probs = dn.make_probs(net)
    num = dn.num_boxes(net)
    dn.network_detect(net, image, thresh, hier_thresh, nms, boxes, probs)
    res = [
        (meta.names[i], probs[j][i],
         (boxes[j].x, boxes[j].y, boxes[j].w, boxes[j].h))
        for j in range(num)
        for i in range(meta.classes)
        if probs[j][i] > 0
    ]
    res.sort(key=lambda det: -det[1])
    dn.free_ptrs(dn.cast(probs, dn.POINTER(dn.c_void_p)), num)
    return res

import sys, os
sys.path.append(os.path.join(os.getcwd(),'python/'))

import darknet as dn

# Darknet's own loader: dn.detect() takes an image *path* and loads it in C.
net = dn.load_net("cfg/tiny-yolo.cfg", "tiny-yolo.weights", 0)
meta = dn.load_meta("cfg/coco.data")
r = dn.detect(net, meta, "data/dog.jpg")
print r

# scipy: load the image in Python and convert it to a darknet IMAGE ourselves.
arr= imread('data/dog.jpg')
im = array_to_image(arr)
r = detect2(net, meta, im)
print r

# OpenCV: cv2.imread returns BGR channel order, so swap with rgbgr_image
# before detecting.
arr = cv2.imread('data/dog.jpg')
im = array_to_image(arr)
dn.rgbgr_image(im)
r = detect2(net, meta, im)
print r

Large diffs are not rendered by default.

@@ -0,0 +1,27 @@
# Stupid python path shit.
# Instead just add darknet.py to somewhere in your python path
# OK actually that might not be a great idea, idk, work in progress
# Use at your own risk. or don't, i don't care

import sys, os
sys.path.append(os.path.join(os.getcwd(),'python/'))

import darknet as dn
import pdb

dn.set_gpu(0)  # select GPU 0 before loading the network
net = dn.load_net("cfg/yolo-thor.cfg", "/home/pjreddie/backup/yolo-thor_final.weights", 0)
meta = dn.load_meta("cfg/thor.data")
r = dn.detect(net, meta, "data/bedroom.jpg")
print r

# And then down here you could detect a lot more images like:
# (the network is loaded once above; each detect() call reuses it)
r = dn.detect(net, meta, "data/eagle.jpg")
print r
r = dn.detect(net, meta, "data/giraffe.jpg")
print r
r = dn.detect(net, meta, "data/horses.jpg")
print r
r = dn.detect(net, meta, "data/person.jpg")
print r

@@ -0,0 +1,116 @@
#include "darknet.h"

char *dice_labels[] = {"face1","face2","face3","face4","face5","face6"};

/*
 * Train the six-class dice-face classifier.  Loads the network from
 * cfgfile (optionally resuming from weightfile), then loops forever:
 * load a batch of 1024 labeled images, run one training pass, decay the
 * learning rate 10x and checkpoint weights every 100 iterations.
 * Never returns; stop with a signal.
 */
void train_dice(char *cfgfile, char *weightfile)
{
    srand(time(0));
    char *base = basecfg(cfgfile);
    char *backup_directory = "/home/pjreddie/backup/";
    float avg_loss = -1;
    printf("%s\n", base);
    network *net = parse_network_cfg(cfgfile);
    if (weightfile) load_weights(net, weightfile);
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = 1024;
    /* Resume the iteration counter from how many images the net has seen. */
    int iter = *net->seen / imgs;
    char **labels = dice_labels;
    list *plist = get_paths("data/dice/dice.train.list");
    char **paths = (char **)list_to_array(plist);
    printf("%d\n", plist->size);
    clock_t start;
    while (1) {
        ++iter;
        start = clock();
        data train = load_data_old(paths, imgs, plist->size, labels, 6, net->w, net->h);
        printf("Loaded: %lf seconds\n", sec(clock() - start));

        start = clock();
        float loss = train_network(net, train);
        if (avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss * .9 + loss * .1;
        printf("%d: %f, %f avg, %lf seconds, %ld images\n", iter, loss, avg_loss, sec(clock() - start), *net->seen);
        free_data(train);
        /* Step the learning rate down 10x every 100 iterations... */
        if ((iter % 100) == 0) net->learning_rate *= .1;
        /* ...and checkpoint the weights on the same schedule. */
        if (iter % 100 == 0) {
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, iter);
            save_weights(net, buff);
        }
    }
}

/*
 * Evaluate the dice classifier on the validation list and print top-2
 * accuracy over all validation images.
 */
void validate_dice(char *filename, char *weightfile)
{
    network *net = parse_network_cfg(filename);
    if (weightfile) load_weights(net, weightfile);
    srand(time(0));

    list *plist = get_paths("data/dice/dice.val.list");
    char **paths = (char **)list_to_array(plist);
    int m = plist->size;
    free_list(plist);

    data val = load_data_old(paths, m, 0, dice_labels, 6, net->w, net->h);
    float *acc = network_accuracies(net, val, 2);
    printf("Validation Accuracy: %f, %d images\n", acc[0], m);
    free_data(val);
}

/*
 * Classify dice images interactively.  With a filename argument the
 * image is classified once and the function returns; otherwise image
 * paths are read from stdin in a loop until EOF.  Prints all six face
 * probabilities, highest first.
 */
void test_dice(char *cfgfile, char *weightfile, char *filename)
{
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    set_batch_network(net, 1);  /* predict one image at a time */
    srand(2222222);
    int i = 0;
    char **names = dice_labels;
    char buff[256];
    char *input = buff;
    int indexes[6];
    while(1){
        if(filename){
            /* Bound the copy and guarantee NUL termination: the original
             * strncpy(input, filename, 256) left the buffer unterminated
             * whenever filename was 256+ characters long. */
            strncpy(input, filename, 255);
            input[255] = '\0';
        }else{
            printf("Enter Image Path: ");
            fflush(stdout);
            input = fgets(input, 256, stdin);
            if(!input) return;    /* EOF on stdin */
            strtok(input, "\n");  /* strip the trailing newline */
        }
        image im = load_image_color(input, net->w, net->h);
        float *X = im.data;
        float *predictions = network_predict(net, X);
        top_predictions(net, 6, indexes);
        for(i = 0; i < 6; ++i){
            int index = indexes[i];
            printf("%s: %f\n", names[index], predictions[index]);
        }
        free_image(im);
        if (filename) break;
    }
}

/* Dispatch the dice subcommand: argv[2] selects train/test/valid,
 * argv[3] is the cfg, argv[4] optional weights, argv[5] optional image. */
void run_dice(int argc, char **argv)
{
    if (argc < 4) {
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;
    if (0 == strcmp(mode, "test")) {
        test_dice(cfg, weights, filename);
    } else if (0 == strcmp(mode, "train")) {
        train_dice(cfg, weights);
    } else if (0 == strcmp(mode, "valid")) {
        validate_dice(cfg, weights);
    }
}

Large diffs are not rendered by default.

@@ -523,7 +523,7 @@ void set_network_alpha_beta(network *net, float alpha, float beta)
void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
const char *backup_directory = "/home/pjreddie/backup/";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfg);
char *abase = basecfg(acfg);
@@ -551,36 +551,36 @@ void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, i
args.type = CLASSIFICATION_DATA;
args.threads=16;
args.classes = 1;
char *ls[2] = {const_cast<char*>("imagenet"), const_cast<char*>("zzzzzzzz")};
char *ls[2] = {"imagenet", "zzzzzzzz"};
args.labels = ls;

std::thread load_thread = load_data_in_thread(args);
pthread_t load_thread = load_data_in_thread(args);
clock_t time;

gnet->train = 1;
anet->train = 1;

int x_size = gnet->inputs*gnet->batch;
int y_size = gnet->truths*gnet->batch;
CLArray imerror = cl_make_array(0, y_size);
float *imerror = cuda_make_array(0, y_size);

float aloss_avg = -1;

if (maxbatch == 0) maxbatch = gnet->max_batches;
while (get_current_batch(gnet) < maxbatch) {
{
int cb = get_current_batch(gnet);
int cb = get_current_batch(gnet);
float alpha = (float) cb / (maxbatch/2);
if(alpha > 1) alpha = 1;
float beta = 1 - alpha;
printf("%f %f\n", alpha, beta);
printf("%f %f\n", alpha, beta);
set_network_alpha_beta(gnet, alpha, beta);
set_network_alpha_beta(anet, beta, alpha);
}

i += 1;
time=clock();
//pthread_join(load_thread, 0);
load_thread.join();
pthread_join(load_thread, 0);
train = buffer;

load_thread = load_data_in_thread(args);
@@ -591,7 +591,7 @@ void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, i
for (j = 0; j < imgs; ++j) {
train.y.vals[j][0] = 1;
gen.y.vals[j][0] = 0;
}
}
time=clock();

for (j = 0; j < gnet->subdivisions; ++j) {
@@ -626,14 +626,15 @@ void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, i
backward_network(gnet);

for(k = 0; k < gnet->batch; ++k){
int index = j*gnet->batch + k;
int index = j*gnet->batch + k;
copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
}
}
harmless_update_network_gpu(anet);

data merge = concat_data(train, gen);
float aloss = train_network(anet, merge);

#ifdef OPENCV
if(display){
image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
@@ -679,7 +680,7 @@ void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, i
void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
const char *backup_directory = "/home/pjreddie/backup/";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfg);
char *abase = basecfg(acfg);
@@ -689,8 +690,7 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
//float orig_rate = anet->learning_rate;

int i, j, k;
layer imlayer;
memset(&imlayer,0,sizeof(layer));
layer imlayer = {0};
for (i = 0; i < gnet->n; ++i) {
if (gnet->layers[i].out_c == 3) {
imlayer = gnet->layers[i];
@@ -716,18 +716,18 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
args.type = CLASSIFICATION_DATA;
args.threads=16;
args.classes = 1;
const char *ls[2] = {"imagenet", "zzzzzzzz"};
args.labels = const_cast<char**>(ls);
char *ls[2] = {"imagenet", "zzzzzzzz"};
args.labels = ls;

std::thread load_thread = load_data_in_thread(args);
pthread_t load_thread = load_data_in_thread(args);
clock_t time;

gnet->train = 1;
anet->train = 1;

int x_size = gnet->inputs*gnet->batch;
int y_size = gnet->truths*gnet->batch;
CLArray imerror = cl_make_array(0, y_size);
float *imerror = cuda_make_array(0, y_size);

//int ay_size = anet->truths*anet->batch;

@@ -739,8 +739,7 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
while (get_current_batch(gnet) < maxbatch) {
i += 1;
time=clock();

load_thread.join();
pthread_join(load_thread, 0);
train = buffer;

//translate_data_rows(train, -.5);
@@ -767,12 +766,18 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
}
/*
for(z = 0; z < 100; ++z){
printf("%f, ", gnet->input[z]);
}
printf("\n");
printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size));
*/

//cl_push_array(gnet->input_gpu, gnet->input, x_size);
//cl_push_array(gnet->truth_gpu, gnet->truth, y_size);
//cuda_push_array(gnet->input_gpu, gnet->input, x_size);
//cuda_push_array(gnet->truth_gpu, gnet->truth, y_size);
*gnet->seen += gnet->batch;
//forward_network_gpu(gnet);
forward_network(gnet);
forward_network(gnet);

fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
@@ -787,10 +792,21 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);

//printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
//printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch));

axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);

backward_network(gnet);

/*
for(k = 0; k < gnet->n; ++k){
layer l = gnet->layers[k];
cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch));
}
*/

for(k = 0; k < gnet->batch; ++k){
int index = j*gnet->batch + k;
copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
@@ -850,8 +866,6 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
save_weights(anet, buff);
}
}
if (load_thread.joinable())
load_thread.join();
char buff[256];
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
save_weights(gnet, buff);
@@ -863,8 +877,8 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
#ifdef GPU
//char *train_images = "/home/pjreddie/data/coco/train1.txt";
//char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
const char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
const char *backup_directory = "/home/pjreddie/backup/";
char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfg);
char *abase = basecfg(acfg);
@@ -873,8 +887,7 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
network *anet = load_network(acfg, aweight, clear);

int i, j, k;
layer imlayer;
memset(&imlayer,0,sizeof(layer));
layer imlayer = {0};
for (i = 0; i < net->n; ++i) {
if (net->layers[i].out_c == 3) {
imlayer = net->layers[i];
@@ -900,25 +913,25 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle

args.type = CLASSIFICATION_DATA;
args.classes = 1;
const char *ls[2] = {"imagenet"};
args.labels = const_cast<char**>(ls);
char *ls[2] = {"imagenet"};
args.labels = ls;

std::thread load_thread = load_data_in_thread(args);
pthread_t load_thread = load_data_in_thread(args);
clock_t time;

int x_size = net->inputs*net->batch;
//int y_size = x_size;
net->delta = 0;
net->train = 1;
float *pixs = (float *)calloc(x_size, sizeof(float));
float *graypixs = (float *)calloc(x_size, sizeof(float));
float *pixs = calloc(x_size, sizeof(float));
float *graypixs = calloc(x_size, sizeof(float));
//float *y = calloc(y_size, sizeof(float));

//int ay_size = anet->outputs*anet->batch;
anet->delta = 0;
anet->train = 1;

CLArray imerror = cl_make_array(0, imlayer.outputs*imlayer.batch);
float *imerror = cuda_make_array(0, imlayer.outputs*imlayer.batch);

float aloss_avg = -1;
float gloss_avg = -1;
@@ -928,8 +941,7 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
while (get_current_batch(net) < net->max_batches) {
i += 1;
time=clock();

load_thread.join();
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data_in_thread(args);

@@ -948,8 +960,8 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
for(j = 0; j < net->subdivisions; ++j){
get_next_batch(train, net->batch, j*net->batch, pixs, 0);
get_next_batch(gray, net->batch, j*net->batch, graypixs, 0);
cl_push_array(net->input_gpu, graypixs, net->inputs*net->batch);
cl_push_array(net->truth_gpu, pixs, net->truths*net->batch);
cuda_push_array(net->input_gpu, graypixs, net->inputs*net->batch);
cuda_push_array(net->truth_gpu, pixs, net->truths*net->batch);
/*
image origi = float_to_image(net->w, net->h, 3, pixs);
image grayi = float_to_image(net->w, net->h, 3, graypixs);
@@ -971,8 +983,8 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle

scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);

printf("realness %f\n", cl_mag_array(imerror, imlayer.outputs*imlayer.batch));
printf("features %f\n", cl_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs*imlayer.batch));
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
printf("features %f\n", cuda_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs*imlayer.batch));

axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net->layers[net->n-1].delta_gpu, 1);

@@ -1026,8 +1038,6 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
save_weights(anet, buff);
}
}
if (load_thread.joinable())
load_thread.join();
char buff[256];
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
save_weights(net, buff);
@@ -46,21 +46,21 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa
image delta = make_image(im.w, im.h, im.c);

#ifdef GPU
net->delta_gpu = cl_make_array(delta.data, im.w*im.h*im.c);
net->delta_gpu = cuda_make_array(delta.data, im.w*im.h*im.c);
copy_cpu(net->inputs, im.data, 1, net->input, 1);

forward_network_gpu(net);
copy_gpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);

cl_pull_array(last.delta_gpu, last.delta, last.outputs);
cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
calculate_loss(last.delta, last.delta, last.outputs, thresh);
cl_push_array(last.delta_gpu, last.delta, last.outputs);
cuda_push_array(last.delta_gpu, last.delta, last.outputs);

backward_network_gpu(net);

cl_pull_array(net->delta_gpu, delta.data, im.w*im.h*im.c);
cl_free(net->delta_gpu);
net->delta_gpu = CLArray();
cuda_pull_array(net->delta_gpu, delta.data, im.w*im.h*im.c);
cuda_free(net->delta_gpu);
net->delta_gpu = 0;
#else
printf("\nnet: %d %d %d im: %d %d %d\n", net->w, net->h, net->inputs, im.w, im.h, im.c);
copy_cpu(net->inputs, im.data, 1, net->input, 1);
@@ -140,18 +140,18 @@ void reconstruct_picture(network *net, float *features, image recon, image updat

#ifdef GPU
layer l = get_network_output_layer(net);
cl_push_array(net->input_gpu, recon.data, recon.w*recon.h*recon.c);
cuda_push_array(net->input_gpu, recon.data, recon.w*recon.h*recon.c);
//cuda_push_array(net->truth_gpu, features, net->truths);
net->delta_gpu = cl_make_array(delta.data, delta.w*delta.h*delta.c);
net->delta_gpu = cuda_make_array(delta.data, delta.w*delta.h*delta.c);

forward_network_gpu(net);
cl_push_array(l.delta_gpu, features, l.outputs);
cuda_push_array(l.delta_gpu, features, l.outputs);
axpy_gpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1);
backward_network_gpu(net);

cl_pull_array(net->delta_gpu, delta.data, delta.w*delta.h*delta.c);
cuda_pull_array(net->delta_gpu, delta.data, delta.w*delta.h*delta.c);

cl_free(net->delta_gpu);
cuda_free(net->delta_gpu);
#else
net->input = recon.data;
net->delta = delta.data;
@@ -1,5 +1,5 @@
#include "darknet.h"
//#include <sys/time.h>
#include <sys/time.h>
#include <assert.h>

void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
@@ -10,14 +10,14 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
char *base = basecfg(cfgfile);
printf("%s\n", base);
printf("%d\n", ngpus);
network **nets = (network **)calloc(ngpus, sizeof(network*));
network **nets = calloc(ngpus, sizeof(network*));

srand(time(0));
int seed = rand();
for(i = 0; i < ngpus; ++i){
srand(seed);
#ifdef GPU
cl_set_device(gpus[i]);
cuda_set_device(gpus[i]);
#endif
nets[i] = load_network(cfgfile, weightfile, clear);
nets[i]->learning_rate *= ngpus;
@@ -30,8 +30,8 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
list *options = read_data_cfg(datacfg);

const char *backup_directory = option_find_str(options, "backup", "/backup/");
const char *train_list = option_find_str(options, "train", "data/train.list");
char *backup_directory = option_find_str(options, "backup", "/backup/");
char *train_list = option_find_str(options, "train", "data/train.list");
int classes = option_find_int(options, "classes", 1);

list *plist = get_paths(train_list);
@@ -62,15 +62,15 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,

data train;
data buffer;
std::thread load_thread;
pthread_t load_thread;
args.d = &buffer;
load_thread = load_data(args);

int epoch = (*net->seen)/N;
while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
time=clock();

load_thread.join();
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data(args);

@@ -103,14 +103,12 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
save_weights(net, buff);
}
}
if (load_thread.joinable())
load_thread.join();
char buff[256];
sprintf(buff, "%s/%s.weights", backup_directory, base);
save_weights(net, buff);

free_network(net);
free_ptrs((char**)paths, plist->size);
free_ptrs((void**)paths, plist->size);
free_list(plist);
free(base);
}
@@ -166,7 +164,7 @@ void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
}
list *options = read_data_cfg(datacfg);
int classes = option_find_int(options, "classes", 1);
const char *name_list = option_find_str(options, "names", 0);
char *name_list = option_find_str(options, "names", 0);
char **names = get_labels(name_list);

if(!cap) error("Couldn't connect to webcam.\n");
@@ -175,8 +173,8 @@ void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
float fps = 0;

while(1){
std::chrono::steady_clock::time_point start, end;
start = what_time_is_it_now();
struct timeval tval_before, tval_after, tval_result;
gettimeofday(&tval_before, NULL);

image in = get_image_from_stream(cap);
image crop = center_crop_image(in, net->w, net->h);
@@ -199,9 +197,10 @@ void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_inde

cvWaitKey(10);

end = what_time_is_it_now();
float curr = 1.0f / sec(end, start);
fps = .9*fps + .1*curr;
gettimeofday(&tval_after, NULL);
timersub(&tval_after, &tval_before, &tval_result);
float curr = 1000000.f/((long int)tval_result.tv_usec);
fps = .9*fps + .1*curr;
}
#endif
}
@@ -226,7 +225,7 @@ void run_regressor(int argc, char **argv)
for(i = 0; i < len; ++i){
if (gpu_list[i] == ',') ++ngpus;
}
gpus = (int*)calloc(ngpus, sizeof(int));
gpus = calloc(ngpus, sizeof(int));
for(i = 0; i < ngpus; ++i){
gpus[i] = atoi(gpu_list);
gpu_list = strchr(gpu_list, ',')+1;
@@ -11,7 +11,7 @@ unsigned char **load_files(char *filename, int *n)
{
list *paths = get_paths(filename);
*n = paths->size;
unsigned char **contents = (unsigned char **)calloc(*n, sizeof(char *));
unsigned char **contents = calloc(*n, sizeof(char *));
int i;
node *x = paths->front;
for(i = 0; i < *n; ++i){
@@ -26,20 +26,20 @@ int *read_tokenized_data(char *filename, size_t *read)
size_t size = 512;
size_t count = 0;
FILE *fp = fopen(filename, "r");
int *d = (int*)calloc(size, sizeof(int));
int *d = calloc(size, sizeof(int));
int n, one;
one = fscanf(fp, "%d", &n);
while(one == 1){
++count;
if(count > size){
size = size*2;
d = (int*)realloc(d, size*sizeof(int));
d = realloc(d, size*sizeof(int));
}
d[count-1] = n;
one = fscanf(fp, "%d", &n);
}
fclose(fp);
d = (int*)realloc(d, count*sizeof(int));
d = realloc(d, count*sizeof(int));
*read = count;
return d;
}
@@ -49,28 +49,28 @@ char **read_tokens(char *filename, size_t *read)
size_t size = 512;
size_t count = 0;
FILE *fp = fopen(filename, "r");
char **d = (char**)calloc(size, sizeof(char *));
const char *line;
char **d = calloc(size, sizeof(char *));
char *line;
while((line=fgetl(fp)) != 0){
++count;
if(count > size){
size = size*2;
d = (char**)realloc(d, size*sizeof(char *));
d = realloc(d, size*sizeof(char *));
}
if(0==strcmp(line, "<NEWLINE>")) line = "\n";
d[count-1] = const_cast<char*>(line);
d[count-1] = line;
}
fclose(fp);
d = (char**)realloc(d, count*sizeof(char *));
d = realloc(d, count*sizeof(char *));
*read = count;
return d;
}


float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size_t len, int batch, int steps)
{
float *x = (float *)calloc(batch * steps * characters, sizeof(float));
float *y = (float *)calloc(batch * steps * characters, sizeof(float));
float *x = calloc(batch * steps * characters, sizeof(float));
float *y = calloc(batch * steps * characters, sizeof(float));
int i,j;
for(i = 0; i < batch; ++i){
for(j = 0; j < steps; ++j){
@@ -96,8 +96,8 @@ float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size
float_pair get_seq2seq_data(char **source, char **dest, int n, int characters, size_t len, int batch, int steps)
{
int i,j;
float *x = (float *)calloc(batch * steps * characters, sizeof(float));
float *y = (float *)calloc(batch * steps * characters, sizeof(float));
float *x = calloc(batch * steps * characters, sizeof(float));
float *y = calloc(batch * steps * characters, sizeof(float));
for(i = 0; i < batch; ++i){
int index = rand()%n;
//int slen = strlen(source[index]);
@@ -126,8 +126,8 @@ float_pair get_seq2seq_data(char **source, char **dest, int n, int characters, s

float_pair get_rnn_data(unsigned char *text, size_t *offsets, int characters, size_t len, int batch, int steps)
{
float *x = (float *)calloc(batch * steps * characters, sizeof(float));
float *y = (float *)calloc(batch * steps * characters, sizeof(float));
float *x = calloc(batch * steps * characters, sizeof(float));
float *y = calloc(batch * steps * characters, sizeof(float));
int i,j;
for(i = 0; i < batch; ++i){
for(j = 0; j < steps; ++j){
@@ -167,7 +167,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
size = strlen((const char*)text);
}

const char *backup_directory = "/home/pjreddie/backup/";
char *backup_directory = "/home/pjreddie/backup/";
char *base = basecfg(cfgfile);
fprintf(stderr, "%s\n", base);
float avg_loss = -1;
@@ -181,7 +181,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
int i = (*net->seen)/net->batch;

int streams = batch/steps;
size_t *offsets = (size_t *)calloc(streams, sizeof(size_t));
size_t *offsets = calloc(streams, sizeof(size_t));
int j;
for(j = 0; j < streams; ++j){
offsets[j] = rand_size_t()%size;
@@ -261,7 +261,7 @@ void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float t
for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
int c = 0;
int len = strlen(seed);
float *input = (float*)calloc(inputs, sizeof(float));
float *input = calloc(inputs, sizeof(float));

/*
fill_cpu(inputs, 0, input, 1);
@@ -314,7 +314,7 @@ void test_tactic_rnn_multi(char *cfgfile, char *weightfile, int num, float temp,
int i, j;
for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
int c = 0;
float *input = (float *)calloc(inputs, sizeof(float));
float *input = calloc(inputs, sizeof(float));
float *out = 0;

while(1){
@@ -359,7 +359,7 @@ void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int r
int i, j;
for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
int c = 0;
float *input = (float *)calloc(inputs, sizeof(float));
float *input = calloc(inputs, sizeof(float));
float *out = 0;

while((c = getc(stdin)) != EOF){
@@ -395,7 +395,7 @@ void valid_tactic_rnn(char *cfgfile, char *weightfile, char *seed)
int words = 1;
int c;
int len = strlen(seed);
float *input = (float *)calloc(inputs, sizeof(float));
float *input = calloc(inputs, sizeof(float));
int i;
for(i = 0; i < len; ++i){
c = seed[i];
@@ -444,7 +444,7 @@ void valid_char_rnn(char *cfgfile, char *weightfile, char *seed)
int words = 1;
int c;
int len = strlen(seed);
float *input = (float *)calloc(inputs, sizeof(float));
float *input = calloc(inputs, sizeof(float));
int i;
for(i = 0; i < len; ++i){
c = seed[i];
@@ -480,7 +480,7 @@ void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)

int c;
int seed_len = strlen(seed);
float *input = (float *)calloc(inputs, sizeof(float));
float *input = calloc(inputs, sizeof(float));
int i;
char *line;
while((line=fgetl(stdin)) != 0){
@@ -506,7 +506,7 @@ void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)

layer l = net->layers[0];
#ifdef GPU
cl_pull_array(l.output_gpu, l.output, l.outputs);
cuda_pull_array(l.output_gpu, l.output, l.outputs);
#endif
printf("%s", line);
for(i = 0; i < l.outputs; ++i){
@@ -0,0 +1,208 @@
#include "darknet.h"

#ifdef OPENCV
/* Helpers implemented elsewhere (OpenCV capture and image reconstruction). */
image get_image_from_stream(CvCapture *cap);
image ipl_to_image(IplImage* src);

void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters);


/* Two pointers into one shared feature buffer; get_rnn_vid_data() below
 * sets y = x + one time step's worth of features. */
typedef struct {
float *x;
float *y;
} float_pair;

/*
 * Sample `batch` random clips of net.batch consecutive frames from the
 * given video files, run each clip through the extractor network `net`,
 * and return the features interleaved so that p.y is p.x advanced by
 * one time step.  Requires net.batch == steps + 1.  Caller frees p.x.
 */
float_pair get_rnn_vid_data(network net, char **files, int n, int batch, int steps)
{
    int b;
    assert(net.batch == steps + 1);
    image out_im = get_network_image(&net);
    int output_size = out_im.w*out_im.h*out_im.c;
    printf("%d %d %d\n", out_im.w, out_im.h, out_im.c);
    float *feats = calloc(net.batch*batch*output_size, sizeof(float));
    for(b = 0; b < batch; ++b){
        int input_size = net.w*net.h*net.c;
        float *input = calloc(input_size*net.batch, sizeof(float));
        char *filename = files[rand()%n];
        CvCapture *cap = cvCaptureFromFile(filename);
        int frames = cvGetCaptureProperty(cap, CV_CAP_PROP_FRAME_COUNT);
        /* Reject short clips BEFORE sampling a start index: the original
         * computed rand() % (frames - steps - 2) first, which divides by
         * zero (or a negative modulus) when the video is too short.
         * Also release the capture on this path so it does not leak. */
        if (frames < (steps + 4)){
            --b;
            free(input);
            cvReleaseCapture(&cap);
            continue;
        }
        int index = rand() % (frames - steps - 2);

        printf("frames: %d, index: %d\n", frames, index);
        cvSetCaptureProperty(cap, CV_CAP_PROP_POS_FRAMES, index);

        int i;
        for(i = 0; i < net.batch; ++i){
            IplImage* src = cvQueryFrame(cap);
            image im = ipl_to_image(src);
            rgbgr_image(im);
            image re = resize_image(im, net.w, net.h);
            //show_image(re, "loaded");
            //cvWaitKey(10);
            memcpy(input + i*input_size, re.data, input_size*sizeof(float));
            free_image(im);
            free_image(re);
        }
        float *output = network_predict(&net, input);

        free(input);

        /* Lay features out as [time step][clip] so a one-step shift is a
         * flat offset of batch*output_size floats. */
        for(i = 0; i < net.batch; ++i){
            memcpy(feats + (b + i*batch)*output_size, output + i*output_size, output_size*sizeof(float));
        }

        cvReleaseCapture(&cap);
    }

    //printf("%d %d %d\n", out_im.w, out_im.h, out_im.c);
    float_pair p = {0};
    p.x = feats;
    p.y = feats + output_size*batch; //+ out_im.w*out_im.h*out_im.c;

    return p;
}


/*
 * Train the frame-prediction RNN: repeatedly sample video clips, turn
 * them into feature sequences with a pretrained extractor network, and
 * train `net` to predict each time step's features from the previous
 * step's.  Runs until net->max_batches, checkpointing along the way.
 */
void train_vid_rnn(char *cfgfile, char *weightfile)
{
char *train_videos = "data/vid/train.txt";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
float avg_loss = -1;
network *net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(net, weightfile);
}
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
int imgs = net->batch*net->subdivisions;
// Resume the iteration counter from the number of images already seen.
int i = *net->seen/imgs;

list *plist = get_paths(train_videos);
int N = plist->size;
char **paths = (char **)list_to_array(plist);
clock_t time;
int steps = net->time_steps;
// Number of independent clips per training batch.
int batch = net->batch / net->time_steps;

// Pretrained feature extractor that produces the RNN's inputs/targets.
network *extractor = parse_network_cfg("cfg/extractor.cfg");
load_weights(extractor, "/home/pjreddie/trained/yolo-coco.conv");

while(get_current_batch(net) < net->max_batches){
i += 1;
time=clock();
float_pair p = get_rnn_vid_data(*extractor, paths, N, batch, steps);

// p.y is p.x shifted by one time step (see get_rnn_vid_data), so the
// truth for each step is the next step's features.
copy_cpu(net->inputs*net->batch, p.x, 1, net->input, 1);
copy_cpu(net->truths*net->batch, p.y, 1, net->truth, 1);
float loss = train_network_datum(net) / (net->batch);


free(p.x);
if (avg_loss < 0) avg_loss = loss;
avg_loss = avg_loss*.9 + loss*.1;

fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time));
// Checkpoints: numbered snapshot every 100 iters, rolling .backup every 10.
if(i%100==0){
char buff[256];
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
save_weights(net, buff);
}
if(i%10==0){
char buff[256];
sprintf(buff, "%s/%s.backup", backup_directory, base);
save_weights(net, buff);
}
}
char buff[256];
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
save_weights(net, buff);
}


/*
 * Reconstruct an image from feature vector `feat` — seeded from *init
 * when given, otherwise from random noise — save it as "<name><i>", and
 * return it (caller frees).
 */
image save_reconstruction(network net, image *init, float *feat, char *name, int i)
{
    image recon = init ? copy_image(*init)
                       : make_random_image(net.w, net.h, 3);

    image update = make_image(net.w, net.h, 3);
    reconstruct_picture(net, feat, recon, update, .01, .9, .1, 2, 50);
    char buff[256];
    sprintf(buff, "%s%d", name, i);
    save_image(recon, buff);
    free_image(update);
    return recon;
}

/*
 * Roll the trained RNN forward: feed 25 real video frames through the
 * extractor + RNN (saving a reconstruction of both real and predicted
 * features each step), then generate 30 further steps purely from the
 * RNN's own predictions, seeding the reconstruction from the last real
 * frame.
 */
void generate_vid_rnn(char *cfgfile, char *weightfile)
{
network *extractor = parse_network_cfg("cfg/extractor.recon.cfg");
load_weights(extractor, "/home/pjreddie/trained/yolo-coco.conv");

network *net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(net, weightfile);
}
// Single-image batches for both networks.
set_batch_network(extractor, 1);
set_batch_network(net, 1);

int i;
CvCapture *cap = cvCaptureFromFile("/extra/vid/ILSVRC2015/Data/VID/snippets/val/ILSVRC2015_val_00007030.mp4");
float *feat;
float *next;
image last;
for(i = 0; i < 25; ++i){
image im = get_image_from_stream(cap);
image re = resize_image(im, extractor->w, extractor->h);
feat = network_predict(extractor, re.data);
if(i > 0){
// Diagnostics comparing real features to last step's prediction.
// NOTE(review): axpy presumably does next -= feat (BLAS axpy with
// a = -1) — this clobbers `next`, but it is reassigned just below,
// so only the printed mse is affected.  Confirm against blas.c.
printf("%f %f\n", mean_array(feat, 14*14*512), variance_array(feat, 14*14*512));
printf("%f %f\n", mean_array(next, 14*14*512), variance_array(next, 14*14*512));
printf("%f\n", mse_array(feat, 14*14*512));
axpy_cpu(14*14*512, -1, feat, 1, next, 1);
printf("%f\n", mse_array(next, 14*14*512));
}
next = network_predict(net, feat);

free_image(im);

free_image(save_reconstruction(*extractor, 0, feat, "feat", i));
free_image(save_reconstruction(*extractor, 0, next, "next", i));
if (i==24) last = copy_image(re); // keep the final real frame as the seed
free_image(re);
}
// Free-running generation: feed each prediction back in as the next input.
for(i = 0; i < 30; ++i){
next = network_predict(net, next);
image new = save_reconstruction(*extractor, &last, next, "new", i);
free_image(last);
last = new;
}
}

// Command-line entry point for the video RNN: dispatches to train/generate.
void run_vid_rnn(int argc, char **argv)
{
    // Need at least: program, subcommand, mode, cfg.
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    if(strcmp(mode, "train") == 0){
        train_vid_rnn(cfg, weights);
    }else if(strcmp(mode, "generate") == 0){
        generate_vid_rnn(cfg, weights);
    }
}
#else
void run_vid_rnn(int argc, char **argv){}
#endif

@@ -1,5 +1,5 @@
#include "darknet.h"
//#include <sys/time.h>
#include <sys/time.h>
#include <assert.h>

void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int display)
@@ -10,14 +10,14 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
char *base = basecfg(cfgfile);
printf("%s\n", base);
printf("%d\n", ngpus);
network **nets = (network **)calloc(ngpus, sizeof(network*));
network **nets = calloc(ngpus, sizeof(network*));

srand(time(0));
int seed = rand();
for(i = 0; i < ngpus; ++i){
srand(seed);
#ifdef GPU
cl_set_device(gpus[i]);
cuda_set_device(gpus[i]);
#endif
nets[i] = load_network(cfgfile, weightfile, clear);
nets[i]->learning_rate *= ngpus;
@@ -35,8 +35,8 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
list *options = read_data_cfg(datacfg);

const char *backup_directory = option_find_str(options, "backup", "/backup/");
const char *train_list = option_find_str(options, "train", "data/train.list");
char *backup_directory = option_find_str(options, "backup", "/backup/");
char *train_list = option_find_str(options, "train", "data/train.list");

list *plist = get_paths(train_list);
char **paths = (char **)list_to_array(plist);
@@ -67,15 +67,15 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,

data train;
data buffer;
std::thread load_thread;
pthread_t load_thread;
args.d = &buffer;
load_thread = load_data(args);

int epoch = (*net->seen)/N;
while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
time=clock();

load_thread.join();
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data(args);

@@ -122,14 +122,12 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
save_weights(net, buff);
}
}
if (load_thread.joinable())
load_thread.join();
char buff[256];
sprintf(buff, "%s/%s.weights", backup_directory, base);
save_weights(net, buff);

free_network(net);
free_ptrs((char**)paths, plist->size);
free_ptrs((void**)paths, plist->size);
free_list(plist);
free(base);
}
@@ -198,8 +196,8 @@ void demo_segmenter(char *datacfg, char *cfg, char *weights, int cam_index, cons
float fps = 0;

while(1){
std::chrono::steady_clock::time_point start, end;
start = what_time_is_it_now();
struct timeval tval_before, tval_after, tval_result;
gettimeofday(&tval_before, NULL);

image in = get_image_from_stream(cap);
image in_s = letterbox_image(in, net->w, net->h);
@@ -220,9 +218,10 @@ void demo_segmenter(char *datacfg, char *cfg, char *weights, int cam_index, cons

cvWaitKey(10);

end = what_time_is_it_now();
float curr = 1.0f / sec(end, start);
fps = .9*fps + .1*curr;
gettimeofday(&tval_after, NULL);
timersub(&tval_after, &tval_before, &tval_result);
float curr = 1000000.f/((long int)tval_result.tv_usec);
fps = .9*fps + .1*curr;
}
#endif
}
@@ -247,7 +246,7 @@ void run_segmenter(int argc, char **argv)
for(i = 0; i < len; ++i){
if (gpu_list[i] == ',') ++ngpus;
}
gpus = (int*)calloc(ngpus, sizeof(int));
gpus = calloc(ngpus, sizeof(int));
for(i = 0; i < ngpus; ++i){
gpus[i] = atoi(gpu_list);
gpu_list = strchr(gpu_list, ',')+1;
@@ -2,8 +2,8 @@

void train_super(char *cfgfile, char *weightfile, int clear)
{
const char *train_images = "/data/imagenet/imagenet1k.train.list";
const char *backup_directory = "/home/pjreddie/backup/";
char *train_images = "/data/imagenet/imagenet1k.train.list";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
@@ -29,13 +29,13 @@ void train_super(char *cfgfile, char *weightfile, int clear)
args.d = &buffer;
args.type = SUPER_DATA;

std::thread load_thread = load_data_in_thread(args);
pthread_t load_thread = load_data_in_thread(args);
clock_t time;
//while(i*imgs < N*120){
while(get_current_batch(net) < net->max_batches){
i += 1;
time=clock();
load_thread.join();
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data_in_thread(args);

@@ -59,8 +59,6 @@ void train_super(char *cfgfile, char *weightfile, int clear)
}
free_data(train);
}
if (load_thread.joinable())
load_thread.join();
char buff[256];
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
save_weights(net, buff);
@@ -0,0 +1,83 @@
#include "darknet.h"
#include <sys/time.h>

// Train a region-based detector on VOC 2007+2012 trainval.
// cfgfile: network description; weightfile: optional starting weights (may be NULL).
// NOTE(review): training-list and backup paths are hard-coded below.
void train_swag(char *cfgfile, char *weightfile)
{
char *train_images = "data/voc.0712.trainval";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
float avg_loss = -1;
network *net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(net, weightfile);
}
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
// One optimizer step consumes batch*subdivisions images; `i` resumes from
// the number of steps already seen in the loaded weights.
int imgs = net->batch*net->subdivisions;
int i = *net->seen/imgs;
data train, buffer;

// Detection geometry is taken from the final (detection) layer.
layer l = net->layers[net->n - 1];

int side = l.side;
int classes = l.classes;
float jitter = l.jitter;

list *plist = get_paths(train_images);
//int N = plist->size;
char **paths = (char **)list_to_array(plist);

load_args args = {0};
args.w = net->w;
args.h = net->h;
args.paths = paths;
args.n = imgs;
args.m = plist->size;
args.classes = classes;
args.jitter = jitter;
args.num_boxes = side;
args.d = &buffer;
args.type = REGION_DATA;

// Double-buffered loading: a worker thread fills `buffer` while we train
// on the previously loaded `train` batch.
pthread_t load_thread = load_data_in_thread(args);
clock_t time;
//while(i*imgs < N*120){
while(get_current_batch(net) < net->max_batches){
i += 1;
time=clock();
// Wait for the loader, take its batch, and immediately start the next load.
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data_in_thread(args);

printf("Loaded: %lf seconds\n", sec(clock()-time));

time=clock();
float loss = train_network(net, train);
if (avg_loss < 0) avg_loss = loss;
// Exponential moving average of the loss for smoother logging.
avg_loss = avg_loss*.9 + loss*.1;

printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
// Periodic checkpoints (plus one extra early snapshot at step 600).
if(i%1000==0 || i == 600){
char buff[256];
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
save_weights(net, buff);
}
free_data(train);
}
char buff[256];
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
save_weights(net, buff);
}

// Command-line entry point for the swag detector; only "train" is supported.
void run_swag(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    if(strcmp(argv[2], "train") == 0){
        train_swag(cfg, weights);
    }
}
@@ -5,7 +5,7 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
srand(time(0));
float avg_loss = -1;
char *base = basecfg(cfgfile);
const char *backup_directory = "/home/pjreddie/backup/";
char *backup_directory = "/home/pjreddie/backup/";
printf("%s\n", base);
network *net = load_network(cfgfile, weightfile, clear);
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
@@ -15,7 +15,7 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
printf("%d\n", plist->size);
int N = plist->size;
clock_t time;
std::thread load_thread;
pthread_t load_thread;
data train;
data buffer;

@@ -45,7 +45,7 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
int epoch = (*net->seen)/N;
while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
time=clock();
load_thread.join();
pthread_join(load_thread, 0);
train = buffer;

load_thread = load_data_in_thread(args);
@@ -68,15 +68,14 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
save_weights(net, buff);
}
}
if (load_thread.joinable())
load_thread.join();
char buff[256];
sprintf(buff, "%s/%s.weights", backup_directory, base);
save_weights(net, buff);

pthread_join(load_thread, 0);
free_data(buffer);
free_network(net);
free_ptrs((char**)paths, plist->size);
free_ptrs((void**)paths, plist->size);
free_list(plist);
free(base);
}
@@ -0,0 +1,161 @@
#include "darknet.h"

// Extract aligned left/right crops from a stereo video pair and save them as
// "<prefix>_NNNNN_l" / "<prefix>_NNNNN_r" image files. Requires OpenCV.
// lfile/rfile: paths to the left/right video streams.
void extract_voxel(char *lfile, char *rfile, char *prefix)
{
#ifdef OPENCV
// Output crop size; assumes source frames are at least 1920x1080.
int w = 1920;
int h = 1080;
int shift = 0;
int count = 0;
CvCapture *lcap = cvCaptureFromFile(lfile);
CvCapture *rcap = cvCaptureFromFile(rfile);
while(1){
image l = get_image_from_stream(lcap);
image r = get_image_from_stream(rcap);
// A zero-width image signals end of stream.
if(!l.w || !r.w) break;
// Re-estimate the vertical alignment shift every 100 frames.
if(count%100 == 0) {
shift = best_3d_shift_r(l, r, -l.h/100, l.h/100);
printf("%d\n", shift);
}
image ls = crop_image(l, (l.w - w)/2, (l.h - h)/2, w, h);
// The right view is also offset horizontally by a fixed 105 px —
// presumably the stereo baseline for this rig; TODO confirm.
image rs = crop_image(r, 105 + (r.w - w)/2, (r.h - h)/2 + shift, w, h);
char buff[256];
sprintf(buff, "%s_%05d_l", prefix, count);
save_image(ls, buff);
sprintf(buff, "%s_%05d_r", prefix, count);
save_image(rs, buff);
free_image(l);
free_image(r);
free_image(ls);
free_image(rs);
++count;
}

#else
printf("need OpenCV for extraction\n");
#endif
}

// Train the voxel (super-resolution-style) network on ImageNet images.
// cfgfile: network description; weightfile: optional starting weights (may be NULL).
// NOTE(review): training-list and backup paths are hard-coded below.
void train_voxel(char *cfgfile, char *weightfile)
{
char *train_images = "/data/imagenet/imagenet1k.train.list";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
float avg_loss = -1;
network *net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(net, weightfile);
}
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
// One optimizer step consumes batch*subdivisions images; `i` resumes from
// the step count implied by weights already seen.
int imgs = net->batch*net->subdivisions;
int i = *net->seen/imgs;
data train, buffer;


list *plist = get_paths(train_images);
//int N = plist->size;
char **paths = (char **)list_to_array(plist);

load_args args = {0};
args.w = net->w;
args.h = net->h;
// 4x upscaling factor for SUPER_DATA label generation.
args.scale = 4;
args.paths = paths;
args.n = imgs;
args.m = plist->size;
args.d = &buffer;
args.type = SUPER_DATA;

// Double-buffered loading: a worker thread fills `buffer` while training
// runs on the previous batch.
pthread_t load_thread = load_data_in_thread(args);
clock_t time;
//while(i*imgs < N*120){
while(get_current_batch(net) < net->max_batches){
i += 1;
time=clock();
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data_in_thread(args);

printf("Loaded: %lf seconds\n", sec(clock()-time));

time=clock();
float loss = train_network(net, train);
if (avg_loss < 0) avg_loss = loss;
// Exponential moving average of the loss for smoother logging.
avg_loss = avg_loss*.9 + loss*.1;

printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
// Numbered checkpoint every 1000 steps; rolling ".backup" every 100.
if(i%1000==0){
char buff[256];
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
save_weights(net, buff);
}
if(i%100==0){
char buff[256];
sprintf(buff, "%s/%s.backup", backup_directory, base);
save_weights(net, buff);
}
free_data(train);
}
char buff[256];
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
save_weights(net, buff);
}

// Run the voxel network on a single image (or interactively on images typed
// at the prompt when `filename` is NULL) and save the output as "out".
// cfgfile: network description; weightfile: optional weights; filename: optional input image.
void test_voxel(char *cfgfile, char *weightfile, char *filename)
{
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    // BUG FIX: `net` is already a network*; the original passed `&net`
    // (a network**) to set_batch_network/resize_network. Every sibling
    // (e.g. test_writing) passes the pointer directly.
    set_batch_network(net, 1);
    srand(2222222);

    clock_t time;
    char buff[256];
    char *input = buff;
    while(1){
        if(filename){
            strncpy(input, filename, 256);
        }else{
            printf("Enter Image Path: ");
            fflush(stdout);
            input = fgets(input, 256, stdin);
            if(!input) return;
            // Strip the trailing newline left by fgets.
            strtok(input, "\n");
        }
        image im = load_image_color(input, 0, 0);
        // Adapt the network to this image's dimensions.
        resize_network(net, im.w, im.h);
        printf("%d %d\n", im.w, im.h);

        float *X = im.data;
        time=clock();
        network_predict(net, X);
        // NOTE(review): `out` appears to be a view into the network's output
        // and is not freed here — confirm get_network_image ownership.
        image out = get_network_image(net);
        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
        save_image(out, "out");

        free_image(im);
        if (filename) break;
    }
}


// Command-line entry point for the voxel network: train/test/extract.
void run_voxel(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;
    if(strcmp(mode, "train") == 0){
        train_voxel(cfg, weights);
    }else if(strcmp(mode, "test") == 0){
        test_voxel(cfg, weights, filename);
    }else if(strcmp(mode, "extract") == 0){
        extract_voxel(argv[3], argv[4], argv[5]);
    }
    // "valid" (validate_voxel) is not wired up yet.
}
@@ -0,0 +1,144 @@
#include "darknet.h"

// Train the handwriting/figure segmentation network on images listed in
// "figures.list". cfgfile: network description; weightfile: optional weights.
// NOTE(review): backup path is hard-coded below.
void train_writing(char *cfgfile, char *weightfile)
{
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
float avg_loss = -1;
char *base = basecfg(cfgfile);
printf("%s\n", base);
network *net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(net, weightfile);
}
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
// One optimizer step consumes batch*subdivisions images.
int imgs = net->batch*net->subdivisions;
list *plist = get_paths("figures.list");
char **paths = (char **)list_to_array(plist);
clock_t time;
int N = plist->size;
printf("N: %d\n", N);
// The network's output image sets the label dimensions for the loader.
image out = get_network_image(net);

data train, buffer;

load_args args = {0};
args.w = net->w;
args.h = net->h;
args.out_w = out.w;
args.out_h = out.h;
args.paths = paths;
args.n = imgs;
args.m = N;
args.d = &buffer;
args.type = WRITING_DATA;

// Double-buffered loading: a worker thread fills `buffer` while training
// runs on the previous batch.
pthread_t load_thread = load_data_in_thread(args);
int epoch = (*net->seen)/N;
while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
time=clock();
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data_in_thread(args);
printf("Loaded %lf seconds\n",sec(clock()-time));

time=clock();
float loss = train_network(net, train);

/*
image pred = float_to_image(64, 64, 1, out);
print_image(pred);
*/

/*
image im = float_to_image(256, 256, 3, train.X.vals[0]);
image lab = float_to_image(64, 64, 1, train.y.vals[0]);
image pred = float_to_image(64, 64, 1, out);
show_image(im, "image");
show_image(lab, "label");
print_image(lab);
show_image(pred, "pred");
cvWaitKey(0);
*/

// Exponential moving average of the loss for smoother logging.
if(avg_loss == -1) avg_loss = loss;
avg_loss = avg_loss*.9 + loss*.1;
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net->seen);
free_data(train);
// Checkpoint every 100 batches and at each epoch boundary.
if(get_current_batch(net)%100 == 0){
char buff[256];
sprintf(buff, "%s/%s_batch_%ld.weights", backup_directory, base, get_current_batch(net));
save_weights(net, buff);
}
if(*net->seen/N > epoch){
epoch = *net->seen/N;
char buff[256];
sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
save_weights(net, buff);
}
}
}

// Run the segmentation network on one image (or interactively when
// `filename` is NULL), threshold the prediction at 0.5, and display it.
void test_writing(char *cfgfile, char *weightfile, char *filename)
{
network *net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(net, weightfile);
}
// Inference mode: batch size 1.
set_batch_network(net, 1);
srand(2222222);
clock_t time;
char buff[256];
char *input = buff;
while(1){
if(filename){
strncpy(input, filename, 256);
}else{
printf("Enter Image Path: ");
fflush(stdout);
input = fgets(input, 256, stdin);
if(!input) return;
// Strip the trailing newline left by fgets.
strtok(input, "\n");
}

image im = load_image_color(input, 0, 0);
// Adapt the network to this image's dimensions.
resize_network(net, im.w, im.h);
printf("%d %d %d\n", im.h, im.w, im.c);
float *X = im.data;
time=clock();
network_predict(net, X);
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
image pred = get_network_image(net);

// Upsample the prediction back to input size and binarize it.
image upsampled = resize_image(pred, im.w, im.h);
image thresh = threshold_image(upsampled, .5);
pred = thresh;

show_image(pred, "prediction");
show_image(im, "orig");
#ifdef OPENCV
cvWaitKey(0);
cvDestroyAllWindows();
#endif

free_image(upsampled);
free_image(thresh);
free_image(im);
if (filename) break;
}
}

// Command-line entry point for the writing network: train/test.
void run_writing(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;
    if(strcmp(mode, "train") == 0){
        train_writing(cfg, weights);
    }else if(strcmp(mode, "test") == 0){
        test_writing(cfg, weights, filename);
    }
}

@@ -1,11 +1,11 @@
#include "darknet.h"

const char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};
char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};

void train_yolo(char *cfgfile, char *weightfile)
{
const char *train_images = "/data/voc/train.txt";
const char *backup_directory = "/home/pjreddie/backup/";
char *train_images = "/data/voc/train.txt";
char *backup_directory = "/home/pjreddie/backup/";
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
@@ -44,13 +44,13 @@ void train_yolo(char *cfgfile, char *weightfile)
args.saturation = net->saturation;
args.hue = net->hue;

std::thread load_thread = load_data_in_thread(args);
pthread_t load_thread = load_data_in_thread(args);
clock_t time;
//while(i*imgs < N*120){
while(get_current_batch(net) < net->max_batches){
i += 1;
time=clock();
load_thread.join();
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data_in_thread(args);

@@ -69,8 +69,6 @@ void train_yolo(char *cfgfile, char *weightfile)
}
free_data(train);
}
if (load_thread.joinable())
load_thread.join();
char buff[256];
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
save_weights(net, buff);
@@ -104,7 +102,7 @@ void validate_yolo(char *cfg, char *weights)
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
srand(time(0));

const char *base = "results/comp4_det_test_";
char *base = "results/comp4_det_test_";
//list *plist = get_paths("data/voc.2007.test");
list *plist = get_paths("/home/pjreddie/data/voc/2007_test.txt");
//list *plist = get_paths("data/voc.2012.test");
@@ -114,7 +112,7 @@ void validate_yolo(char *cfg, char *weights)
int classes = l.classes;

int j;
FILE **fps = (FILE **)calloc(classes, sizeof(FILE *));
FILE **fps = calloc(classes, sizeof(FILE *));
for(j = 0; j < classes; ++j){
char buff[1024];
snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
@@ -130,11 +128,11 @@ void validate_yolo(char *cfg, char *weights)
float iou_thresh = .5;

int nthreads = 8;
image *val = (image *)calloc(nthreads, sizeof(image));
image *val_resized = (image *)calloc(nthreads, sizeof(image));
image *buf = (image *)calloc(nthreads, sizeof(image));
image *buf_resized = (image *)calloc(nthreads, sizeof(image));
std::thread *thr = new std::thread[nthreads];
image *val = calloc(nthreads, sizeof(image));
image *val_resized = calloc(nthreads, sizeof(image));
image *buf = calloc(nthreads, sizeof(image));
image *buf_resized = calloc(nthreads, sizeof(image));
pthread_t *thr = calloc(nthreads, sizeof(pthread_t));

load_args args = {0};
args.w = net->w;
@@ -151,7 +149,7 @@ void validate_yolo(char *cfg, char *weights)
for(i = nthreads; i < m+nthreads; i += nthreads){
fprintf(stderr, "%d\n", i);
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
thr[t].join();
pthread_join(thr[t], 0);
val[t] = buf[t];
val_resized[t] = buf_resized[t];
}
@@ -178,7 +176,6 @@ void validate_yolo(char *cfg, char *weights)
free_image(val_resized[t]);
}
}
delete[]thr;
fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
}

@@ -189,7 +186,7 @@ void validate_yolo_recall(char *cfg, char *weights)
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
srand(time(0));

const char *base = "results/comp4_det_test_";
char *base = "results/comp4_det_test_";
list *plist = get_paths("data/voc.2007.test");
char **paths = (char **)list_to_array(plist);

@@ -198,7 +195,7 @@ void validate_yolo_recall(char *cfg, char *weights)
int side = l.side;

int j, k;
FILE **fps = (FILE **)calloc(classes, sizeof(FILE *));
FILE **fps = calloc(classes, sizeof(FILE *));
for(j = 0; j < classes; ++j){
char buff[1024];
snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
@@ -292,14 +289,14 @@ void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh)
time=clock();
network_predict(net, X);
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));

int nboxes = 0;
detection *dets = get_network_boxes(net, 1, 1, thresh, 0, 0, 0, &nboxes);
if (nms) do_nms_sort(dets, l.side*l.side*l.n, l.classes, nms);
draw_detections(im, dets, l.side*l.side*l.n, thresh, const_cast<char**>(voc_names), alphabet, 20);

draw_detections(im, dets, l.side*l.side*l.n, thresh, voc_names, alphabet, 20);
save_image(im, "predictions");
show_image(im, "predictions");

free_detections(dets, nboxes);
free_image(im);
free_image(sized);
@@ -326,10 +323,9 @@ void run_yolo(int argc, char **argv)
char *cfg = argv[3];
char *weights = (argc > 4) ? argv[4] : 0;
char *filename = (argc > 5) ? argv[5]: 0;
char** voc_class_names = const_cast<char**>(voc_names);
if(0==strcmp(argv[2], "test")) test_yolo(cfg, weights, filename, thresh);
else if(0==strcmp(argv[2], "train")) train_yolo(cfg, weights);
else if(0==strcmp(argv[2], "valid")) validate_yolo(cfg, weights);
else if(0==strcmp(argv[2], "recall")) validate_yolo_recall(cfg, weights);
else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_class_names, 20, frame_skip, prefix, avg, .5, 0,0,0,0);
else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix, avg, .5, 0,0,0,0);
}

Large diffs are not rendered by default.

@@ -0,0 +1,200 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "activations.h"
#include "cuda.h"
}


// Per-element activation functions, evaluated in device code.
// Each takes the pre-activation value x and returns the activated value.

// "Leaky hard tanh": identity on [0,1], slope .001 outside.
__device__ float lhtan_activate_kernel(float x)
{
if(x < 0) return .001f*x;
if(x > 1) return .001f*(x-1.f) + 1.f;
return x;
}
__device__ float lhtan_gradient_kernel(float x)
{
if(x > 0 && x < 1) return 1;
// NOTE(review): double literal (.001) in a float function — harmless but
// inconsistent with the .001f used above.
return .001;
}

// Hard tanh: clamp to [-1, 1].
__device__ float hardtan_activate_kernel(float x)
{
if (x < -1) return -1;
if (x > 1) return 1;
return x;
}
__device__ float linear_activate_kernel(float x){return x;}
__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}
// Logistic rescaled to (-1, 1).
__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;}
__device__ float relu_activate_kernel(float x){return x*(x>0);}
__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;}
// ReLU plus a small linear leak everywhere.
__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;}
__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;}
__device__ float tanh_activate_kernel(float x){return (2.f/(1 + expf(-2*x)) - 1);}
// Piecewise-linear sigmoid approximation: slope .125 in [-4,4], .01 outside.
__device__ float plse_activate_kernel(float x)
{
if(x < -4) return .01f * (x + 4);
if(x > 4) return .01f * (x - 4) + 1;
return .125f*x + .5f;
}
// Staircase: flat on even unit intervals, slope 1 on odd ones.
__device__ float stair_activate_kernel(float x)
{
int n = floorf(x);
if (n%2 == 0) return floorf(x/2);
else return (x - n) + floorf(x/2);
}


// Per-element activation gradients. For several of these (logistic, tanh,
// loggy, elu) the formula is expressed in terms of the ACTIVATED output —
// e.g. (1-x)*x for logistic — so x here is presumably the layer output, not
// the pre-activation; confirm against the callers.
__device__ float hardtan_gradient_kernel(float x)
{
if (x > -1 && x < 1) return 1;
return 0;
}
__device__ float linear_gradient_kernel(float x){return 1;}
__device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
__device__ float loggy_gradient_kernel(float x)
{
// Map the (-1,1) output back to logistic (0,1) and apply y*(1-y), scaled by 2.
float y = (x+1)/2;
return 2*(1-y)*y;
}
__device__ float relu_gradient_kernel(float x){return (x>0);}
__device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;}
__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;}
__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;}
__device__ float tanh_gradient_kernel(float x){return 1-x*x;}
__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
// Zero gradient exactly on the flat step edges, 1 elsewhere.
__device__ float stair_gradient_kernel(float x)
{
if (floorf(x) == x) return 0;
return 1;
}

// Dispatch a single activation by enum tag. Returns 0 for unknown tags.
__device__ float activate_kernel(float x, ACTIVATION a)
{
switch(a){
case LINEAR:
return linear_activate_kernel(x);
case LOGISTIC:
return logistic_activate_kernel(x);
case LOGGY:
return loggy_activate_kernel(x);
case RELU:
return relu_activate_kernel(x);
case ELU:
return elu_activate_kernel(x);
case RELIE:
return relie_activate_kernel(x);
case RAMP:
return ramp_activate_kernel(x);
case LEAKY:
return leaky_activate_kernel(x);
case TANH:
return tanh_activate_kernel(x);
case PLSE:
return plse_activate_kernel(x);
case STAIR:
return stair_activate_kernel(x);
case HARDTAN:
return hardtan_activate_kernel(x);
case LHTAN:
return lhtan_activate_kernel(x);
}
return 0;
}

// Dispatch a single activation gradient by enum tag. Returns 0 for unknown tags.
__device__ float gradient_kernel(float x, ACTIVATION a)
{
switch(a){
case LINEAR:
return linear_gradient_kernel(x);
case LOGISTIC:
return logistic_gradient_kernel(x);
case LOGGY:
return loggy_gradient_kernel(x);
case RELU:
return relu_gradient_kernel(x);
case ELU:
return elu_gradient_kernel(x);
case RELIE:
return relie_gradient_kernel(x);
case RAMP:
return ramp_gradient_kernel(x);
case LEAKY:
return leaky_gradient_kernel(x);
case TANH:
return tanh_gradient_kernel(x);
case PLSE:
return plse_gradient_kernel(x);
case STAIR:
return stair_gradient_kernel(x);
case HARDTAN:
return hardtan_gradient_kernel(x);
case LHTAN:
return lhtan_gradient_kernel(x);
}
return 0;
}

// Backward pass of the pairwise binary "multiply" activation: for each output
// gradient dy[id], propagate to the two paired inputs x1=x[b*s+i] and
// x2=x[b*s+s/2+i] as dx1 = x2*dy, dx2 = x1*dy.
// One thread per output element; n = number of outputs.
// BUG FIX: the original loaded x1/x2 from global memory BEFORE the id<n
// bounds check, so tail threads (id >= n) performed out-of-bounds reads.
// All loads now happen inside the guard.
__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;
    int i = id % s;
    int b = id / s;
    float x1 = x[b*s + i];
    float x2 = x[b*s + s/2 + i];
    float de = dy[id];
    dx[b*s + i] = x2*de;
    dx[b*s + s/2 + i] = x1*de;
}

// Host launcher for binary_gradient_array_kernel. Launches n/2 threads
// because each thread handles one input *pair*.
extern "C" void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y)
{
binary_gradient_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, dx, n/2, size, a, y);
check_error(cudaPeekAtLastError());
}
// Forward pass of the pairwise binary activation: y[id] = x1 * x2, where
// x1 and x2 are the paired elements x[b*s+i] and x[b*s+s/2+i].
// One thread per output element; n = number of outputs.
// BUG FIX: the original loaded x1/x2 BEFORE the id<n bounds check, so tail
// threads (id >= n) performed out-of-bounds global reads. The guard now
// precedes all memory accesses.
__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;
    int i = id % s;
    int b = id / s;
    float x1 = x[b*s + i];
    float x2 = x[b*s + s/2 + i];
    y[id] = x1*x2;
}

// Host launcher for binary_activate_array_kernel. Launches n/2 threads
// because each thread consumes one input pair and writes one output.
extern "C" void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y)
{
binary_activate_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, n/2, size, a, y);
check_error(cudaPeekAtLastError());
}

// In-place elementwise activation over n values; one thread per element,
// bounds-guarded.
__global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(i < n) x[i] = activate_kernel(x[i], a);
}

// Scale delta[i] by the activation gradient at x[i] (chain rule), in place.
__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(i < n) delta[i] *= gradient_kernel(x[i], a);
}

// Host launcher: apply activation `a` in place to n device floats at x.
extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a)
{
activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
check_error(cudaPeekAtLastError());
}

// Host launcher: multiply delta by the activation gradient of x, in place.
extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta)
{
gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
check_error(cudaPeekAtLastError());
}
@@ -0,0 +1,61 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "avgpool_layer.h"
#include "cuda.h"
}

// Global average pooling, forward: one thread per (channel, batch) pair
// (n = c*batch). Each thread averages its channel's full w*h spatial plane
// into a single output value.
__global__ void forward_avgpool_layer_kernel(int n, int w, int h, int c, float *input, float *output)
{
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(id >= n) return;

// Decompose the flat thread id into channel k and batch index b.
int k = id % c;
id /= c;
int b = id;

int i;
int out_index = (k + c*b);
output[out_index] = 0;
for(i = 0; i < w*h; ++i){
int in_index = i + h*w*(k + b*c);
output[out_index] += input[in_index];
}
output[out_index] /= w*h;
}

// Global average pooling, backward: distribute each output gradient equally
// over its channel's w*h input positions (accumulating into in_delta).
// One thread per (channel, batch) pair; n = c*batch.
__global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float *in_delta, float *out_delta)
{
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(id >= n) return;

// Decompose the flat thread id into channel k and batch index b.
int k = id % c;
id /= c;
int b = id;

int i;
int out_index = (k + c*b);
for(i = 0; i < w*h; ++i){
int in_index = i + h*w*(k + b*c);
in_delta[in_index] += out_delta[out_index] / (w*h);
}
}

// Host launcher for the average-pool forward pass: one thread per
// (channel, batch) output element.
extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network net)
{
size_t n = layer.c*layer.batch;

forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.input_gpu, layer.output_gpu);
check_error(cudaPeekAtLastError());
}

// Host launcher for the average-pool backward pass; accumulates into
// net.delta_gpu.
extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network net)
{
size_t n = layer.c*layer.batch;

backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.delta_gpu, layer.delta_gpu);
check_error(cudaPeekAtLastError());
}

Large diffs are not rendered by default.

@@ -0,0 +1,59 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"
#include "darknet.h"

extern "C" {
#include "col2im.h"
#include "cuda.h"
}

// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE

// col2im: accumulate values from the im2col column buffer (data_col) back
// into the image (data_im). Grid-stride loop, one iteration per image
// element (n = channels*height*width). Each element sums every column-buffer
// slot that the corresponding input pixel contributed to in im2col.
// Adapted from Caffe; keep the index algebra exactly as-is — it encodes the
// (kernel-offset, output-position) -> column-buffer mapping in closed form.
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
const int height, const int width, const int ksize,
const int pad,
const int stride,
const int height_col, const int width_col,
float *data_im) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
for(; index < n; index += blockDim.x*gridDim.x){
float val = 0;
// Recover padded pixel coordinates (w, h) and channel c from the flat index.
int w = index % width + pad;
int h = (index / width) % height + pad;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
int w_col_end = min(w / stride + 1, width_col);
int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
int h_col_end = min(h / stride + 1, height_col);
// equivalent implementation
int offset =
(c * ksize * ksize + h * ksize + w) * height_col * width_col;
int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
int coeff_w_col = (1 - stride * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
// Accumulate (+=): callers may pre-seed data_im with existing gradients.
data_im[index] += val;
}
}

// Host launcher for col2im_gpu_kernel: scatters/accumulates the column
// buffer back into the image. One kernel thread per image element.
void col2im_gpu(float *data_col,
        int channels, int height, int width,
        int ksize, int stride, int pad, float *data_im){
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height * width;
    col2im_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
        BLOCK>>>(
                num_kernels, data_col, height, width, ksize, pad,
                stride, height_col,
                width_col, data_im);
    // CONSISTENCY FIX: every other launch wrapper in these files checks the
    // launch result; this one silently dropped launch-configuration errors.
    check_error(cudaPeekAtLastError());
}

@@ -0,0 +1,330 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "gemm.h"
#include "blas.h"
#include "im2col.h"
#include "col2im.h"
#include "utils.h"
#include "cuda.h"
}

// Sign-binarize n values: binary[i] = +1 if x[i] >= 0 else -1.
// One thread per element, bounds-guarded.
__global__ void binarize_kernel(float *x, int n, float *binary)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (i >= n) return;
binary[i] = (x[i] >= 0) ? 1 : -1;
}

// Host launcher for binarize_kernel.
void binarize_gpu(float *x, int n, float *binary)
{
binarize_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, binary);
check_error(cudaPeekAtLastError());
}

// XNOR-style input binarization: for each spatial position s (one thread per
// position, `size` positions), compute the mean absolute value over the n
// channel entries at that position, then write +mean/-mean by sign.
// Layout assumed: input[i*size + s], i.e. channel-major over positions.
__global__ void binarize_input_kernel(float *input, int n, int size, float *binary)
{
int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (s >= size) return;
int i = 0;
float mean = 0;
for(i = 0; i < n; ++i){
mean += fabsf(input[i*size + s]);
}
mean = mean / n;
for(i = 0; i < n; ++i){
binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
}
}

// Host launcher for binarize_input_kernel (one thread per spatial position).
void binarize_input_gpu(float *input, int n, int size, float *binary)
{
binarize_input_kernel<<<cuda_gridsize(size), BLOCK>>>(input, n, size, binary);
check_error(cudaPeekAtLastError());
}


// Binarize each filter's weights to +/- mean(|w|) of that filter.
// One thread per filter f; `size` is the number of weights per filter.
__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
{
    int f = (blockIdx.y*gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (f >= n) return;

    float sum = 0;
    int i;
    for (i = 0; i < size; ++i) {
        sum += fabsf(weights[f*size + i]);
    }
    float mean = sum / size;

    for (i = 0; i < size; ++i) {
        binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
    }
}

// Launch binarize_weights_kernel with one thread per filter.
void binarize_weights_gpu(float *weights, int n, int size, float *binary)
{
    dim3 grid = cuda_gridsize(n);
    binarize_weights_kernel<<<grid, BLOCK>>>(weights, n, size, binary);
    check_error(cudaPeekAtLastError());
}

// Forward pass of a (possibly grouped, binary, or XNOR) convolutional layer.
// Writes activations into l.output_gpu; may temporarily repoint l.weights_gpu
// and net.input_gpu at binarized copies via swap_binary.
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
    if(l.binary){
        // Binary nets: convolve with binarized weights. swap_binary makes
        // l.weights_gpu point at the binarized copy until swapped back below.
        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
    }

    if(l.xnor){
        // XNOR nets binarize both weights and the input activations.
        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
        binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
        net.input_gpu = l.binary_input_gpu;
    }

#ifdef CUDNN
    // cuDNN path: single fused convolution using the pre-selected algorithm.
    // beta == 1 accumulates into the (just-zeroed) output buffer.
    float one = 1;
    cudnnConvolutionForward(cudnn_handle(),
                &one,
                l.srcTensorDesc,
                net.input_gpu,
                l.weightDesc,
                l.weights_gpu,
                l.convDesc,
                l.fw_algo,
                net.workspace,
                l.workspace_size,
                &one,
                l.dstTensorDesc,
                l.output_gpu);

#else
    // Fallback path: im2col + GEMM per image and per group.
    // m = filters per group, k = weights per filter, n = output positions.
    int i, j;
    int m = l.n/l.groups;
    int k = l.size*l.size*l.c/l.groups;
    int n = l.out_w*l.out_h;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights_gpu + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output_gpu + (i*l.groups + j)*n*m;
            float *im = net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w;

            if (l.size == 1){
                // 1x1 convolution: the image is already in column layout.
                b = im;
            } else {
                im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            }
            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }
#endif

    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, net);
    } else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }

    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
    //if(l.dot > 0) dot_error_gpu(l);
    // Restore the real weight pointers before returning.
    if(l.binary || l.xnor) swap_binary(&l);
}

// Accumulates a smoothness gradient into `delta`: each position in x is pulled
// toward the values in its size x size neighborhood, scaled by `rate`.
// Layout is BCHW with dims (w, h, c); out-of-bounds neighbors contribute 0.
__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    // Decompose the flat id into (b, k, i, j) coordinates.
    int j = id % w;
    id /= w;
    int i = id % h;
    id /= h;
    int k = id % c;
    id /= c;
    int b = id;

    // Center the window on (i, j). Integer division replaces the original
    // float-divide-then-truncate (-(size/2.f)), which yields the identical
    // value for every positive size.
    int w_offset = -(size/2);
    int h_offset = -(size/2);

    int out_index = j + w*(i + h*(k + c*b));
    int l, m;
    for(l = 0; l < size; ++l){
        for(m = 0; m < size; ++m){
            int cur_h = h_offset + i + l;
            int cur_w = w_offset + j + m;
            int index = cur_w + w*(cur_h + h*(k + b*c));
            int valid = (cur_h >= 0 && cur_h < h &&
                    cur_w >= 0 && cur_w < w);
            delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0;
        }
    }
}

// Launches smooth_kernel over the layer's output, accumulating a smoothness
// term into l.delta_gpu with neighborhood `size` and strength `rate`.
extern "C" void smooth_layer(layer l, int size, float rate)
{
    int h = l.out_h;
    int w = l.out_w;
    int c = l.out_c;

    size_t n = h*w*c*l.batch;

    // NOTE(review): n is sized from out_w/out_h/out_c but the kernel receives
    // l.w/l.h/l.c — these agree only when input and output dims match; confirm.
    smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, l.w, l.h, l.c, size, rate, l.delta_gpu);
    check_error(cudaPeekAtLastError());
}

// Backward pass of a convolutional layer: accumulates weight/bias gradients
// and (if net.delta_gpu is set) propagates the error to the previous layer.
void backward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    if(l.smooth){
        smooth_layer(l, 5, l.smooth);
    }
    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    // Fold the activation derivative into the incoming delta.
    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);


    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, net);
    } else {
        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
    // Keep the pre-binarization input for the XNOR gradient below.
    float *original_input = net.input_gpu;

    if(l.xnor) net.input_gpu = l.binary_input_gpu;
#ifdef CUDNN
    float one = 1;
    // Accumulate dL/dW into weight_updates_gpu (beta == 1).
    cudnnConvolutionBackwardFilter(cudnn_handle(),
            &one,
            l.srcTensorDesc,
            net.input_gpu,
            l.ddstTensorDesc,
            l.delta_gpu,
            l.convDesc,
            l.bf_algo,
            net.workspace,
            l.workspace_size,
            &one,
            l.dweightDesc,
            l.weight_updates_gpu);

    if(net.delta_gpu){
        // Data gradient uses the (binarized, when applicable) weights.
        if(l.binary || l.xnor) swap_binary(&l);
        cudnnConvolutionBackwardData(cudnn_handle(),
                &one,
                l.weightDesc,
                l.weights_gpu,
                l.ddstTensorDesc,
                l.delta_gpu,
                l.convDesc,
                l.bd_algo,
                net.workspace,
                l.workspace_size,
                &one,
                l.dsrcTensorDesc,
                net.delta_gpu);
        if(l.binary || l.xnor) swap_binary(&l);
        // XNOR nets back-prop through the hard-tanh of the real input.
        if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
    }

#else
    // GEMM path: m = filters per group, n = weights per filter, k = positions.
    int m = l.n/l.groups;
    int n = l.size*l.size*l.c/l.groups;
    int k = l.out_w*l.out_h;

    int i, j;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.delta_gpu + (i*l.groups + j)*m*k;
            float *b = net.workspace;
            float *c = l.weight_updates_gpu + j*l.nweights/l.groups;

            float *im  = net.input_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
            float *imd = net.delta_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;

            // dL/dW += delta * im2col(input)^T
            im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

            if (net.delta_gpu) {
                if (l.binary || l.xnor) swap_binary(&l);
                a = l.weights_gpu + j*l.nweights/l.groups;
                b = l.delta_gpu + (i*l.groups + j)*m*k;
                c = net.workspace;
                if (l.size == 1) {
                    // 1x1: write the data gradient straight into imd.
                    c = imd;
                }

                // dL/dInput(cols) = W^T * delta
                gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);

                if (l.size != 1) {
                    col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
                }
                if(l.binary || l.xnor) {
                    swap_binary(&l);
                }
            }
            // XNOR nets back-prop through the hard-tanh of the real input.
            if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
        }
    }
#endif
}

// Copy all learned parameters and their pending updates device -> host.
void pull_convolutional_layer(layer l)
{
    cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
    cuda_pull_array(l.biases_gpu, l.biases, l.n);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm adds per-filter scales plus running statistics.
        cuda_pull_array(l.scales_gpu, l.scales, l.n);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}

// Copy all learned parameters and their pending updates host -> device.
void push_convolutional_layer(layer l)
{
    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
    cuda_push_array(l.biases_gpu, l.biases, l.n);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm adds per-filter scales plus running statistics.
        cuda_push_array(l.scales_gpu, l.scales, l.n);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}

// Apply one optimizer step (Adam or SGD+momentum with weight decay) to the
// layer's weights, biases and, when present, batchnorm scales.
void update_convolutional_layer_gpu(layer l, update_args a)
{
    float lr = a.learning_rate*l.learning_rate_scale;
    int batch = a.batch;

    if(a.adam){
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, a.decay, lr, l.nweights, batch, a.t);
        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, a.decay, lr, l.n, batch, a.t);
        if(l.scales_gpu){
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, a.decay, lr, l.n, batch, a.t);
        }
    } else {
        // SGD: fold weight decay into the accumulated gradient, take the step,
        // then damp the accumulator by momentum for the next batch.
        axpy_gpu(l.nweights, -a.decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_gpu(l.nweights, lr/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_gpu(l.nweights, a.momentum, l.weight_updates_gpu, 1);

        axpy_gpu(l.n, lr/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_gpu(l.n, a.momentum, l.bias_updates_gpu, 1);

        if(l.scales_gpu){
            axpy_gpu(l.n, lr/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_gpu(l.n, a.momentum, l.scale_updates_gpu, 1);
        }
    }
    // Optional hard clipping of weight magnitudes.
    if(l.clip){
        constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
    }
}


@@ -0,0 +1,230 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "crop_layer.h"
#include "utils.h"
#include "cuda.h"
#include "image.h"
}

// Zero-padded pixel fetch from a CHW-layout image.
__device__ float get_pixel_kernel(float *image, int w, int h, int x, int y, int c)
{
    int inside = (x >= 0 && x < w && y >= 0 && y < h);
    return inside ? image[x + w*(y + c*h)] : 0;
}

// Convert an RGB triple to HSV; hue is returned in [0, 6) (sector units),
// or -1 when the color is black and hue is undefined.
__device__ float3 rgb_to_hsv_kernel(float3 rgb)
{
    float r = rgb.x, g = rgb.y, b = rgb.z;

    float max = fmaxf(r, fmaxf(g, b));
    float min = fminf(r, fminf(g, b));
    float delta = max - min;

    float h, s;
    float v = max;
    if (max == 0) {
        s = 0;
        h = -1;
    } else {
        s = delta/max;
        if (r == max) {
            h = (g - b) / delta;
        } else if (g == max) {
            h = 2 + (b - r) / delta;
        } else {
            h = 4 + (r - g) / delta;
        }
        if (h < 0) h += 6;
    }
    return make_float3(h, s, v);
}

// Convert HSV (hue in sector units [0, 6)) back to RGB, clamped to [0, 1].
__device__ float3 hsv_to_rgb_kernel(float3 hsv)
{
    float h = hsv.x, s = hsv.y, v = hsv.z;
    float r, g, b;

    if (s == 0) {
        // No chroma: grey level.
        r = g = b = v;
    } else {
        int sector = (int) floorf(h);
        float f = h - sector;
        float p = v*(1-s);
        float q = v*(1-s*f);
        float t = v*(1-s*(1-f));
        switch (sector) {
            case 0:  r = v; g = t; b = p; break;
            case 1:  r = q; g = v; b = p; break;
            case 2:  r = p; g = v; b = t; break;
            case 3:  r = p; g = q; b = v; break;
            case 4:  r = t; g = p; b = v; break;
            default: r = v; g = p; b = q; break;
        }
    }
    r = (r < 0) ? 0 : ((r > 1) ? 1 : r);
    g = (g < 0) ? 0 : ((g > 1) ? 1 : g);
    b = (b < 0) ? 0 : ((b > 1) ? 1 : b);
    return make_float3(r, g, b);
}

// Bilinear sample at fractional position (x, y) in channel c.
// Out-of-bounds taps read as 0 via get_pixel_kernel.
__device__ float bilinear_interpolate_kernel(float *image, int w, int h, float x, float y, int c)
{
    int ix = (int) floorf(x);
    int iy = (int) floorf(y);
    float dx = x - ix;
    float dy = y - iy;

    float tl = get_pixel_kernel(image, w, h, ix,   iy,   c);
    float bl = get_pixel_kernel(image, w, h, ix,   iy+1, c);
    float tr = get_pixel_kernel(image, w, h, ix+1, iy,   c);
    float br = get_pixel_kernel(image, w, h, ix+1, iy+1, c);

    return (1-dy) * (1-dx) * tl +
            dy    * (1-dx) * bl +
           (1-dy) *  dx    * tr +
            dy    *  dx    * br;
}

// Per-pixel color augmentation: random saturation/exposure jitter in HSV space
// (training only) plus a global rescale/translate and per-channel shift.
// One thread per pixel of a 3-channel CHW batch; rand holds 8 draws per image,
// with rand[0..2] reused across the batch as the RGB shift draws.
__global__ void levels_image_kernel(float *image, float *rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift)
{
    int size = batch * w * h;
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= size) return;
    // Decompose flat id into (x, y); the remainder is the image index.
    int x = id % w;
    id /= w;
    int y = id % h;
    id /= h;
    float rshift = rand[0];
    float gshift = rand[1];
    float bshift = rand[2];
    float r0 = rand[8*id + 0];
    float r1 = rand[8*id + 1];
    float r2 = rand[8*id + 2];
    float r3 = rand[8*id + 3];

    // Map uniform draws to jitter factors in [1/x, x]; a second draw decides
    // whether to use the factor or its reciprocal.
    saturation = r0*(saturation - 1) + 1;
    saturation = (r1 > .5f) ? 1.f/saturation : saturation;
    exposure = r2*(exposure - 1) + 1;
    exposure = (r3 > .5f) ? 1.f/exposure : exposure;

    size_t offset = id * h * w * 3;
    image += offset;
    float r = image[x + w*(y + h*0)];
    float g = image[x + w*(y + h*1)];
    float b = image[x + w*(y + h*2)];
    float3 rgb = make_float3(r,g,b);
    if(train){
        float3 hsv = rgb_to_hsv_kernel(rgb);
        hsv.y *= saturation;
        hsv.z *= exposure;
        rgb = hsv_to_rgb_kernel(hsv);
    } else {
        // No random channel shift at inference time.
        shift = 0;
    }
    image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5f)*shift;
    image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5f)*shift;
    image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5f)*shift;
}

// Random crop + flip + rotation. One thread per output element; each samples
// the input bilinearly at its rotated, offset source position.
// rand holds 8 draws per image; this kernel consumes draws 4..7 (levels_image_kernel
// uses 0..3).
__global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= size) return;

    // Rotation center = middle of the INPUT image.
    float cx = w/2.f;
    float cy = h/2.f;

    // Decompose flat id into (b, k, i, j); keep the original as out index.
    int count = id;
    int j = id % crop_width;
    id /= crop_width;
    int i = id % crop_height;
    id /= crop_height;
    int k = id % c;
    id /= c;
    int b = id;

    float r4 = rand[8*b + 4];
    float r5 = rand[8*b + 5];
    float r6 = rand[8*b + 6];
    float r7 = rand[8*b + 7];

    // Random crop offset, coin-flip horizontal flip, and angle in [-angle, angle].
    float dw = (w - crop_width)*r4;
    float dh = (h - crop_height)*r5;
    flip = (flip && (r6 > .5f));
    angle = 2*angle*r7 - angle;
    if(!train){
        // Deterministic center crop at inference.
        dw = (w - crop_width)/2.f;
        dh = (h - crop_height)/2.f;
        flip = 0;
        angle = 0;
    }

    input += w*h*c*b;

    float x = (flip) ? w - dw - j - 1 : j + dw;
    float y = i + dh;

    // Rotate (x, y) about the image center by `angle`.
    float rx = cosf(angle)*(x-cx) - sinf(angle)*(y-cy) + cx;
    float ry = sinf(angle)*(x-cx) + cosf(angle)*(y-cy) + cy;

    output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
}

// Forward pass of the crop layer: color-jitters the input in place, then
// writes a randomly cropped/flipped/rotated view into layer.output_gpu.
extern "C" void forward_crop_layer_gpu(crop_layer layer, network net)
{
    // Local prototypes for helpers declared elsewhere in the project.
    void cuda_random(float *x_gpu, size_t n);
    float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
    dim3 cuda_gridsize(size_t n);
    void check_error(cudaError_t status);

    // 8 uniform draws per image, shared by both kernels below.
    cuda_random(layer.rand_gpu, layer.batch*8);

    float radians = layer.angle*3.14159265f/180.f;

    // Default remap of pixel values x -> 2x - 1; noadjust leaves them as-is.
    float scale = 2;
    float translate = -1;
    if(layer.noadjust){
        scale = 1;
        translate = 0;
    }

    int size = layer.batch * layer.w * layer.h;

    levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, layer.batch, layer.w, layer.h, net.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
    check_error(cudaPeekAtLastError());

    size = layer.batch*layer.c*layer.out_w*layer.out_h;

    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, net.train, layer.flip, radians, layer.output_gpu);
    check_error(cudaPeekAtLastError());

    // Debug visualization (disabled): pulls crops back and shows them.
    /*
       cuda_pull_array(layer.output_gpu, layer.output, size);
       image im = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 0*(size/layer.batch));
       image im2 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 1*(size/layer.batch));
       image im3 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 2*(size/layer.batch));
       translate_image(im, -translate);
       scale_image(im, 1/scale);
       translate_image(im2, -translate);
       scale_image(im2, 1/scale);
       translate_image(im3, -translate);
       scale_image(im3, 1/scale);
       show_image(im, "cropped");
       show_image(im2, "cropped2");
       show_image(im3, "cropped3");
       cvWaitKey(0);
     */
}

@@ -0,0 +1,139 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "convolutional_layer.h"
#include "deconvolutional_layer.h"
#include "batchnorm_layer.h"
#include "gemm.h"
#include "blas.h"
#include "im2col.h"
#include "col2im.h"
#include "utils.h"
#include "cuda.h"
}

// Forward pass of a deconvolutional (transposed convolution) layer:
// per image, W^T * input produces a column buffer that col2im scatters
// into the (larger) output image.
extern "C" void forward_deconvolutional_layer_gpu(layer l, network net)
{
    int i;

    // GEMM dims: m = weights per output channel stack, n = input positions,
    // k = input channels.
    int m = l.size*l.size*l.n;
    int n = l.h*l.w;
    int k = l.c;

    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    for(i = 0; i < l.batch; ++i){
        float *a = l.weights_gpu;
        float *b = net.input_gpu + i*l.c*l.h*l.w;
        float *c = net.workspace;

        // c = a^T * b : columns of the output image.
        gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n);

        col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
    }
    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, net);
    } else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
    activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
}

// Backward pass of a deconvolutional layer: accumulates weight/bias gradients
// and (if net.delta_gpu is set) propagates the error to the previous layer.
extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
{
    int i;

    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    // Fold the activation derivative into the incoming delta.
    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, net);
    } else {
        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }

    //if(net.delta_gpu) memset(net.delta_gpu, 0, l.batch*l.h*l.w*l.c*sizeof(float));

    for(i = 0; i < l.batch; ++i){
        int m = l.c;
        int n = l.size*l.size*l.n;
        int k = l.h*l.w;

        float *a = net.input_gpu + i*m*k;
        float *b = net.workspace;
        float *c = l.weight_updates_gpu;

        // dL/dW += input * im2col(delta)^T
        im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w,
                l.size, l.stride, l.pad, b);
        gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

        if(net.delta_gpu){
            // Data gradient: W * im2col(delta), accumulated into net.delta_gpu.
            int m = l.c;
            int n = l.h*l.w;
            int k = l.size*l.size*l.n;

            float *a = l.weights_gpu;
            float *b = net.workspace;
            float *c = net.delta_gpu + i*n*m;

            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }
}

// Copy all learned parameters and their pending updates device -> host.
extern "C" void pull_deconvolutional_layer(layer l)
{
    size_t nweights = l.c*l.n*l.size*l.size;
    cuda_pull_array(l.weights_gpu, l.weights, nweights);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, nweights);
    cuda_pull_array(l.biases_gpu, l.biases, l.n);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm adds per-filter scales plus running statistics.
        cuda_pull_array(l.scales_gpu, l.scales, l.n);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}

// Copy all learned parameters and their pending updates host -> device.
extern "C" void push_deconvolutional_layer(layer l)
{
    size_t nweights = l.c*l.n*l.size*l.size;
    cuda_push_array(l.weights_gpu, l.weights, nweights);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, nweights);
    cuda_push_array(l.biases_gpu, l.biases, l.n);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm adds per-filter scales plus running statistics.
        cuda_push_array(l.scales_gpu, l.scales, l.n);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}

// Apply one optimizer step (Adam or SGD+momentum with weight decay) to the
// layer's weights, biases and, when present, batchnorm scales.
void update_deconvolutional_layer_gpu(layer l, update_args a)
{
    float lr = a.learning_rate*l.learning_rate_scale;
    int batch = a.batch;

    if(a.adam){
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, a.decay, lr, l.nweights, batch, a.t);
        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, a.decay, lr, l.n, batch, a.t);
        if(l.scales_gpu){
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, a.decay, lr, l.n, batch, a.t);
        }
    } else {
        // SGD: fold weight decay into the accumulated gradient, take the step,
        // then damp the accumulator by momentum for the next batch.
        axpy_gpu(l.nweights, -a.decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_gpu(l.nweights, lr/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_gpu(l.nweights, a.momentum, l.weight_updates_gpu, 1);

        axpy_gpu(l.n, lr/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_gpu(l.n, a.momentum, l.bias_updates_gpu, 1);

        if(l.scales_gpu){
            axpy_gpu(l.n, lr/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_gpu(l.n, a.momentum, l.scale_updates_gpu, 1);
        }
    }
}

@@ -0,0 +1,47 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "dropout_layer.h"
#include "utils.h"
#include "cuda.h"
void check_error(cudaError_t status);
cublasHandle_t blas_handle();
int *cuda_make_int_array(int *x, size_t n);
void cuda_random(float *x_gpu, size_t n);
float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
dim3 cuda_gridsize(size_t n);
}

// Inverted dropout: zero units whose random draw falls below prob and scale
// the survivors so the expected activation magnitude is unchanged.
__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
{
    int id = (blockIdx.y*gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (id >= size) return;
    input[id] = (rand[id] < prob) ? 0 : input[id]*scale;
}

// Forward dropout: active only while training; identity at inference.
void forward_dropout_layer_gpu(dropout_layer layer, network net)
{
    if (!net.train) return;
    int n = layer.inputs*layer.batch;

    // Fresh uniform draws for every activation; the backward pass reuses them.
    cuda_random(layer.rand_gpu, n);

    yoloswag420blazeit360noscope<<<cuda_gridsize(n), BLOCK>>>(net.input_gpu, n, layer.rand_gpu, layer.probability, layer.scale);
    check_error(cudaPeekAtLastError());
}

// Backward dropout: apply the same mask (same random draws) to the deltas.
void backward_dropout_layer_gpu(dropout_layer layer, network net)
{
    if (!net.delta_gpu) return;
    int n = layer.inputs*layer.batch;

    yoloswag420blazeit360noscope<<<cuda_gridsize(n), BLOCK>>>(net.delta_gpu, n, layer.rand_gpu, layer.probability, layer.scale);
    check_error(cudaPeekAtLastError());
}
@@ -0,0 +1,62 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "im2col.h"
#include "cuda.h"
#include "darknet.h"
}

// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE

// Unfold a single image into a column buffer for convolution-as-GEMM.
// One (grid-stride) iteration per output column element; n = channels *
// height_col * width_col. Padding positions read as 0.
__global__ void im2col_gpu_kernel(const int n, const float* data_im,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float *data_col) {
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    // Grid-stride loop: correct for any grid size.
    for(; index < n; index += blockDim.x*gridDim.x){
        // Decompose index into (channel_in, h_out, w_out).
        int w_out = index % width_col;
        int h_index = index / width_col;
        int h_out = h_index % height_col;
        int channel_in = h_index / height_col;
        int channel_out = channel_in * ksize * ksize;
        // Top-left corner of this receptive field in the padded image.
        int h_in = h_out * stride - pad;
        int w_in = w_out * stride - pad;
        float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
        const float* data_im_ptr = data_im;
        data_im_ptr += (channel_in * height + h_in) * width + w_in;
        // Copy the ksize x ksize patch; successive taps land height_col *
        // width_col apart in the column buffer (row-per-tap layout).
        for (int i = 0; i < ksize; ++i) {
            for (int j = 0; j < ksize; ++j) {
                int h = h_in + i;
                int w = w_in + j;

                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
                    data_im_ptr[i * width + j] : 0;

                //*data_col_ptr = data_im_ptr[ii * width + jj];

                data_col_ptr += height_col * width_col;
            }
        }
    }
}

// Unfold image `im` into column buffer `data_col` for convolution-as-GEMM.
// channels/height/width: image dimensions; ksize/stride/pad: conv geometry.
void im2col_gpu(float *im,
        int channels, int height, int width,
        int ksize, int stride, int pad, float *data_col){
    // One thread per output column element (channels * height_col * width_col);
    // the kernel's grid-stride loop covers any remainder.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height_col * width_col;
    im2col_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
        BLOCK>>>(
                num_kernels, im, height, width, ksize, pad,
                stride, height_col,
                width_col, data_col);
    // Surface launch-configuration errors, matching the other launchers here.
    check_error(cudaPeekAtLastError());
}