| @@ -0,0 +1,56 @@ | ||
# Path-setup hack: this script expects the local darknet python bindings
# (python/darknet.py) to be importable from the repo root.
# Alternatively, put darknet.py somewhere on your PYTHONPATH.
# Work in progress -- use at your own risk.
|
|
||
| from scipy.misc import imread | ||
| import cv2 | ||
|
|
||
def array_to_image(arr):
    """Convert an HxWxC uint8 numpy image into a darknet IMAGE struct.

    Pixels are scaled from [0, 255] to [0, 1] and the array is reordered
    to channel-major (CxHxW) before being flattened into a C float array.
    """
    chw = arr.transpose(2, 0, 1)
    c, h, w = chw.shape[0], chw.shape[1], chw.shape[2]
    flat = (chw / 255.0).flatten()
    data = dn.c_array(dn.c_float, flat)
    return dn.IMAGE(w, h, c, data)
|
|
||
def detect2(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
    """Run detection on a pre-converted darknet IMAGE (not a file path).

    Returns a list of (class_name, probability, (x, y, w, h)) tuples
    sorted by descending probability.

    NOTE(review): `boxes` allocated by make_boxes is never freed here
    (only `probs` is) -- confirm whether the bindings expose a free for it.
    """
    boxes = dn.make_boxes(net)
    probs = dn.make_probs(net)
    num = dn.num_boxes(net)
    dn.network_detect(net, image, thresh, hier_thresh, nms, boxes, probs)
    detections = []
    for box_idx in range(num):
        for cls_idx in range(meta.classes):
            prob = probs[box_idx][cls_idx]
            if prob > 0:
                b = boxes[box_idx]
                detections.append((meta.names[cls_idx], prob, (b.x, b.y, b.w, b.h)))
    detections.sort(key=lambda d: d[1], reverse=True)
    dn.free_ptrs(dn.cast(probs, dn.POINTER(dn.c_void_p)), num)
    return detections
|
|
||
# Make the in-repo bindings importable (expects to run from the darknet root).
import sys, os
sys.path.append(os.path.join(os.getcwd(),'python/'))

import darknet as dn

# Darknet: detect straight from a file path using the built-in loader.
net = dn.load_net("cfg/tiny-yolo.cfg", "tiny-yolo.weights", 0)
meta = dn.load_meta("cfg/coco.data")
r = dn.detect(net, meta, "data/dog.jpg")
print r

# scipy: load with imread (gives RGB), convert ourselves, then detect.
arr= imread('data/dog.jpg')
im = array_to_image(arr)
r = detect2(net, meta, im)
print r

# OpenCV: cv2.imread gives BGR, so swap channels with rgbgr_image first.
arr = cv2.imread('data/dog.jpg')
im = array_to_image(arr)
dn.rgbgr_image(im)
r = detect2(net, meta, im)
print r
|
| @@ -0,0 +1,27 @@ | ||
# Path-setup hack: this script expects the local darknet python bindings
# (python/darknet.py) to be importable from the repo root.
# Alternatively, put darknet.py somewhere on your PYTHONPATH.
# Work in progress -- use at your own risk.
|
|
||
# Make the in-repo bindings importable (expects to run from the darknet root).
import sys, os
sys.path.append(os.path.join(os.getcwd(),'python/'))

import darknet as dn
import pdb

# Select GPU 0, then load a YOLO model trained on the "thor" dataset.
dn.set_gpu(0)
net = dn.load_net("cfg/yolo-thor.cfg", "/home/pjreddie/backup/yolo-thor_final.weights", 0)
meta = dn.load_meta("cfg/thor.data")
r = dn.detect(net, meta, "data/bedroom.jpg")
print r

# And then down here you could detect a lot more images like:
r = dn.detect(net, meta, "data/eagle.jpg")
print r
r = dn.detect(net, meta, "data/giraffe.jpg")
print r
r = dn.detect(net, meta, "data/horses.jpg")
print r
r = dn.detect(net, meta, "data/person.jpg")
print r
|
|
| @@ -0,0 +1,116 @@ | ||
| #include "darknet.h" | ||
|
|
||
/* Class names for the six die faces, indexed by class id. */
char *dice_labels[] = {"face1","face2","face3","face4","face5","face6"};
|
|
||
/* Train the 6-class dice-face classifier.
 *
 * cfgfile:    network description to parse.
 * weightfile: optional weights to resume from (may be NULL).
 *
 * Runs forever (while(1)); checkpoints are written to backup_directory
 * every 100 steps. */
void train_dice(char *cfgfile, char *weightfile)
{
    srand(time(0));
    float avg_loss = -1;     /* EMA of the loss; -1 means not yet seeded */
    char *base = basecfg(cfgfile);   /* cfg basename used in checkpoint names */
    char *backup_directory = "/home/pjreddie/backup/";
    printf("%s\n", base);
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = 1024;             /* images loaded per training step */
    int i = *net->seen/imgs;     /* resume the step counter from images seen */
    char **labels = dice_labels;
    list *plist = get_paths("data/dice/dice.train.list");
    char **paths = (char **)list_to_array(plist);
    printf("%d\n", plist->size);
    clock_t time;
    while(1){
        ++i;
        time=clock();
        data train = load_data_old(paths, imgs, plist->size, labels, 6, net->w, net->h);
        printf("Loaded: %lf seconds\n", sec(clock()-time));

        time=clock();
        float loss = train_network(net, train);
        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;   /* exponential moving average */
        printf("%d: %f, %f avg, %lf seconds, %ld images\n", i, loss, avg_loss, sec(clock()-time), *net->seen);
        free_data(train);
        /* NOTE(review): drops the learning rate 10x every 100 steps, so it
         * vanishes very quickly -- confirm this schedule is intended. */
        if((i % 100) == 0) net->learning_rate *= .1;
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, i);
            save_weights(net, buff);
        }
    }
}
|
|
||
/* Measure accuracy of the dice classifier on the validation list.
 * Loads the whole validation set in one shot and prints acc[0]
 * (presumably top-1; network_accuracies is asked for a second value
 * too -- confirm its meaning in network.c). */
void validate_dice(char *filename, char *weightfile)
{
    network *net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(net, weightfile);
    }
    srand(time(0));

    char **labels = dice_labels;
    list *plist = get_paths("data/dice/dice.val.list");

    char **paths = (char **)list_to_array(plist);
    int m = plist->size;
    /* NOTE(review): assumes list_to_array copied out the node pointers so
     * `paths` survives freeing the list -- confirm against list.c. */
    free_list(plist);

    data val = load_data_old(paths, m, 0, labels, 6, net->w, net->h);
    float *acc = network_accuracies(net, val, 2);
    printf("Validation Accuracy: %f, %d images\n", acc[0], m);
    free_data(val);
}
|
|
||
/* Predict dice faces for one image (or interactively from stdin) and
 * print all six class scores, most confident first.
 * With `filename` set, runs once and returns; otherwise loops on stdin. */
void test_dice(char *cfgfile, char *weightfile, char *filename)
{
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    set_batch_network(net, 1);   /* single-image inference */
    srand(2222222);
    int i = 0;
    char **names = dice_labels;
    char buff[256];
    char *input = buff;
    int indexes[6];              /* class ids ordered by confidence */
    while(1){
        if(filename){
            strncpy(input, filename, 256);
        }else{
            printf("Enter Image Path: ");
            fflush(stdout);
            input = fgets(input, 256, stdin);
            if(!input) return;    /* EOF ends the interactive loop */
            strtok(input, "\n");  /* strip the trailing newline */
        }
        image im = load_image_color(input, net->w, net->h);
        float *X = im.data;
        float *predictions = network_predict(net, X);
        top_predictions(net, 6, indexes);
        for(i = 0; i < 6; ++i){
            int index = indexes[i];
            printf("%s: %f\n", names[index], predictions[index]);
        }
        free_image(im);
        if (filename) break;      /* one-shot mode */
    }
}
|
|
||
/* Command-line entry point for the dice classifier.
 * Usage: darknet dice [train/test/valid] [cfg] [weights (optional)]
 * An optional sixth argument names an image file for "test". */
void run_dice(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *task = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;
    if(strcmp(task, "test") == 0){
        test_dice(cfg, weights, filename);
    } else if(strcmp(task, "train") == 0){
        train_dice(cfg, weights);
    } else if(strcmp(task, "valid") == 0){
        validate_dice(cfg, weights);
    }
}
|
|
| @@ -0,0 +1,208 @@ | ||
| #include "darknet.h" | ||
|
|
||
| #ifdef OPENCV | ||
| image get_image_from_stream(CvCapture *cap); | ||
| image ipl_to_image(IplImage* src); | ||
|
|
||
| void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters); | ||
|
|
||
|
|
||
/* A pair of pointers into one shared feature buffer: x is the network
 * input sequence and y the matching one-step-shifted target sequence. */
typedef struct {
    float *x;
    float *y;
} float_pair;
|
|
||
| float_pair get_rnn_vid_data(network net, char **files, int n, int batch, int steps) | ||
| { | ||
| int b; | ||
| assert(net.batch == steps + 1); | ||
| image out_im = get_network_image(&net); | ||
| int output_size = out_im.w*out_im.h*out_im.c; | ||
| printf("%d %d %d\n", out_im.w, out_im.h, out_im.c); | ||
| float *feats = calloc(net.batch*batch*output_size, sizeof(float)); | ||
| for(b = 0; b < batch; ++b){ | ||
| int input_size = net.w*net.h*net.c; | ||
| float *input = calloc(input_size*net.batch, sizeof(float)); | ||
| char *filename = files[rand()%n]; | ||
| CvCapture *cap = cvCaptureFromFile(filename); | ||
| int frames = cvGetCaptureProperty(cap, CV_CAP_PROP_FRAME_COUNT); | ||
| int index = rand() % (frames - steps - 2); | ||
| if (frames < (steps + 4)){ | ||
| --b; | ||
| free(input); | ||
| continue; | ||
| } | ||
|
|
||
| printf("frames: %d, index: %d\n", frames, index); | ||
| cvSetCaptureProperty(cap, CV_CAP_PROP_POS_FRAMES, index); | ||
|
|
||
| int i; | ||
| for(i = 0; i < net.batch; ++i){ | ||
| IplImage* src = cvQueryFrame(cap); | ||
| image im = ipl_to_image(src); | ||
| rgbgr_image(im); | ||
| image re = resize_image(im, net.w, net.h); | ||
| //show_image(re, "loaded"); | ||
| //cvWaitKey(10); | ||
| memcpy(input + i*input_size, re.data, input_size*sizeof(float)); | ||
| free_image(im); | ||
| free_image(re); | ||
| } | ||
| float *output = network_predict(&net, input); | ||
|
|
||
| free(input); | ||
|
|
||
| for(i = 0; i < net.batch; ++i){ | ||
| memcpy(feats + (b + i*batch)*output_size, output + i*output_size, output_size*sizeof(float)); | ||
| } | ||
|
|
||
| cvReleaseCapture(&cap); | ||
| } | ||
|
|
||
| //printf("%d %d %d\n", out_im.w, out_im.h, out_im.c); | ||
| float_pair p = {0}; | ||
| p.x = feats; | ||
| p.y = feats + output_size*batch; //+ out_im.w*out_im.h*out_im.c; | ||
|
|
||
| return p; | ||
| } | ||
|
|
||
|
|
||
/* Train an RNN to predict the next frame's extractor features from the
 * current frame's features. Features come from a separately loaded
 * "extractor" network applied to random video clips. */
void train_vid_rnn(char *cfgfile, char *weightfile)
{
    char *train_videos = "data/vid/train.txt";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    float avg_loss = -1;     /* EMA of the loss; -1 means not yet seeded */
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;
    int i = *net->seen/imgs;   /* resume the step counter */

    list *plist = get_paths(train_videos);
    int N = plist->size;
    char **paths = (char **)list_to_array(plist);
    clock_t time;
    int steps = net->time_steps;
    int batch = net->batch / net->time_steps;   /* independent clips per step */

    network *extractor = parse_network_cfg("cfg/extractor.cfg");
    load_weights(extractor, "/home/pjreddie/trained/yolo-coco.conv");

    while(get_current_batch(net) < net->max_batches){
        i += 1;
        time=clock();
        float_pair p = get_rnn_vid_data(*extractor, paths, N, batch, steps);

        /* p.x / p.y share one buffer; y is x shifted by one time step */
        copy_cpu(net->inputs*net->batch, p.x, 1, net->input, 1);
        copy_cpu(net->truths*net->batch, p.y, 1, net->truth, 1);
        float loss = train_network_datum(net) / (net->batch);

        free(p.x);   /* frees the whole buffer; p.y points into it */
        if (avg_loss < 0) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;

        fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time));
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        if(i%10==0){
            /* rolling .backup checkpoint, overwritten every 10 steps */
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
}
|
|
||
|
|
||
/* Reconstruct an image whose extractor features match `feat`, starting
 * from *init (or from random noise when init is NULL), save it as
 * "<name><i>", and return it. Caller owns (and must free) the result. */
image save_reconstruction(network net, image *init, float *feat, char *name, int i)
{
    image recon;
    if (init) {
        recon = copy_image(*init);
    } else {
        recon = make_random_image(net.w, net.h, 3);
    }

    image update = make_image(net.w, net.h, 3);   /* scratch momentum buffer */
    /* rate .01, momentum .9, lambda .1, smooth 2, 50 iterations */
    reconstruct_picture(net, feat, recon, update, .01, .9, .1, 2, 50);
    char buff[256];
    sprintf(buff, "%s%d", name, i);
    save_image(recon, buff);
    free_image(update);
    return recon;
}
|
|
||
/* Qualitative demo: run 25 real frames through the extractor and the RNN,
 * saving reconstructions of both the observed features ("feat") and the
 * RNN's one-step predictions ("next"), then free-run the RNN for 30 more
 * steps seeded from the last real frame ("new").
 * NOTE(review): 14*14*512 is hard-coded as the feature size -- presumably
 * the extractor's output volume; confirm against cfg/extractor.recon.cfg. */
void generate_vid_rnn(char *cfgfile, char *weightfile)
{
    network *extractor = parse_network_cfg("cfg/extractor.recon.cfg");
    load_weights(extractor, "/home/pjreddie/trained/yolo-coco.conv");

    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    set_batch_network(extractor, 1);
    set_batch_network(net, 1);

    int i;
    CvCapture *cap = cvCaptureFromFile("/extra/vid/ILSVRC2015/Data/VID/snippets/val/ILSVRC2015_val_00007030.mp4");
    float *feat;
    float *next;     /* RNN prediction from the previous iteration (valid for i > 0) */
    image last;      /* copy of the final real frame, seeds the free-run loop */
    for(i = 0; i < 25; ++i){
        image im = get_image_from_stream(cap);
        image re = resize_image(im, extractor->w, extractor->h);
        feat = network_predict(extractor, re.data);
        if(i > 0){
            /* compare last step's prediction against the features we actually got */
            printf("%f %f\n", mean_array(feat, 14*14*512), variance_array(feat, 14*14*512));
            printf("%f %f\n", mean_array(next, 14*14*512), variance_array(next, 14*14*512));
            printf("%f\n", mse_array(feat, 14*14*512));
            /* next -= feat, then report the prediction error */
            axpy_cpu(14*14*512, -1, feat, 1, next, 1);
            printf("%f\n", mse_array(next, 14*14*512));
        }
        next = network_predict(net, feat);

        free_image(im);

        free_image(save_reconstruction(*extractor, 0, feat, "feat", i));
        free_image(save_reconstruction(*extractor, 0, next, "next", i));
        if (i==24) last = copy_image(re);
        free_image(re);
    }
    for(i = 0; i < 30; ++i){
        /* feed the RNN its own output to hallucinate future frames */
        next = network_predict(net, next);
        image new = save_reconstruction(*extractor, &last, next, "new", i);
        free_image(last);
        last = new;    /* each reconstruction seeds the following one */
    }
}
|
|
||
/* Command-line entry point for the video-RNN demo.
 * Usage: darknet (this cmd) [train/generate] [cfg] [weights (optional)] */
void run_vid_rnn(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *task = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    if(strcmp(task, "train") == 0){
        train_vid_rnn(cfg, weights);
    } else if(strcmp(task, "generate") == 0){
        generate_vid_rnn(cfg, weights);
    }
}
| #else | ||
| void run_vid_rnn(int argc, char **argv){} | ||
| #endif | ||
|
|
| @@ -0,0 +1,83 @@ | ||
| #include "darknet.h" | ||
| #include <sys/time.h> | ||
|
|
||
/* Train a detection model ("swag") on the VOC 07+12 trainval list.
 * Data loading runs in a background thread, overlapped with training.
 * Checkpoints every 1000 steps (plus one at step 600) and at the end. */
void train_swag(char *cfgfile, char *weightfile)
{
    char *train_images = "data/voc.0712.trainval";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    float avg_loss = -1;     /* EMA of the loss; -1 means not yet seeded */
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;   /* images consumed per step */
    int i = *net->seen/imgs;                   /* resume the step counter */
    data train, buffer;

    /* the final (detection) layer dictates the label layout */
    layer l = net->layers[net->n - 1];

    int side = l.side;
    int classes = l.classes;
    float jitter = l.jitter;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = {0};
    args.w = net->w;
    args.h = net->h;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.classes = classes;
    args.jitter = jitter;
    args.num_boxes = side;
    args.d = &buffer;            /* background loader fills this */
    args.type = REGION_DATA;

    /* prefetch the first batch */
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    //while(i*imgs < N*120){
    while(get_current_batch(net) < net->max_batches){
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);     /* wait for the prefetched batch */
        train = buffer;
        load_thread = load_data_in_thread(args);   /* start loading the next */

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        time=clock();
        float loss = train_network(net, train);
        if (avg_loss < 0) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;

        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0 || i == 600){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        free_data(train);
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
}
|
|
||
/* Command-line entry point; only "train" is implemented.
 * Usage: darknet swag train [cfg] [weights (optional)] */
void run_swag(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *cfg = argv[3];
    char *weights = 0;
    if(argc > 4) weights = argv[4];
    if(strcmp(argv[2], "train") == 0) train_swag(cfg, weights);
}
| @@ -0,0 +1,161 @@ | ||
| #include "darknet.h" | ||
|
|
||
/* Pull matched frame pairs out of a left/right stereo video pair and save
 * them as 1920x1080 cropped stills named <prefix>_NNNNN_l / _r.
 * The vertical shift between the two streams is re-estimated every 100
 * frames with best_3d_shift_r. Requires OpenCV. */
void extract_voxel(char *lfile, char *rfile, char *prefix)
{
#ifdef OPENCV
    int w = 1920;    /* output crop size */
    int h = 1080;
    int shift = 0;   /* vertical offset applied to the right stream */
    int count = 0;
    CvCapture *lcap = cvCaptureFromFile(lfile);
    CvCapture *rcap = cvCaptureFromFile(rfile);
    while(1){
        image l = get_image_from_stream(lcap);
        image r = get_image_from_stream(rcap);
        if(!l.w || !r.w) break;    /* either stream exhausted */
        if(count%100 == 0) {
            /* search +/- 1% of the frame height for the best alignment */
            shift = best_3d_shift_r(l, r, -l.h/100, l.h/100);
            printf("%d\n", shift);
        }
        image ls = crop_image(l, (l.w - w)/2, (l.h - h)/2, w, h);
        /* NOTE(review): the right crop is offset 105px horizontally --
         * presumably a fixed disparity for this capture rig; confirm. */
        image rs = crop_image(r, 105 + (r.w - w)/2, (r.h - h)/2 + shift, w, h);
        char buff[256];
        sprintf(buff, "%s_%05d_l", prefix, count);
        save_image(ls, buff);
        sprintf(buff, "%s_%05d_r", prefix, count);
        save_image(rs, buff);
        free_image(l);
        free_image(r);
        free_image(ls);
        free_image(rs);
        ++count;
    }

#else
    printf("need OpenCV for extraction\n");
#endif
}
|
|
||
/* Train the super-resolution ("voxel") model on the ImageNet train list.
 * SUPER_DATA with scale 4: inputs are downscaled 4x, targets full-res.
 * Data loading runs in a background thread, overlapped with training. */
void train_voxel(char *cfgfile, char *weightfile)
{
    char *train_images = "/data/imagenet/imagenet1k.train.list";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    float avg_loss = -1;     /* EMA of the loss; -1 means not yet seeded */
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;   /* images consumed per step */
    int i = *net->seen/imgs;                   /* resume the step counter */
    data train, buffer;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = {0};
    args.w = net->w;
    args.h = net->h;
    args.scale = 4;              /* 4x super-resolution factor */
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;            /* background loader fills this */
    args.type = SUPER_DATA;

    /* prefetch the first batch */
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    //while(i*imgs < N*120){
    while(get_current_batch(net) < net->max_batches){
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);     /* wait for the prefetched batch */
        train = buffer;
        load_thread = load_data_in_thread(args);   /* start loading the next */

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        time=clock();
        float loss = train_network(net, train);
        if (avg_loss < 0) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;

        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        if(i%100==0){
            /* rolling .backup checkpoint, overwritten every 100 steps */
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
        }
        free_data(train);
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
}
|
|
||
| void test_voxel(char *cfgfile, char *weightfile, char *filename) | ||
| { | ||
| network *net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(net, weightfile); | ||
| } | ||
| set_batch_network(&net, 1); | ||
| srand(2222222); | ||
|
|
||
| clock_t time; | ||
| char buff[256]; | ||
| char *input = buff; | ||
| while(1){ | ||
| if(filename){ | ||
| strncpy(input, filename, 256); | ||
| }else{ | ||
| printf("Enter Image Path: "); | ||
| fflush(stdout); | ||
| input = fgets(input, 256, stdin); | ||
| if(!input) return; | ||
| strtok(input, "\n"); | ||
| } | ||
| image im = load_image_color(input, 0, 0); | ||
| resize_network(&net, im.w, im.h); | ||
| printf("%d %d\n", im.w, im.h); | ||
|
|
||
| float *X = im.data; | ||
| time=clock(); | ||
| network_predict(net, X); | ||
| image out = get_network_image(net); | ||
| printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); | ||
| save_image(out, "out"); | ||
|
|
||
| free_image(im); | ||
| if (filename) break; | ||
| } | ||
| } | ||
|
|
||
|
|
||
/* Command-line entry point for the super-resolution model.
 * Usage: darknet voxel [train/test/extract] [cfg] [weights (optional)]
 * ("valid" exists in a commented-out form but is not implemented.) */
void run_voxel(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *task = argv[2];
    char *cfg = argv[3];
    char *weights = argc > 4 ? argv[4] : 0;
    char *filename = argc > 5 ? argv[5] : 0;
    if(strcmp(task, "train") == 0){
        train_voxel(cfg, weights);
    } else if(strcmp(task, "test") == 0){
        test_voxel(cfg, weights, filename);
    } else if(strcmp(task, "extract") == 0){
        extract_voxel(argv[3], argv[4], argv[5]);
    }
}
| @@ -0,0 +1,144 @@ | ||
| #include "darknet.h" | ||
|
|
||
/* Train the handwriting-segmentation model on "figures.list".
 * WRITING_DATA pairs each input image with a label map sized to the
 * network's output image. Checkpoints every 100 batches and per epoch. */
void train_writing(char *cfgfile, char *weightfile)
{
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    float avg_loss = -1;     /* EMA of the loss; -1 means not yet seeded */
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;   /* images consumed per step */
    list *plist = get_paths("figures.list");
    char **paths = (char **)list_to_array(plist);
    clock_t time;
    int N = plist->size;
    printf("N: %d\n", N);
    image out = get_network_image(net);   /* output size drives label size */

    data train, buffer;

    load_args args = {0};
    args.w = net->w;
    args.h = net->h;
    args.out_w = out.w;
    args.out_h = out.h;
    args.paths = paths;
    args.n = imgs;
    args.m = N;
    args.d = &buffer;            /* background loader fills this */
    args.type = WRITING_DATA;

    /* prefetch the first batch */
    pthread_t load_thread = load_data_in_thread(args);
    int epoch = (*net->seen)/N;  /* resume the epoch counter */
    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
        time=clock();
        pthread_join(load_thread, 0);     /* wait for the prefetched batch */
        train = buffer;
        load_thread = load_data_in_thread(args);   /* start loading the next */
        printf("Loaded %lf seconds\n",sec(clock()-time));

        time=clock();
        float loss = train_network(net, train);

        /*
           image pred = float_to_image(64, 64, 1, out);
           print_image(pred);
         */

        /*
           image im = float_to_image(256, 256, 3, train.X.vals[0]);
           image lab = float_to_image(64, 64, 1, train.y.vals[0]);
           image pred = float_to_image(64, 64, 1, out);
           show_image(im, "image");
           show_image(lab, "label");
           print_image(lab);
           show_image(pred, "pred");
           cvWaitKey(0);
         */

        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;
        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net->seen);
        free_data(train);
        if(get_current_batch(net)%100 == 0){
            char buff[256];
            sprintf(buff, "%s/%s_batch_%ld.weights", backup_directory, base, get_current_batch(net));
            save_weights(net, buff);
        }
        if(*net->seen/N > epoch){
            /* crossed an epoch boundary: save a per-epoch checkpoint */
            epoch = *net->seen/N;
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
            save_weights(net, buff);
        }
    }
}
|
|
||
/* Single-image / interactive demo for the handwriting-segmentation net:
 * load an image at native size, resize the network to match, predict,
 * upsample the prediction back to image size, threshold it at .5, and
 * display both the prediction and the original. */
void test_writing(char *cfgfile, char *weightfile, char *filename)
{
    network *net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(net, weightfile);
    }
    set_batch_network(net, 1);   /* single-image inference */
    srand(2222222);
    clock_t time;
    char buff[256];
    char *input = buff;
    while(1){
        if(filename){
            strncpy(input, filename, 256);
        }else{
            printf("Enter Image Path: ");
            fflush(stdout);
            input = fgets(input, 256, stdin);
            if(!input) return;    /* EOF ends the interactive loop */
            strtok(input, "\n");  /* strip the trailing newline */
        }

        image im = load_image_color(input, 0, 0);   /* 0,0 = native size */
        resize_network(net, im.w, im.h);
        printf("%d %d %d\n", im.h, im.w, im.c);
        float *X = im.data;
        time=clock();
        network_predict(net, X);
        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
        image pred = get_network_image(net);

        image upsampled = resize_image(pred, im.w, im.h);
        image thresh = threshold_image(upsampled, .5);
        pred = thresh;   /* display the thresholded map, not the raw output */

        show_image(pred, "prediction");
        show_image(im, "orig");
#ifdef OPENCV
        cvWaitKey(0);
        cvDestroyAllWindows();
#endif

        free_image(upsampled);
        free_image(thresh);
        free_image(im);
        if (filename) break;      /* one-shot mode */
    }
}
|
|
||
/* Command-line entry point for the handwriting model.
 * Usage: darknet writing [train/test] [cfg] [weights (optional)] */
void run_writing(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *task = argv[2];
    char *cfg = argv[3];
    char *weights = argc > 4 ? argv[4] : 0;
    char *filename = argc > 5 ? argv[5] : 0;
    if(strcmp(task, "train") == 0){
        train_writing(cfg, weights);
    } else if(strcmp(task, "test") == 0){
        test_writing(cfg, weights, filename);
    }
}
|
|
| @@ -0,0 +1,200 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "activations.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
|
|
||
/* Per-element scalar activation functions, evaluated on the device. */

/* "Leaky hard tanh": slope .001 outside [0,1], identity inside. */
__device__ float lhtan_activate_kernel(float x)
{
    if(x < 0) return .001f*x;
    if(x > 1) return .001f*(x-1.f) + 1.f;
    return x;
}
__device__ float lhtan_gradient_kernel(float x)
{
    if(x > 0 && x < 1) return 1;
    return .001;
}

/* Hard tanh: clamp to [-1, 1]. */
__device__ float hardtan_activate_kernel(float x)
{
    if (x < -1) return -1;
    if (x > 1) return 1;
    return x;
}
__device__ float linear_activate_kernel(float x){return x;}
__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}   /* sigmoid */
__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;}  /* sigmoid scaled to (-1,1) */
__device__ float relu_activate_kernel(float x){return x*(x>0);}
__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;}        /* leaky, slope .01 */
__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;}              /* relu plus .1x everywhere */
__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;}         /* leaky, slope .1 */
__device__ float tanh_activate_kernel(float x){return (2.f/(1 + expf(-2*x)) - 1);}
/* Piecewise-linear sigmoid-ish curve: slope .01 tails, .125 in [-4, 4]. */
__device__ float plse_activate_kernel(float x)
{
    if(x < -4) return .01f * (x + 4);
    if(x > 4)  return .01f * (x - 4) + 1;
    return .125f*x + .5f;
}
/* Staircase: flat on even unit intervals, unit slope on odd ones. */
__device__ float stair_activate_kernel(float x)
{
    int n = floorf(x);
    if (n%2 == 0) return floorf(x/2);
    else return (x - n) + floorf(x/2);
}
|
|
||
|
|
||
/* Derivatives of the activations above.
 * NOTE(review): several of these (logistic, loggy, tanh, elu) are written
 * in terms of the activation OUTPUT rather than the pre-activation input
 * -- presumably gradient_array is called on the layer's activated values;
 * confirm against the callers. */
__device__ float hardtan_gradient_kernel(float x)
{
    if (x > -1 && x < 1) return 1;
    return 0;
}
__device__ float linear_gradient_kernel(float x){return 1;}
__device__ float logistic_gradient_kernel(float x){return (1-x)*x;}   /* x is sigmoid output */
__device__ float loggy_gradient_kernel(float x)
{
    float y = (x+1)/2;   /* map output back to sigmoid range */
    return 2*(1-y)*y;
}
__device__ float relu_gradient_kernel(float x){return (x>0);}
__device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;}
__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;}
__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;}
__device__ float tanh_gradient_kernel(float x){return 1-x*x;}         /* x is tanh output */
__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
__device__ float stair_gradient_kernel(float x)
{
    if (floorf(x) == x) return 0;   /* at the step edges */
    return 1;
}
|
|
||
/* Dispatch one scalar through the activation selected by `a`.
 * Returns 0 for an unknown enum value. */
__device__ float activate_kernel(float x, ACTIVATION a)
{
    switch(a){
        case LINEAR:
            return linear_activate_kernel(x);
        case LOGISTIC:
            return logistic_activate_kernel(x);
        case LOGGY:
            return loggy_activate_kernel(x);
        case RELU:
            return relu_activate_kernel(x);
        case ELU:
            return elu_activate_kernel(x);
        case RELIE:
            return relie_activate_kernel(x);
        case RAMP:
            return ramp_activate_kernel(x);
        case LEAKY:
            return leaky_activate_kernel(x);
        case TANH:
            return tanh_activate_kernel(x);
        case PLSE:
            return plse_activate_kernel(x);
        case STAIR:
            return stair_activate_kernel(x);
        case HARDTAN:
            return hardtan_activate_kernel(x);
        case LHTAN:
            return lhtan_activate_kernel(x);
    }
    return 0;
}
|
|
||
/* Dispatch one scalar through the derivative of the activation selected
 * by `a`. Returns 0 for an unknown enum value. */
__device__ float gradient_kernel(float x, ACTIVATION a)
{
    switch(a){
        case LINEAR:
            return linear_gradient_kernel(x);
        case LOGISTIC:
            return logistic_gradient_kernel(x);
        case LOGGY:
            return loggy_gradient_kernel(x);
        case RELU:
            return relu_gradient_kernel(x);
        case ELU:
            return elu_gradient_kernel(x);
        case RELIE:
            return relie_gradient_kernel(x);
        case RAMP:
            return ramp_gradient_kernel(x);
        case LEAKY:
            return leaky_gradient_kernel(x);
        case TANH:
            return tanh_gradient_kernel(x);
        case PLSE:
            return plse_gradient_kernel(x);
        case STAIR:
            return stair_gradient_kernel(x);
        case HARDTAN:
            return hardtan_gradient_kernel(x);
        case LHTAN:
            return lhtan_gradient_kernel(x);
    }
    return 0;
}
|
|
||
/* Backward pass for the pairwise-product binary activation
 * (forward: y = x1 * x2 from the two halves of each s-sized group):
 * dx1 = x2*dy, dx2 = x1*dy.
 *
 * Fix: the original loaded x[b*s + i] and x[b*s + s/2 + i] BEFORE the
 * `id < n` bounds check, so the excess threads of the last block read
 * out of bounds. All loads/stores are now inside the guard. */
__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id < n) {
        int i = id % s;   /* position within the group */
        int b = id / s;   /* group index */
        float x1 = x[b*s + i];
        float x2 = x[b*s + s/2 + i];
        float de = dy[id];
        dx[b*s + i] = x2*de;
        dx[b*s + s/2 + i] = x1*de;
    }
}
|
|
||
/* Host launcher: one thread per input pair (n/2 threads total). */
extern "C" void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y)
{
    binary_gradient_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, dx, n/2, size, a, y);
    check_error(cudaPeekAtLastError());
}
/* Forward pass for the pairwise-product binary activation:
 * y[id] = x1 * x2 from the two halves of each s-sized group.
 *
 * Fix: the original loaded x[b*s + i] and x[b*s + s/2 + i] BEFORE the
 * `id < n` bounds check, so the excess threads of the last block read
 * out of bounds. Loads are now performed only for in-range threads. */
__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id < n){
        int i = id % s;   /* position within the group */
        int b = id / s;   /* group index */
        float x1 = x[b*s + i];
        float x2 = x[b*s + s/2 + i];
        y[id] = x1*x2;
    }
}
|
|
||
/* Host launcher: one thread per input pair (n/2 threads total). */
extern "C" void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y)
{
    binary_activate_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, n/2, size, a, y);
    check_error(cudaPeekAtLastError());
}
|
|
||
// Apply activation a to every element of x in place, one thread per element.
__global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
{
    int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    x[idx] = activate_kernel(x[idx], a);
}
|
|
||
// Multiply delta[i] in place by the gradient of activation a evaluated at
// x[i] (the already-activated output), one thread per element.
__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta)
{
    int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    delta[idx] *= gradient_kernel(x[idx], a);
}
|
|
||
// Apply activation a to n elements of x on the GPU, one thread per element.
extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a)
{
    activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
    check_error(cudaPeekAtLastError());
}
|
|
||
// Scale delta in place by the activation gradient at x, n elements on the GPU.
extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta)
{
    gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
    check_error(cudaPeekAtLastError());
}
| @@ -0,0 +1,61 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "avgpool_layer.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
// Global average pooling forward: one thread per (batch, channel) pair
// averages the entire w*h spatial plane into a single output value.
// n = batch * c is the total thread count.
__global__ void forward_avgpool_layer_kernel(int n, int w, int h, int c, float *input, float *output)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    int k = id % c;   // channel
    int b = id / c;   // batch item

    int out_index = k + c*b;
    int base = h*w*(k + b*c);   // start of this channel's spatial plane
    float sum = 0;
    int i;
    for(i = 0; i < w*h; ++i){
        sum += input[base + i];
    }
    output[out_index] = sum / (w*h);
}
|
|
||
// Global average pooling backward: distribute each output delta evenly
// (divided by w*h) over its channel's whole spatial plane.
// One thread per (batch, channel) pair; n = batch * c.
__global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float *in_delta, float *out_delta)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    int k = id % c;   // channel
    id /= c;
    int b = id;       // batch item

    int i;
    int out_index = (k + c*b);
    for(i = 0; i < w*h; ++i){
        int in_index = i + h*w*(k + b*c);
        // += accumulates into the previous layer's delta rather than overwriting.
        in_delta[in_index] += out_delta[out_index] / (w*h);
    }
}
|
|
||
// Launch the average-pool forward kernel: one thread per (batch, channel).
extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network net)
{
    size_t n = layer.c*layer.batch;

    forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.input_gpu, layer.output_gpu);
    check_error(cudaPeekAtLastError());
}
|
|
||
// Launch the average-pool backward kernel: one thread per (batch, channel),
// accumulating layer.delta_gpu back into net.delta_gpu.
extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network net)
{
    size_t n = layer.c*layer.batch;

    backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.delta_gpu, layer.delta_gpu);
    check_error(cudaPeekAtLastError());
}
|
|
| @@ -0,0 +1,59 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
| #include "darknet.h" | ||
|
|
||
| extern "C" { | ||
| #include "col2im.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
| // src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu | ||
| // You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE | ||
|
|
||
// Inverse of im2col, ported from Caffe (see link above): each thread owns
// one image pixel (c, h, w) and GATHERS every column-buffer entry that was
// copied from that pixel, summing them back. Gathering (instead of
// scattering from columns) avoids the need for atomic adds.
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float *data_im) {
    // Grid-stride loop over all n = channels*height*width image pixels.
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    for(; index < n; index += blockDim.x*gridDim.x){
        float val = 0;
        // Padded coordinates of this thread's image pixel.
        int w = index % width + pad;
        int h = (index / width) % height + pad;
        int c = index / (width * height);
        // Range of output columns/rows whose kernel window covers (h, w).
        int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
        int w_col_end = min(w / stride + 1, width_col);
        int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
        int h_col_end = min(h / stride + 1, height_col);
        // Closed-form index arithmetic equivalent to recomputing the full
        // column-buffer index inside the double loop (Caffe's trick).
        int offset =
            (c * ksize * ksize + h * ksize + w) * height_col * width_col;
        int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
        int coeff_w_col = (1 - stride * height_col * width_col);
        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
            }
        }
        // Accumulate (+=) so successive calls add their contributions.
        data_im[index] += val;
    }
}
|
|
||
// Fold the column buffer data_col back into the image data_im (accumulating).
void col2im_gpu(float *data_col,
        int channels, int height, int width,
        int ksize, int stride, int pad, float *data_im){
    // One thread per IMAGE pixel — channels * height * width in total —
    // because the kernel gathers by image pixel, not by column entry.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height * width;
    col2im_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
        BLOCK>>>(
                num_kernels, data_col, height, width, ksize, pad,
                stride, height_col,
                width_col, data_im);
}
|
|
| @@ -0,0 +1,330 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "convolutional_layer.h" | ||
| #include "batchnorm_layer.h" | ||
| #include "gemm.h" | ||
| #include "blas.h" | ||
| #include "im2col.h" | ||
| #include "col2im.h" | ||
| #include "utils.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
// Sign-binarize x into binary: +1 where x >= 0, otherwise -1.
__global__ void binarize_kernel(float *x, int n, float *binary)
{
    int idx = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (idx < n) {
        binary[idx] = (x[idx] >= 0) ? 1 : -1;
    }
}
|
|
||
// Sign-binarize n elements of x into binary on the GPU.
void binarize_gpu(float *x, int n, float *binary)
{
    binarize_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, binary);
    check_error(cudaPeekAtLastError());
}
|
|
||
// Binarize the input column-wise: for each of the `size` positions s,
// compute the mean absolute value over the n rows, then replace every
// element in that column by +/-mean according to its sign.
// One thread per column position s.
__global__ void binarize_input_kernel(float *input, int n, int size, float *binary)
{
    int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (s >= size) return;
    int i = 0;
    float mean = 0;
    for(i = 0; i < n; ++i){
        mean += fabsf(input[i*size + s]);
    }
    mean = mean / n;
    for(i = 0; i < n; ++i){
        binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
    }
}
|
|
||
// Column-wise input binarization on the GPU: one thread per position,
// each reducing over the n rows (see binarize_input_kernel).
void binarize_input_gpu(float *input, int n, int size, float *binary)
{
    binarize_input_kernel<<<cuda_gridsize(size), BLOCK>>>(input, n, size, binary);
    check_error(cudaPeekAtLastError());
}
|
|
||
|
|
||
// XNOR-style weight binarization: one thread per filter f replaces each of
// the filter's `size` weights with +/- the filter's mean absolute weight,
// keeping the sign of the original weight.
__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
{
    int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (f >= n) return;
    float sum = 0;
    int j;
    for (j = 0; j < size; ++j) {
        sum += fabsf(weights[f*size + j]);
    }
    float mean = sum / size;
    for (j = 0; j < size; ++j) {
        binary[f*size + j] = (weights[f*size + j] > 0) ? mean : -mean;
    }
}
|
|
||
// Binarize n filters of `size` weights each on the GPU, one thread per filter.
void binarize_weights_gpu(float *weights, int n, int size, float *binary)
{
    binarize_weights_kernel<<<cuda_gridsize(n), BLOCK>>>(weights, n, size, binary);
    check_error(cudaPeekAtLastError());
}
|
|
||
// GPU forward pass for a (grouped) convolutional layer. Optionally
// binarizes weights (l.binary) or weights+input (l.xnor) first, runs the
// convolution via cuDNN or im2col+GEMM, then batchnorm-or-bias and the
// activation.
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
    if(l.binary){
        // Binary-weight net: swap in the +/-mean binarized weights for this pass.
        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
    }

    if(l.xnor){
        // XNOR-net: binarize both the weights and the layer input.
        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
        binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
        net.input_gpu = l.binary_input_gpu;
    }

#ifdef CUDNN
    // alpha = beta = 1: the result accumulates onto the zero-filled output.
    float one = 1;
    cudnnConvolutionForward(cudnn_handle(),
            &one,
            l.srcTensorDesc,
            net.input_gpu,
            l.weightDesc,
            l.weights_gpu,
            l.convDesc,
            l.fw_algo,
            net.workspace,
            l.workspace_size,
            &one,
            l.dstTensorDesc,
            l.output_gpu);

#else
    // Fallback path: im2col + GEMM, one GEMM per (batch item, group).
    int i, j;
    int m = l.n/l.groups;               // filters per group
    int k = l.size*l.size*l.c/l.groups; // weights per filter
    int n = l.out_w*l.out_h;            // output spatial positions
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights_gpu + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output_gpu + (i*l.groups + j)*n*m;
            float *im = net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w;

            if (l.size == 1){
                // 1x1 convolution: im2col is the identity, use the input directly.
                b = im;
            } else {
                im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            }
            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }
#endif

    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, net);
    } else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }

    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
    //if(l.dot > 0) dot_error_gpu(l);
    // Restore the real-valued weights stashed by swap_binary() above.
    if(l.binary || l.xnor) swap_binary(&l);
}
|
|
||
// For each element of x, add rate * (neighbor - center), summed over a
// size x size window, into delta — pulling each activation toward its
// local neighborhood. One thread per element of the w*h*c*batch volume;
// out-of-bounds neighbors contribute nothing.
__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    int j = id % w;   // column
    id /= w;
    int i = id % h;   // row
    id /= h;
    int k = id % c;   // channel
    id /= c;
    int b = id;       // batch item

    // -(size/2.f) truncated back to int centers the window (e.g. -2 for size 5).
    int w_offset = -(size/2.f);
    int h_offset = -(size/2.f);

    int out_index = j + w*(i + h*(k + c*b));
    int l, m;
    for(l = 0; l < size; ++l){
        for(m = 0; m < size; ++m){
            int cur_h = h_offset + i + l;
            int cur_w = w_offset + j + m;
            int index = cur_w + w*(cur_h + h*(k + b*c));
            int valid = (cur_h >= 0 && cur_h < h &&
                    cur_w >= 0 && cur_w < w);
            delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0;
        }
    }
}
|
|
||
// Add a smoothness-penalty gradient over the layer output into l.delta_gpu.
// NOTE(review): n is sized from l.out_h/out_w/out_c but the kernel is passed
// l.w/l.h/l.c — these differ whenever stride > 1 or c != out_c; confirm the
// intended dimensions.
extern "C" void smooth_layer(layer l, int size, float rate)
{
    int h = l.out_h;
    int w = l.out_w;
    int c = l.out_c;

    size_t n = h*w*c*l.batch;

    smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, l.w, l.h, l.c, size, rate, l.delta_gpu);
    check_error(cudaPeekAtLastError());
}
|
|
||
// GPU backward pass for a (grouped) convolutional layer: activation
// gradient, batchnorm/bias gradients, weight-update accumulation, and —
// when net.delta_gpu is set — the delta propagated to the previous layer.
void backward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    if(l.smooth){
        // Optional smoothness regularizer applied to the layer output.
        smooth_layer(l, 5, l.smooth);
    }
    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, net);
    } else {
        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
    // Keep the real-valued input: xnor mode substitutes the binarized copy below.
    float *original_input = net.input_gpu;

    if(l.xnor) net.input_gpu = l.binary_input_gpu;
#ifdef CUDNN
    float one = 1;
    // dL/dW, accumulated (beta = 1) into the weight updates.
    cudnnConvolutionBackwardFilter(cudnn_handle(),
            &one,
            l.srcTensorDesc,
            net.input_gpu,
            l.ddstTensorDesc,
            l.delta_gpu,
            l.convDesc,
            l.bf_algo,
            net.workspace,
            l.workspace_size,
            &one,
            l.dweightDesc,
            l.weight_updates_gpu);

    if(net.delta_gpu){
        // dL/dX, computed with the (binarized, if applicable) weights.
        if(l.binary || l.xnor) swap_binary(&l);
        cudnnConvolutionBackwardData(cudnn_handle(),
                &one,
                l.weightDesc,
                l.weights_gpu,
                l.ddstTensorDesc,
                l.delta_gpu,
                l.convDesc,
                l.bd_algo,
                net.workspace,
                l.workspace_size,
                &one,
                l.dsrcTensorDesc,
                net.delta_gpu);
        if(l.binary || l.xnor) swap_binary(&l);
        // xnor: multiply the propagated delta by the hardtan gradient of the
        // real input (straight-through-style backprop through binarization).
        if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
    }

#else
    int m = l.n/l.groups;          // filters per group
    int n = l.size*l.size*l.c/l.groups; // weights per filter
    int k = l.out_w*l.out_h;       // output spatial positions

    int i, j;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.delta_gpu + (i*l.groups + j)*m*k;
            float *b = net.workspace;
            float *c = l.weight_updates_gpu + j*l.nweights/l.groups;

            float *im = net.input_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
            float *imd = net.delta_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;

            // Weight gradient: delta x im2col(input)^T accumulated into c.
            im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

            if (net.delta_gpu) {
                if (l.binary || l.xnor) swap_binary(&l);
                // Input gradient: weights^T x delta, then col2im back to image.
                a = l.weights_gpu + j*l.nweights/l.groups;
                b = l.delta_gpu + (i*l.groups + j)*m*k;
                c = net.workspace;
                if (l.size == 1) {
                    // 1x1: col2im is the identity, write straight into imd.
                    c = imd;
                }

                gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);

                if (l.size != 1) {
                    col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
                }
                if(l.binary || l.xnor) {
                    swap_binary(&l);
                }
            }
            if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
        }
    }
#endif
}
|
|
||
// Copy the layer's GPU-side parameters and accumulated updates to host memory.
void pull_convolutional_layer(layer l)
{
    cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
    cuda_pull_array(l.biases_gpu, l.biases, l.n);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm parameters and running statistics, one value per filter.
        cuda_pull_array(l.scales_gpu, l.scales, l.n);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}
|
|
||
// Copy the layer's host-side parameters and updates to GPU memory
// (mirror of pull_convolutional_layer).
void push_convolutional_layer(layer l)
{
    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
    cuda_push_array(l.biases_gpu, l.biases, l.n);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm parameters and running statistics, one value per filter.
        cuda_push_array(l.scales_gpu, l.scales, l.n);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}
|
|
||
// Apply one optimizer step to the layer's weights, biases, and (if present)
// batchnorm scales: Adam when a.adam is set, otherwise SGD with momentum
// and weight decay. Optionally clamps weights afterwards.
void update_convolutional_layer_gpu(layer l, update_args a)
{
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    if(a.adam){
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        if(l.scales_gpu){
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        }
    }else{
        // SGD: fold weight decay into the gradient, take the step, then
        // scale the stored update by momentum for the next iteration.
        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);

        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);

        if(l.scales_gpu){
            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
        }
    }
    if(l.clip){
        // Clamp weights to [-l.clip, l.clip].
        constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
    }
}
|
|
||
|
|
| @@ -0,0 +1,230 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "crop_layer.h" | ||
| #include "utils.h" | ||
| #include "cuda.h" | ||
| #include "image.h" | ||
| } | ||
|
|
||
// Fetch pixel (x, y) of channel c from a CHW-layout image;
// coordinates outside the image return 0.
__device__ float get_pixel_kernel(float *image, int w, int h, int x, int y, int c)
{
    int inside = (x >= 0 && x < w && y >= 0 && y < h);
    return inside ? image[x + w*(y + c*h)] : 0;
}
|
|
||
// Convert an RGB triple to HSV. Hue is returned in sector units [0, 6)
// (not degrees); for inputs in [0, 1], saturation and value are in [0, 1].
// When max == 0 (black) hue is set to -1 (undefined).
__device__ float3 rgb_to_hsv_kernel(float3 rgb)
{
    float r = rgb.x;
    float g = rgb.y;
    float b = rgb.z;

    float h, s, v;
    float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b);
    float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b);
    float delta = max - min;
    v = max;
    if(max == 0){
        s = 0;
        h = -1;
    }else{
        s = delta/max;
        // Hue sector depends on which channel is the maximum.
        if(r == max){
            h = (g - b) / delta;
        } else if (g == max) {
            h = 2 + (b - r) / delta;
        } else {
            h = 4 + (r - g) / delta;
        }
        if (h < 0) h += 6;
    }
    return make_float3(h, s, v);
}
|
|
||
// Inverse of rgb_to_hsv_kernel: hue in sector units [0, 6).
// Each result channel is clamped to [0, 1].
__device__ float3 hsv_to_rgb_kernel(float3 hsv)
{
    float h = hsv.x;
    float s = hsv.y;
    float v = hsv.z;

    float r, g, b;
    float f, p, q, t;

    if (s == 0) {
        // Zero saturation: pure gray at value v.
        r = g = b = v;
    } else {
        int index = (int) floorf(h);
        f = h - index;   // fractional position within the hue sector
        p = v*(1-s);
        q = v*(1-s*f);
        t = v*(1-s*(1-f));
        // One case per 60-degree hue sector.
        if(index == 0){
            r = v; g = t; b = p;
        } else if(index == 1){
            r = q; g = v; b = p;
        } else if(index == 2){
            r = p; g = v; b = t;
        } else if(index == 3){
            r = p; g = q; b = v;
        } else if(index == 4){
            r = t; g = p; b = v;
        } else {
            r = v; g = p; b = q;
        }
    }
    r = (r < 0) ? 0 : ((r > 1) ? 1 : r);
    g = (g < 0) ? 0 : ((g > 1) ? 1 : g);
    b = (b < 0) ? 0 : ((b > 1) ? 1 : b);
    return make_float3(r, g, b);
}
|
|
||
// Bilinearly sample channel c of a CHW image at fractional position (x, y).
// Out-of-bounds corner pixels read as 0 via get_pixel_kernel.
__device__ float bilinear_interpolate_kernel(float *image, int w, int h, float x, float y, int c)
{
    int ix = (int) floorf(x);
    int iy = (int) floorf(y);

    float dx = x - ix;
    float dy = y - iy;

    // The four corner samples surrounding (x, y).
    float p00 = get_pixel_kernel(image, w, h, ix,   iy,   c);
    float p01 = get_pixel_kernel(image, w, h, ix,   iy+1, c);
    float p10 = get_pixel_kernel(image, w, h, ix+1, iy,   c);
    float p11 = get_pixel_kernel(image, w, h, ix+1, iy+1, c);

    return (1-dy) * (1-dx) * p00 +
           dy     * (1-dx) * p01 +
           (1-dy) * dx     * p10 +
           dy     * dx     * p11;
}
|
|
||
// Per-pixel photometric augmentation. In training mode, jitter saturation
// and exposure in HSV space (random factors from rand entries 0..3 of each
// image's 8 slots), then map each channel to value*scale + translate plus a
// random per-channel shift. In test mode only the affine remap is applied.
// Assumes 3-channel CHW images — TODO confirm against callers.
// NOTE(review): rshift/gshift/bshift read rand[0..2], i.e. the FIRST
// image's random entries for every image in the batch — verify intended.
__global__ void levels_image_kernel(float *image, float *rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift)
{
    int size = batch * w * h;
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= size) return;
    int x = id % w;
    id /= w;
    int y = id % h;
    id /= h;
    // After the divisions, id is the batch index.
    float rshift = rand[0];
    float gshift = rand[1];
    float bshift = rand[2];
    float r0 = rand[8*id + 0];
    float r1 = rand[8*id + 1];
    float r2 = rand[8*id + 2];
    float r3 = rand[8*id + 3];

    // Random factor in [1, saturation] (resp. exposure), inverted half the time.
    saturation = r0*(saturation - 1) + 1;
    saturation = (r1 > .5f) ? 1.f/saturation : saturation;
    exposure = r2*(exposure - 1) + 1;
    exposure = (r3 > .5f) ? 1.f/exposure : exposure;

    size_t offset = id * h * w * 3;
    image += offset;
    float r = image[x + w*(y + h*0)];
    float g = image[x + w*(y + h*1)];
    float b = image[x + w*(y + h*2)];
    float3 rgb = make_float3(r,g,b);
    if(train){
        float3 hsv = rgb_to_hsv_kernel(rgb);
        hsv.y *= saturation;
        hsv.z *= exposure;
        rgb = hsv_to_rgb_kernel(hsv);
    } else {
        // No random shift at test time.
        shift = 0;
    }
    image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5f)*shift;
    image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5f)*shift;
    image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5f)*shift;
}
|
|
||
// Produce one output pixel of the crop layer: pick a per-image crop offset,
// horizontal flip, and rotation angle (from rand entries 4..7 of each
// image's 8 slots), rotate the sample position around the image center,
// and bilinearly sample the input. In test mode (train == 0) the crop is
// centered and flip/rotation are disabled.
__global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= size) return;

    // Rotation center (middle of the source image).
    float cx = w/2.f;
    float cy = h/2.f;

    int count = id;   // flat output index, preserved before decomposition
    int j = id % crop_width;    // output column
    id /= crop_width;
    int i = id % crop_height;   // output row
    id /= crop_height;
    int k = id % c;             // channel
    id /= c;
    int b = id;                 // batch item

    float r4 = rand[8*b + 4];
    float r5 = rand[8*b + 5];
    float r6 = rand[8*b + 6];
    float r7 = rand[8*b + 7];

    // Random crop offset, flip decision, and angle in [-angle, angle].
    float dw = (w - crop_width)*r4;
    float dh = (h - crop_height)*r5;
    flip = (flip && (r6 > .5f));
    angle = 2*angle*r7 - angle;
    if(!train){
        // Deterministic centered crop at test time.
        dw = (w - crop_width)/2.f;
        dh = (h - crop_height)/2.f;
        flip = 0;
        angle = 0;
    }

    input += w*h*c*b;

    float x = (flip) ? w - dw - j - 1 : j + dw;
    float y = i + dh;

    // Rotate (x, y) about the image center by `angle` radians.
    float rx = cosf(angle)*(x-cx) - sinf(angle)*(y-cy) + cx;
    float ry = sinf(angle)*(x-cx) + cosf(angle)*(y-cy) + cy;

    output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
}
|
|
||
// Crop-layer forward on GPU: draw 8 random numbers per batch image, apply
// photometric augmentation in place on the input, then write the randomly
// cropped/flipped/rotated result into layer.output_gpu.
extern "C" void forward_crop_layer_gpu(crop_layer layer, network net)
{
    // Legacy in-function prototypes — presumably duplicating cuda.h
    // declarations; verify before removing.
    void cuda_random(float *x_gpu, size_t n);
    float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
    dim3 cuda_gridsize(size_t n);
    void check_error(cudaError_t status);

    // 8 uniform randoms per image: 0..3 photometric, 4..7 geometric.
    cuda_random(layer.rand_gpu, layer.batch*8);

    float radians = layer.angle*3.14159265f/180.f;

    // Default remap sends [0,1] pixels to [-1,1]; noadjust keeps them as-is.
    float scale = 2;
    float translate = -1;
    if(layer.noadjust){
        scale = 1;
        translate = 0;
    }

    int size = layer.batch * layer.w * layer.h;

    levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, layer.batch, layer.w, layer.h, net.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
    check_error(cudaPeekAtLastError());

    size = layer.batch*layer.c*layer.out_w*layer.out_h;

    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, net.train, layer.flip, radians, layer.output_gpu);
    check_error(cudaPeekAtLastError());

    /* Debug visualization, kept for reference:
       cuda_pull_array(layer.output_gpu, layer.output, size);
       image im = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 0*(size/layer.batch));
       image im2 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 1*(size/layer.batch));
       image im3 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 2*(size/layer.batch));
       translate_image(im, -translate);
       scale_image(im, 1/scale);
       translate_image(im2, -translate);
       scale_image(im2, 1/scale);
       translate_image(im3, -translate);
       scale_image(im3, 1/scale);
       show_image(im, "cropped");
       show_image(im2, "cropped2");
       show_image(im3, "cropped3");
       cvWaitKey(0);
     */
}
|
|
| @@ -0,0 +1,139 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "convolutional_layer.h" | ||
| #include "deconvolutional_layer.h" | ||
| #include "batchnorm_layer.h" | ||
| #include "gemm.h" | ||
| #include "blas.h" | ||
| #include "im2col.h" | ||
| #include "col2im.h" | ||
| #include "utils.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
// Transposed-convolution (deconvolution) forward: per batch item, a GEMM
// of weights^T with the input produces the column buffer, which col2im
// folds into the spatial output; then batchnorm-or-bias and the activation.
extern "C" void forward_deconvolutional_layer_gpu(layer l, network net)
{
    int i;

    int m = l.size*l.size*l.n;   // rows of the column buffer
    int n = l.h*l.w;             // input spatial positions
    int k = l.c;                 // input channels (GEMM reduction dim)

    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    for(i = 0; i < l.batch; ++i){
        float *a = l.weights_gpu;
        float *b = net.input_gpu + i*l.c*l.h*l.w;
        float *c = net.workspace;

        // c = a^T * b : column buffer for this batch item.
        gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n);

        col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
    }
    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, net);
    } else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
    activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
}
|
|
||
// Transposed-convolution backward: activation gradient, batchnorm/bias
// gradients, then per batch item a weight-gradient GEMM against
// im2col(delta) and, if requested, the delta for the previous layer.
extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
{
    int i;

    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, net);
    } else {
        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }

    //if(net.delta_gpu) memset(net.delta_gpu, 0, l.batch*l.h*l.w*l.c*sizeof(float));

    for(i = 0; i < l.batch; ++i){
        int m = l.c;
        int n = l.size*l.size*l.n;
        int k = l.h*l.w;

        float *a = net.input_gpu + i*m*k;
        float *b = net.workspace;
        float *c = l.weight_updates_gpu;

        // Weight gradient: input x im2col(delta)^T, accumulated into c.
        im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w,
                l.size, l.stride, l.pad, b);
        gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

        if(net.delta_gpu){
            // Input gradient (shadows m/n/k deliberately for this GEMM):
            // weights x column buffer, accumulated into the previous delta.
            int m = l.c;
            int n = l.h*l.w;
            int k = l.size*l.size*l.n;

            float *a = l.weights_gpu;
            float *b = net.workspace;
            float *c = net.delta_gpu + i*n*m;

            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }
}
|
|
||
// Copy the deconvolutional layer's GPU-side parameters and accumulated
// updates to host memory (c*n*size*size weights).
extern "C" void pull_deconvolutional_layer(layer l)
{
    cuda_pull_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
    cuda_pull_array(l.biases_gpu, l.biases, l.n);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm parameters and running statistics, one value per filter.
        cuda_pull_array(l.scales_gpu, l.scales, l.n);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}
|
|
||
// Copy the deconvolutional layer's host-side parameters and updates to GPU
// memory (mirror of pull_deconvolutional_layer).
extern "C" void push_deconvolutional_layer(layer l)
{
    cuda_push_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
    cuda_push_array(l.biases_gpu, l.biases, l.n);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        // Batchnorm parameters and running statistics, one value per filter.
        cuda_push_array(l.scales_gpu, l.scales, l.n);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}
|
|
||
// Apply one optimizer step to the deconvolutional layer's parameters
// (same scheme as update_convolutional_layer_gpu): Adam when a.adam is
// set, otherwise SGD with momentum and weight decay.
void update_deconvolutional_layer_gpu(layer l, update_args a)
{
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    if(a.adam){
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        if(l.scales_gpu){
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        }
    }else{
        // SGD: fold weight decay into the gradient, take the step, then
        // scale the stored update by momentum for the next iteration.
        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);

        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);

        if(l.scales_gpu){
            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
        }
    }
}
|
|
| @@ -0,0 +1,47 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "dropout_layer.h" | ||
| #include "utils.h" | ||
| #include "cuda.h" | ||
| void check_error(cudaError_t status); | ||
| cublasHandle_t blas_handle(); | ||
| int *cuda_make_int_array(int *x, size_t n); | ||
| void cuda_random(float *x_gpu, size_t n); | ||
| float cuda_compare(float *x_gpu, float *x, size_t n, char *s); | ||
| dim3 cuda_gridsize(size_t n); | ||
| } | ||
|
|
||
// Dropout kernel: zero out elements whose random draw falls below prob,
// and scale the survivors (inverted dropout; same kernel is reused for
// the backward pass on deltas).
__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= size) return;
    input[id] = (rand[id] < prob) ? 0 : input[id]*scale;
}
|
|
||
// Dropout forward (training only): draw fresh uniform randoms into
// layer.rand_gpu, zero inputs below layer.probability, and scale survivors
// by layer.scale so the expected activation is unchanged.
void forward_dropout_layer_gpu(dropout_layer layer, network net)
{
    if (!net.train) return;
    int size = layer.inputs*layer.batch;
    cuda_random(layer.rand_gpu, size);
    /* CPU reference path, kept for debugging:
    int i;
    for(i = 0; i < size; ++i){
        layer.rand[i] = rand_uniform();
    }
    cuda_push_array(layer.rand_gpu, layer.rand, size);
    */

    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
    check_error(cudaPeekAtLastError());
}
|
|
||
// Dropout backward: apply the SAME mask (layer.rand_gpu drawn during the
// forward pass) and the same scaling to the incoming deltas.
void backward_dropout_layer_gpu(dropout_layer layer, network net)
{
    if(!net.delta_gpu) return;
    int size = layer.inputs*layer.batch;

    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.delta_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
    check_error(cudaPeekAtLastError());
}
| @@ -0,0 +1,62 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "im2col.h" | ||
| #include "cuda.h" | ||
| #include "darknet.h" | ||
| } | ||
|
|
||
| // src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu | ||
| // You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE | ||
|
|
||
// im2col, ported from Caffe (see link above): each index corresponds to one
// (input channel, output row, output col) triple, and the owning thread
// copies that position's full ksize*ksize patch into the column buffer,
// writing zeros where the window falls outside the (padded) image.
__global__ void im2col_gpu_kernel(const int n, const float* data_im,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float *data_col) {
    // Grid-stride loop over n = channels * height_col * width_col positions.
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    for(; index < n; index += blockDim.x*gridDim.x){
        int w_out = index % width_col;
        int h_index = index / width_col;
        int h_out = h_index % height_col;
        int channel_in = h_index / height_col;
        int channel_out = channel_in * ksize * ksize;
        // Top-left corner of the receptive field in (possibly negative)
        // unpadded image coordinates.
        int h_in = h_out * stride - pad;
        int w_in = w_out * stride - pad;
        float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
        const float* data_im_ptr = data_im;
        data_im_ptr += (channel_in * height + h_in) * width + w_in;
        for (int i = 0; i < ksize; ++i) {
            for (int j = 0; j < ksize; ++j) {
                int h = h_in + i;
                int w = w_in + j;

                // Zero-fill where the window extends past the image border.
                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
                    data_im_ptr[i * width + j] : 0;

                //*data_col_ptr = data_im_ptr[ii * width + jj];

                // Successive patch entries are height_col*width_col apart.
                data_col_ptr += height_col * width_col;
            }
        }
    }
}
|
|
||
// Unfold image patches into the column buffer data_col for GEMM-based
// convolution: one thread per (channel, output row, output col) triple.
void im2col_gpu(float *im,
        int channels, int height, int width,
        int ksize, int stride, int pad, float *data_col){
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height_col * width_col;
    im2col_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
        BLOCK>>>(
                num_kernels, im, height, width, ksize, pad,
                stride, height_col,
                width_col, data_col);
}