diff --git a/src/layer/detectionoutput.cpp b/src/layer/detectionoutput.cpp
index e0adc699d03..e5143a3e140 100644
--- a/src/layer/detectionoutput.cpp
+++ b/src/layer/detectionoutput.cpp
@@ -33,6 +33,10 @@ int DetectionOutput::load_param(const ParamDict& pd)
     nms_top_k = pd.get(2, 300);
     keep_top_k = pd.get(3, 100);
     confidence_threshold = pd.get(4, 0.5f);
+    variances[0] = pd.get(5, 0.1f);
+    variances[1] = pd.get(6, 0.1f);
+    variances[2] = pd.get(7, 0.2f);
+    variances[3] = pd.get(8, 0.2f);
 
     return 0;
 }
@@ -161,14 +165,14 @@ int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
 
     const float* location_ptr = location;
     const float* priorbox_ptr = priorbox.row(0);
-    const float* variance_ptr = priorbox.row(1);
+    const float* variance_ptr = priorbox.h == 2 ? priorbox.row(1) : 0;
 
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int i = 0; i < num_prior; i++)
     {
         const float* loc = location_ptr + i * 4;
         const float* pb = priorbox_ptr + i * 4;
-        const float* var = variance_ptr + i * 4;
+        const float* var = variance_ptr ? variance_ptr + i * 4 : variances;
 
         float* bbox = bboxes.row(i);
 
diff --git a/src/layer/detectionoutput.h b/src/layer/detectionoutput.h
index 97486220e2f..02347da2012 100644
--- a/src/layer/detectionoutput.h
+++ b/src/layer/detectionoutput.h
@@ -34,6 +34,7 @@ class DetectionOutput : public Layer
     int nms_top_k;
     int keep_top_k;
     float confidence_threshold;
+    float variances[4];
 };
 
 } // namespace ncnn
diff --git a/src/layer/permute.cpp b/src/layer/permute.cpp
index fb558eea0fe..4a285d2d96f 100644
--- a/src/layer/permute.cpp
+++ b/src/layer/permute.cpp
@@ -38,6 +38,39 @@ int Permute::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c
     int channels = bottom_blob.c;
     size_t elemsize = bottom_blob.elemsize;
 
+    int dims = bottom_blob.dims;
+
+    if (dims == 2)
+    {
+        // 2-D blob: order_type 0 keeps (w, h), order_type 1 swaps to (h, w).
+        // Any other order_type falls through and returns 0 without writing top_blob.
+
+        if (order_type == 1)
+        {
+            top_blob.create(h, w, elemsize, opt.blob_allocator);
+            if (top_blob.empty())
+                return -100;
+
+            const float* src = bottom_blob;
+            float* dst = top_blob;
+
+            // fill the output linearly; each run of h outputs is one input column
+            for (int col = 0; col < w; col++)
+            {
+                for (int row = 0; row < h; row++)
+                {
+                    *dst++ = src[row * w + col];
+                }
+            }
+        }
+        else if (order_type == 0)
+        {
+            top_blob = bottom_blob;
+        }
+
+        return 0;
+    }
+
     // order_type
     // 0 = w h c
     // 1 = h w c
diff --git a/src/layer/priorbox.cpp b/src/layer/priorbox.cpp
index c9f68e17d79..2abd5fee02a 100644
--- a/src/layer/priorbox.cpp
+++ b/src/layer/priorbox.cpp
@@ -31,10 +31,10 @@ int PriorBox::load_param(const ParamDict& pd)
     min_sizes = pd.get(0, Mat());
     max_sizes = pd.get(1, Mat());
     aspect_ratios = pd.get(2, Mat());
-    variances[0] = pd.get(3, 0.f);
-    variances[1] = pd.get(4, 0.f);
-    variances[2] = pd.get(5, 0.f);
-    variances[3] = pd.get(6, 0.f);
+    variances[0] = pd.get(3, 0.1f);
+    variances[1] = pd.get(4, 0.1f);
+    variances[2] = pd.get(5, 0.2f);
+    variances[3] = pd.get(6, 0.2f);
     flip = pd.get(7, 1);
     clip = pd.get(8, 0);
     image_width = pd.get(9, 0);
@@ -51,6 +51,83 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
     int w = bottom_blobs[0].w;
     int h = bottom_blobs[0].h;
 
+    if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty())
+    {
+        // mxnet style _contrib_MultiBoxPrior: taken when there is a single input
+        // blob, both image dimensions are the -233 sentinel and max_sizes is empty.
+        // The output is a 1-D blob holding box corner coordinates only.
+
+        // a step of -233 means "derive it from the feature map": 1/w and 1/h
+        const float step_w = step_width == -233 ? 1.f / (float)w : step_width;
+        const float step_h = step_height == -233 ? 1.f / (float)h : step_height;
+
+        int num_sizes = min_sizes.w;
+        int num_ratios = aspect_ratios.w;
+
+        // per cell: one box per size, plus one box per ratio after the first
+        int num_prior = num_sizes - 1 + num_ratios;
+
+        Mat& top_blob = top_blobs[0];
+        top_blob.create(4 * w * h * num_prior, 4u, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = 0; i < h; i++)
+        {
+            // row i writes its own w * num_prior boxes, starting at this offset
+            float* box = (float*)top_blob + i * w * num_prior * 4;
+
+            float center_x = offset * step_w;
+            float center_y = offset * step_h + i * step_h;
+
+            for (int j = 0; j < w; j++)
+            {
+                // ratio = 1, one box per entry of min_sizes
+                for (int k = 0; k < num_sizes; k++)
+                {
+                    float size = min_sizes[k];
+                    float cw = size * h / w / 2;
+                    float ch = size / 2;
+
+                    box[0] = center_x - cw;
+                    box[1] = center_y - ch;
+                    box[2] = center_x + cw;
+                    box[3] = center_y + ch;
+                    box += 4;
+                }
+
+                // remaining ratios (index 0 is skipped), all at size min_sizes[0]
+                float size = min_sizes[0];
+                for (int p = 1; p < num_ratios; p++)
+                {
+                    float ratio = sqrt(aspect_ratios[p]);
+                    float cw = size * h / w * ratio / 2;
+                    float ch = size / ratio / 2;
+
+                    box[0] = center_x - cw;
+                    box[1] = center_y - ch;
+                    box[2] = center_x + cw;
+                    box[3] = center_y + ch;
+                    box += 4;
+                }
+
+                center_x += step_w;
+            }
+        }
+
+        if (clip)
+        {
+            float* box = top_blob;
+            for (int i = 0; i < top_blob.w; i++)
+            {
+                box[i] = std::min(std::max(box[i], 0.f), 1.f);
+            }
+        }
+
+        return 0;
+    }
+
     int image_w = image_width;
     int image_h = image_height;
     if (image_w == -233)
@@ -69,26 +146,14 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
     int num_max_size = max_sizes.w;
     int num_aspect_ratio = aspect_ratios.w;
 
-    Mat min_sizes_copy = min_sizes.clone();
-    Mat max_sizes_copy = max_sizes.clone();
-
-    for (int k = 0; k < num_min_size; k++)
-    {
-        if (min_sizes_copy[k] < 0.f)
-            min_sizes_copy[k] = -min_sizes_copy[k] * image_w;
-    }
-    for (int k = 0; k < num_max_size; k++)
-    {
-        if (max_sizes_copy[k] < 0.f)
-            max_sizes_copy[k] = -max_sizes_copy[k] * image_w;
-    }
-
     int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size;
     if (flip)
         num_prior += num_min_size * num_aspect_ratio;
 
     Mat& top_blob = top_blobs[0];
     top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
 
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int i = 0; i < h; i++)
@@ -105,7 +170,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
 
             for (int k = 0; k < num_min_size; k++)
             {
-                float min_size = min_sizes_copy[k];
+                float min_size = min_sizes[k];
 
                 // min size box
                 box_w = box_h = min_size;
@@ -119,7 +184,7 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
 
                 if (num_max_size > 0)
                 {
-                    float max_size = max_sizes_copy[k];
+                    float max_size = max_sizes[k];
 
                     // max size box
                     box_w = box_h = sqrt(min_size * max_size);
@@ -161,8 +226,6 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
 
             center_x += step_w;
         }
-
-        center_y += step_h;
     }
 
     if (clip)
diff --git a/tools/mxnet/mxnet2ncnn.cpp b/tools/mxnet/mxnet2ncnn.cpp
index d2d12ee8929..6043302fd98 100644
--- a/tools/mxnet/mxnet2ncnn.cpp
+++ b/tools/mxnet/mxnet2ncnn.cpp
@@ -703,6 +703,10 @@ int main(int argc, char** argv)
             }
             continue;
         }
+        else if (n.op == "_contrib_MultiBoxTarget")
+        {
+            n.output_size = 3;
+        }
         else if (n.op == "SliceChannel")
         {
             n.output_size = n.attr("num_outputs");
@@ -775,6 +779,11 @@ int main(int argc, char** argv)
         }
     }
 
+//     for (std::map<int, int>::iterator it = node_reference.begin(); it != node_reference.end(); it++)
+//     {
+//         fprintf(stderr, "ref %d %d\n", it->first, it->second);
+//     }
+
     // op chain fusion
     int reduced_node_count = 0;
     for (int i=0; i<node_count; i++)
@@ -1277,42 +1286,40 @@ int main(int argc, char** argv)
             int keep_top_k = 100;
             fprintf(pp, " 3=%d", keep_top_k);
             fprintf(pp, " 4=%f", threshold);
+
+            // params 5..8 carry the four variances read by DetectionOutput
+            std::vector<float> variances = n.attr("variances");
+            if (variances.size() < 4)
+            {
+                // attribute absent or too short to index [0..3]: write the
+                // defaults rather than reading past the end of the vector
+                fprintf(pp, " 5=0.1 6=0.1 7=0.2 8=0.2");
+            }
+            else
+            {
+                fprintf(pp, " 5=%f", variances[0]);
+                fprintf(pp, " 6=%f", variances[1]);
+                fprintf(pp, " 7=%f", variances[2]);
+                fprintf(pp, " 8=%f", variances[3]);
+            }
         }
         else if (n.op == "_contrib_MultiBoxPrior")
         {
+            // mxnet-ssd encodes sizes as scale factors; emit every size as a min_size entry
             std::vector<float> sizes = n.attr("sizes");
-            float min_size = sizes[0];
-            float max_size = sizes[1];
-
-            // mxnet-ssd encode size as scale factor
-            fprintf(pp, " -23300=%d", 1);
-            fprintf(pp, ",%f", -min_size);
-
-            fprintf(pp, " -23301=%d", 1);
-            fprintf(pp, ",%f", -max_size);
-
-            // drop 1.0 ratio
-            std::vector<float> ratios = n.attr("ratios");
-            std::vector<float> aspect_ratios;
-            for (int j=0; j<ratios.size(); j++)
+            fprintf(pp, " -23300=%d", (int)sizes.size());
+            for (int j=0; j<(int)sizes.size(); j++)
             {
-                if (ratios[j] == 1.f)
-                    continue;
-                aspect_ratios.push_back(ratios[j]);
+                fprintf(pp, ",%f", sizes[j]);
             }
 
+            std::vector<float> aspect_ratios = n.attr("ratios");
             fprintf(pp, " -23302=%d", (int)aspect_ratios.size());
             for (int j=0; j<(int)aspect_ratios.size(); j++)
             {
                 fprintf(pp, ",%f", aspect_ratios[j]);
             }
 
-            float variances[4] = {0.1f, 0.1f, 0.2f, 0.2f};
-            fprintf(pp, " 3=%f", variances[0]);
-            fprintf(pp, " 4=%f", variances[1]);
-            fprintf(pp, " 5=%f", variances[2]);
-            fprintf(pp, " 6=%f", variances[3]);
-
             int flip = 0;
             fprintf(pp, " 7=%d", flip);
 
@@ -1327,8 +1334,8 @@ int main(int argc, char** argv)
             if (steps.empty() || (steps[0] == -1.f && steps[1] == -1.f))
             {
                 // auto step
-                fprintf(pp, " 11=-233");
-                fprintf(pp, " 12=-233");
+                fprintf(pp, " 11=-233.0");
+                fprintf(pp, " 12=-233.0");
             }
             else
             {