diff --git a/Debug/poor-mans-profiler.sh b/Debug/poor-mans-profiler.sh
index 4b03e5631004..ab06a1b66ab2 100755
--- a/Debug/poor-mans-profiler.sh
+++ b/Debug/poor-mans-profiler.sh
@@ -1,20 +1,12 @@
 #!/bin/bash
 #
-# Author: Pavel Kirienko <pavel.kirienko@zubax.com>
-#
 # Poor man's sampling profiler for NuttX.
 #
 # Usage: Install flamegraph.pl in your PATH, configure your .gdbinit, run the script with proper arguments and go
 #        have a coffee. When you're back, you'll see the flamegraph. Note that frequent calls to GDB significantly
 #        interfere with normal operation of the target, which means that you can't profile real-time tasks with it.
-#        For best results, ensure that the PC is not overloaded, the USB host controller to which the debugger is
-#        connected is not congested. You should also allow the current user to set negative nice values.
-#
-# The FlameGraph script can be downloaded from https://github.com/brendangregg/FlameGraph. Thanks Mr. Gregg.
 #
-# Requirements: ARM GDB with Python support. You can get one by downloading the sources from
-#               https://launchpad.net/gcc-arm-embedded and building them with correct flags.
-#               Note that Python support is not required if no per-task sampling is needed.
+# Requirements: ARM GDB with Python support
 #
 
 set -e
@@ -41,7 +33,7 @@ which flamegraph.pl > /dev/null || die "Install flamegraph.pl first"
 nsamples=0
 sleeptime=0.1    # Doctors recommend 7-8 hours a day
 taskname=
-elf=
+elf=$root/Build/px4fmu-v2_default.build/firmware.elf
 append=0
 fgfontsize=10
 fgwidth=1900
@@ -77,8 +69,6 @@ do
     shift
 done
 
-[[ -z "$elf" ]] && die "Please specify the ELF file location, e.g.: build_px4fmu-v4_default/src/firmware/nuttx/firmware_nuttx"
-
 #
 # Temporary files
 #
@@ -247,8 +237,8 @@ for s, f in sorted(stacks.items(), key=lambda (s, f): s):
 
 print('Total stack frames:', num_stack_frames, file=sys.stderr)
 print('Top consumers (distribution of the stack tops):', file=sys.stderr)
-for name,num in sorted(stack_tops.items(), key=lambda (name, num): num, reverse=True)[:300]:
-    print('% 7.3f%%   ' % (100 * num / num_stack_frames), name, file=sys.stderr)
+for name,num in sorted(stack_tops.items(), key=lambda (name, num): num, reverse=True)[:10]:
+    print('% 5.1f%%   ' % (100 * num / num_stack_frames), name, file=sys.stderr)
 EOF
 
 cat $stacksfile | python /tmp/pmpn-folder.py > $foldfile
diff --git a/ROMFS/px4fmu_common/init.d/4012_quad_x_can b/ROMFS/px4fmu_common/init.d/4012_quad_x_can
index 01f90b4704a8..f6246fef8e75 100644
--- a/ROMFS/px4fmu_common/init.d/4012_quad_x_can
+++ b/ROMFS/px4fmu_common/init.d/4012_quad_x_can
@@ -25,5 +25,4 @@ then
 	param set MC_YAWRATE_D 0.0
 fi
 
-set MIXER quad_x_can
 set OUTPUT_MODE uavcan_esc
diff --git a/ROMFS/px4fmu_common/mixers/quad_x_can.main.mix b/ROMFS/px4fmu_common/mixers/quad_x_can.main.mix
deleted file mode 100644
index d6a29801481c..000000000000
--- a/ROMFS/px4fmu_common/mixers/quad_x_can.main.mix
+++ /dev/null
@@ -1 +0,0 @@
-R: 4x 10000 10000 10000 0
diff --git a/src/lib/matrix b/src/lib/matrix
index 8ef252767691..cf924956d7d6 160000
--- a/src/lib/matrix
+++ b/src/lib/matrix
@@ -1 +1 @@
-Subproject commit 8ef25276769127595c8cf27c2ee026a2954173cc
+Subproject commit cf924956d7d62ce18bfc4f8441e9177ddb69c0dc
diff --git a/src/modules/uavcan/actuators/esc.cpp b/src/modules/uavcan/actuators/esc.cpp
index 92d12ba32ca1..d24ad9a1faa3 100644
--- a/src/modules/uavcan/actuators/esc.cpp
+++ b/src/modules/uavcan/actuators/esc.cpp
@@ -136,25 +136,6 @@ void UavcanEscController::update_outputs(float *outputs, unsigned num_outputs)
 		}
 	}
 
-	/*
-	 * Remove channels that are always zero.
-	 * The objective of this optimization is to avoid broadcasting multi-frame transfers when a single frame
-	 * transfer would be enough. This is a valid optimization as the UAVCAN specification implies that all
-	 * non-specified ESC setpoints should be considered zero.
-	 * The positive outcome is a (marginally) lower bus traffic and lower CPU load.
-	 *
-	 * From the standpoint of the PX4 architecture, however, this is a hack. It should be investigated why
-	 * the mixer returns more outputs than are actually used.
-	 */
-	for (int index = int(msg.cmd.size()) - 1; index >= _max_number_of_nonzero_outputs; index--) {
-		if (msg.cmd[index] != 0) {
-			_max_number_of_nonzero_outputs = index + 1;
-			break;
-		}
-	}
-
-	msg.cmd.resize(_max_number_of_nonzero_outputs);
-
 	/*
 	 * Publish the command message to the bus
 	 * Note that for a quadrotor it takes one CAN frame
diff --git a/src/modules/uavcan/actuators/esc.hpp b/src/modules/uavcan/actuators/esc.hpp
index ced372f7c684..40b151e30825 100644
--- a/src/modules/uavcan/actuators/esc.hpp
+++ b/src/modules/uavcan/actuators/esc.hpp
@@ -107,7 +107,6 @@ class UavcanEscController
 	 * ESC states
 	 */
 	uint32_t 			_armed_mask = 0;
-	uint8_t				_max_number_of_nonzero_outputs = 0;
 
 	/*
 	 * Perf counters
diff --git a/src/modules/uavcan/uavcan_main.cpp b/src/modules/uavcan/uavcan_main.cpp
index e6bbc0d07827..204d8c79a3c3 100644
--- a/src/modules/uavcan/uavcan_main.cpp
+++ b/src/modules/uavcan/uavcan_main.cpp
@@ -110,6 +110,18 @@ UavcanNode::UavcanNode(uavcan::ICanDriver &can_driver, uavcan::ISystemClock &sys
 	}
 	/* _server_command_sem use case is a signal */
 	px4_sem_setprotocol(&_server_command_sem, SEM_PRIO_NONE);
+
+	if (_perfcnt_node_spin_elapsed == nullptr) {
+		errx(1, "uavcan: couldn't allocate _perfcnt_node_spin_elapsed");
+	}
+
+	if (_perfcnt_esc_mixer_output_elapsed == nullptr) {
+		errx(1, "uavcan: couldn't allocate _perfcnt_esc_mixer_output_elapsed");
+	}
+
+	if (_perfcnt_esc_mixer_total_elapsed == nullptr) {
+		errx(1, "uavcan: couldn't allocate _perfcnt_esc_mixer_total_elapsed");
+	}
 }
 
 UavcanNode::~UavcanNode()
@@ -152,6 +164,9 @@ UavcanNode::~UavcanNode()
 
 	_instance = nullptr;
 
+	perf_free(_perfcnt_node_spin_elapsed);
+	perf_free(_perfcnt_esc_mixer_output_elapsed);
+	perf_free(_perfcnt_esc_mixer_total_elapsed);
 	pthread_mutex_destroy(&_node_mutex);
 	px4_sem_destroy(&_server_command_sem);
 
@@ -682,6 +697,7 @@ int UavcanNode::init(uavcan::NodeID node_id)
 
 void UavcanNode::node_spin_once()
 {
+	perf_begin(_perfcnt_node_spin_elapsed);
 	const int spin_res = _node.spinOnce();
 
 	if (spin_res < 0) {
@@ -692,6 +708,8 @@ void UavcanNode::node_spin_once()
 	if (_tx_injector != nullptr) {
 		_tx_injector->injectTxFramesInto(_node);
 	}
+
+	perf_end(_perfcnt_node_spin_elapsed);
 }
 
 /*
@@ -850,8 +868,12 @@ int UavcanNode::run()
 		// Mutex is unlocked while the thread is blocked on IO multiplexing
 		(void)pthread_mutex_unlock(&_node_mutex);
 
+		perf_end(_perfcnt_esc_mixer_total_elapsed); // end goes first, it's not a mistake
+
 		const int poll_ret = ::poll(_poll_fds, _poll_fds_num, PollTimeoutMs);
 
+		perf_begin(_perfcnt_esc_mixer_total_elapsed);
+
 		(void)pthread_mutex_lock(&_node_mutex);
 
 		node_spin_once();  // Non-blocking
@@ -943,7 +965,9 @@ int UavcanNode::run()
 
 			// Output to the bus
 			_outputs.timestamp = hrt_absolute_time();
+			perf_begin(_perfcnt_esc_mixer_output_elapsed);
 			_esc_controller.update_outputs(_outputs.output, _outputs.noutputs);
+			perf_end(_perfcnt_esc_mixer_output_elapsed);
 		}
 
 
diff --git a/src/modules/uavcan/uavcan_main.hpp b/src/modules/uavcan/uavcan_main.hpp
index f84dff1630c9..b962333a0608 100644
--- a/src/modules/uavcan/uavcan_main.hpp
+++ b/src/modules/uavcan/uavcan_main.hpp
@@ -209,6 +209,10 @@ class UavcanNode : public device::CDev
 	// index into _poll_fds for each _control_subs handle
 	uint8_t				_poll_ids[NUM_ACTUATOR_CONTROL_GROUPS_UAVCAN];
 
+	perf_counter_t _perfcnt_node_spin_elapsed		= perf_alloc(PC_ELAPSED, "uavcan_node_spin_elapsed");
+	perf_counter_t _perfcnt_esc_mixer_output_elapsed	= perf_alloc(PC_ELAPSED, "uavcan_esc_mixer_output_elapsed");
+	perf_counter_t _perfcnt_esc_mixer_total_elapsed		= perf_alloc(PC_ELAPSED, "uavcan_esc_mixer_total_elapsed");
+
 	void handle_time_sync(const uavcan::TimerEvent &);
 
 	typedef uavcan::MethodBinder<UavcanNode *, void (UavcanNode::*)(const uavcan::TimerEvent &)> TimerCallback;
diff --git a/src/systemcmds/tests/test_matrix.cpp b/src/systemcmds/tests/test_matrix.cpp
index 029f36ceb54f..d2e574810f71 100644
--- a/src/systemcmds/tests/test_matrix.cpp
+++ b/src/systemcmds/tests/test_matrix.cpp
@@ -317,9 +317,9 @@ bool MatrixTest::filterTests()
 
 bool MatrixTest::helperTests()
 {
-	ut_test(::fabs(wrap_pi(4.0) - (4.0 - 2 * M_PI)) < 1e-5);
-	ut_test(::fabs(wrap_pi(-4.0) - (-4.0 + 2 * M_PI)) < 1e-5);
-	ut_test(::fabs(wrap_pi(3.0) - (3.0)) < 1e-3);
+	ut_test(fabs(wrap_pi(4.0) - (4.0 - 2 * M_PI)) < 1e-5);
+	ut_test(fabs(wrap_pi(-4.0) - (-4.0 + 2 * M_PI)) < 1e-5);
+	ut_test(fabs(wrap_pi(3.0) - (3.0)) < 1e-3);
 	wrap_pi(NAN);
 
 	Vector3f a(1, 2, 3);