diff --git a/3rdparty/ffmpeg b/3rdparty/ffmpeg
index 9a2df87789eb..10d0ebc0b8c7 160000
--- a/3rdparty/ffmpeg
+++ b/3rdparty/ffmpeg
@@ -1 +1 @@
-Subproject commit 9a2df87789ebfecf64d35d732e5847662fbd5520
+Subproject commit 10d0ebc0b8c7c4f0b242c9998c8bdc4e55bb5067
diff --git a/rpcs3/Emu/Audio/AudioBackend.h b/rpcs3/Emu/Audio/AudioBackend.h
index d6978ea66dd4..497b4c97e665 100644
--- a/rpcs3/Emu/Audio/AudioBackend.h
+++ b/rpcs3/Emu/Audio/AudioBackend.h
@@ -221,6 +221,45 @@ class AudioBackend
 		}
 	}
 
+	// Runtime dispatcher for the templated downmixers: reduces src_ch_cnt channels to dst_ch_cnt channels.
+	// Returns without touching dst if no reduction is requested; throws on unsupported combinations.
+	static void downmix(u32 sample_cnt, u32 src_ch_cnt, u32 dst_ch_cnt, const f32* src, f32* dst)
+	{
+		// Nothing to do unless the channel count actually shrinks
+		if (src_ch_cnt <= dst_ch_cnt)
+		{
+			return;
+		}
+
+		constexpr u32 ch_stereo = static_cast<u32>(AudioChannelCnt::STEREO);
+		constexpr u32 ch_5_1 = static_cast<u32>(AudioChannelCnt::SURROUND_5_1);
+		constexpr u32 ch_7_1 = static_cast<u32>(AudioChannelCnt::SURROUND_7_1);
+
+		// 7.1 -> 5.1
+		if (src_ch_cnt == ch_7_1 && dst_ch_cnt == ch_5_1)
+		{
+			AudioBackend::downmix<AudioChannelCnt::SURROUND_7_1, AudioChannelCnt::SURROUND_5_1>(sample_cnt, src, dst);
+			return;
+		}
+
+		// 7.1 -> 2.0
+		if (src_ch_cnt == ch_7_1 && dst_ch_cnt == ch_stereo)
+		{
+			AudioBackend::downmix<AudioChannelCnt::SURROUND_7_1, AudioChannelCnt::STEREO>(sample_cnt, src, dst);
+			return;
+		}
+
+		// 5.1 -> 2.0
+		if (src_ch_cnt == ch_5_1 && dst_ch_cnt == ch_stereo)
+		{
+			AudioBackend::downmix<AudioChannelCnt::SURROUND_5_1, AudioChannelCnt::STEREO>(sample_cnt, src, dst);
+			return;
+		}
+
+		// Every other combination is unsupported
+		fmt::throw_exception("Invalid downmix combination: %u -> %u", src_ch_cnt, dst_ch_cnt);
+	}
+
 protected:
 	AudioSampleSize m_sample_size = AudioSampleSize::FLOAT;
 	AudioFreq       m_sampling_rate = AudioFreq::FREQ_48K;
diff --git a/rpcs3/Emu/Audio/audio_resampler.cpp b/rpcs3/Emu/Audio/audio_resampler.cpp
index c2d318db40d8..b029172dfaee 100644
--- a/rpcs3/Emu/Audio/audio_resampler.cpp
+++ b/rpcs3/Emu/Audio/audio_resampler.cpp
@@ -33,8 +33,11 @@ void audio_resampler::put_samples(const f32* buf, u32 sample_cnt)
 
 std::pair<f32* /* buffer */, u32 /* samples */> audio_resampler::get_samples(u32 sample_cnt)
 {
+	// NOTE: Fetch the buffer pointer in its own statement before calling receiveSamples().
+	//       The evaluation order of function arguments is unspecified, so both calls must not share one call expression.
+	//       receiveSamples() presumably advances the position that bufBegin() reports (SoundTouch FIFO API - confirm).
 	f32 *const buf = resampler.bufBegin();
 	return std::make_pair(buf, resampler.receiveSamples(sample_cnt));
 }
 
 u32 audio_resampler::samples_available() const
diff --git a/rpcs3/Emu/Cell/Modules/cellAudio.cpp b/rpcs3/Emu/Cell/Modules/cellAudio.cpp
index c46b09b8a5ed..117c21ab71eb 100644
--- a/rpcs3/Emu/Cell/Modules/cellAudio.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellAudio.cpp
@@ -5,6 +5,7 @@
 #include "Emu/Cell/lv2/sys_process.h"
 #include "Emu/Cell/lv2/sys_event.h"
 #include "cellAudio.h"
+#include "util/video_provider.h"
 
 #include <cmath>
 
@@ -69,7 +70,7 @@ void cell_audio_config::reset(bool backend_changed)
 	const AudioFreq freq = AudioFreq::FREQ_48K;
 	const AudioSampleSize sample_size = raw.convert_to_s16 ? AudioSampleSize::S16 : AudioSampleSize::FLOAT;
 
-	const auto [req_ch_cnt, downmix] = AudioBackend::get_channel_count_and_downmixer(0); // CELL_AUDIO_OUT_PRIMARY
+	const auto& [req_ch_cnt, downmix] = AudioBackend::get_channel_count_and_downmixer(0); // CELL_AUDIO_OUT_PRIMARY
 	f64 cb_frame_len = 0.0;
 	u32 ch_cnt = 2;
 
@@ -276,52 +277,26 @@ void audio_ringbuffer::process_resampled_data()
 {
 	if (!cfg.time_stretching_enabled) return;
 
-	const auto [buffer, samples] = resampler.get_samples(static_cast<u32>(cb_ringbuf.get_free_size() / (cfg.audio_sample_size * static_cast<u32>(cfg.backend_ch_cnt))));
+	const auto& [buffer, samples] = resampler.get_samples(static_cast<u32>(cb_ringbuf.get_free_size() / (cfg.audio_sample_size * static_cast<u32>(cfg.backend_ch_cnt))));
 	commit_data(buffer, samples);
 }
 
 void audio_ringbuffer::commit_data(f32* buf, u32 sample_cnt)
 {
-	sample_cnt *= cfg.audio_channels;
+	const u32 sample_cnt_in = sample_cnt * cfg.audio_channels;
+	const u32 sample_cnt_out = sample_cnt * static_cast<u32>(cfg.backend_ch_cnt);
 
 	// Dump audio if enabled
-	m_dump.WriteData(buf, sample_cnt * static_cast<u32>(AudioSampleSize::FLOAT));
+	m_dump.WriteData(buf, sample_cnt_in * static_cast<u32>(AudioSampleSize::FLOAT));
 
-	if (cfg.backend_ch_cnt < AudioChannelCnt{cfg.audio_channels})
+	// Record audio if enabled
+	if (utils::video_provider& provider = g_fxo->get<utils::video_provider>(); provider.can_consume_sample())
 	{
-		if (AudioChannelCnt{cfg.audio_channels} == AudioChannelCnt::SURROUND_7_1)
-		{
-			if (cfg.backend_ch_cnt == AudioChannelCnt::SURROUND_5_1)
-			{
-				AudioBackend::downmix<AudioChannelCnt::SURROUND_7_1, AudioChannelCnt::SURROUND_5_1>(sample_cnt, buf, buf);
-			}
-			else if (cfg.backend_ch_cnt == AudioChannelCnt::STEREO)
-			{
-				AudioBackend::downmix<AudioChannelCnt::SURROUND_7_1, AudioChannelCnt::STEREO>(sample_cnt, buf, buf);
-			}
-			else
-			{
-				fmt::throw_exception("Invalid downmix combination: %u -> %u", cfg.audio_channels, static_cast<u32>(cfg.backend_ch_cnt));
-			}
-		}
-		else if (AudioChannelCnt{cfg.audio_channels} == AudioChannelCnt::SURROUND_5_1)
-		{
-			if (cfg.backend_ch_cnt == AudioChannelCnt::STEREO)
-			{
-				AudioBackend::downmix<AudioChannelCnt::SURROUND_5_1, AudioChannelCnt::STEREO>(sample_cnt, buf, buf);
-			}
-			else
-			{
-				fmt::throw_exception("Invalid downmix combination: %u -> %u", cfg.audio_channels, static_cast<u32>(cfg.backend_ch_cnt));
-			}
-		}
-		else
-		{
-			fmt::throw_exception("Invalid downmix combination: %u -> %u", cfg.audio_channels, static_cast<u32>(cfg.backend_ch_cnt));
-		}
+		provider.present_samples(reinterpret_cast<u8*>(buf), sample_cnt, static_cast<u32>(cfg.audio_channels));
 	}
 
-	const u32 sample_cnt_out = sample_cnt / cfg.audio_channels * static_cast<u32>(cfg.backend_ch_cnt);
+	// Downmix if necessary
+	AudioBackend::downmix(sample_cnt_in, cfg.audio_channels, static_cast<u32>(cfg.backend_ch_cnt), buf, buf);
 
 	if (cfg.backend->get_convert_to_s16())
 	{
diff --git a/rpcs3/Emu/Cell/Modules/cellAudio.h b/rpcs3/Emu/Cell/Modules/cellAudio.h
index 0f2bfe7d523d..4225bdbca659 100644
--- a/rpcs3/Emu/Cell/Modules/cellAudio.h
+++ b/rpcs3/Emu/Cell/Modules/cellAudio.h
@@ -223,8 +223,8 @@ struct cell_audio_config
 
 	AudioChannelCnt audio_downmix = AudioChannelCnt::SURROUND_7_1;
 	AudioChannelCnt backend_ch_cnt = AudioChannelCnt::SURROUND_7_1;
-	u32 audio_channels = 0;
-	u32 audio_sampling_rate = 0;
+	u32 audio_channels = 2;
+	u32 audio_sampling_rate = DEFAULT_AUDIO_SAMPLING_RATE;
 	u32 audio_block_period = 0;
 	u32 audio_sample_size = 0;
 	f64 audio_min_buffer_duration = 0.0;
diff --git a/rpcs3/Emu/Cell/Modules/cellRec.cpp b/rpcs3/Emu/Cell/Modules/cellRec.cpp
index 53165ef6a5dd..c793fbca887f 100644
--- a/rpcs3/Emu/Cell/Modules/cellRec.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellRec.cpp
@@ -5,6 +5,7 @@
 #include "Emu/IdManager.h"
 #include "Emu/system_config.h"
 #include "Emu/VFS.h"
+#include "Emu/Audio/AudioBackend.h"
 #include "cellRec.h"
 #include "cellSysutil.h"
 #include "util/media_utils.h"
@@ -136,36 +137,69 @@ struct rec_param
 			video_input, audio_input, audio_input_mix_vol, reduce_memsize, show_xmb, filename, metadata_filename, spurs_param.pSpurs, spurs_param.spu_usage_rate,
 			priority, movie_metadata.to_string(), scene_metadata.to_string());
 	}
+
+	bool use_external_audio() const // True if audio samples provided via cellRec are part of the recording
+	{
+		return audio_input != CELL_REC_PARAM_AUDIO_INPUT_DISABLE             // != DISABLE means that cellRec will add samples on its own
+			&& audio_input_mix_vol > CELL_REC_PARAM_AUDIO_INPUT_MIX_VOL_MIN; // Above the minimum mix volume, so the cellRec samples are actually used
+	}
+
+	bool use_internal_audio() const // True if the emulator's own audio output is part of the recording
+	{
+		const bool no_cellrec_audio = audio_input == CELL_REC_PARAM_AUDIO_INPUT_DISABLE; // DISABLE means that cellRec won't add samples on its own
+		return no_cellrec_audio || audio_input_mix_vol < CELL_REC_PARAM_AUDIO_INPUT_MIX_VOL_MAX; // Below max volume we need to mix cellRec audio with internal audio
+	}
+
+	bool use_internal_video() const // True if cellRec does not supply the video frames itself
+	{
+		return video_input == CELL_REC_PARAM_VIDEO_INPUT_DISABLE; // DISABLE means that cellRec won't add frames on its own
+	}
 };
 
-constexpr u32 rec_framerate = 30; // Always 30 fps
+static constexpr u32 rec_framerate = 30; // Always 30 fps
+static constexpr u32 rec_channels = 2; // Always 2 channels
 
-class rec_image_sink : public utils::image_sink
+class rec_video_sink : public utils::video_sink
 {
 public:
-	rec_image_sink() : utils::image_sink()
+	rec_video_sink() : utils::video_sink()
 	{
 		m_framerate = rec_framerate;
 	}
 
+	void set_sample_rate(u32 sample_rate) // Stores the given audio sample rate in m_sample_rate
+	{
+		m_sample_rate = sample_rate;
+	}
+
 	void stop(bool flush = true) override
 	{
-		cellRec.notice("Stopping image sink. flush=%d", flush);
+		cellRec.notice("Stopping video sink. flush=%d", flush);
 
 		std::lock_guard lock(m_mtx);
 		m_flush = flush;
+		m_paused = false; // A stopped sink is no longer marked as paused
 		m_frames_to_encode.clear();
+		m_samples_to_encode.clear(); // Drop the queued audio samples along with the queued video frames
 		has_error = false;
 	}
 
-	void add_frame(std::vector<u8>& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms) override
+	void pause(bool flush = true) override // Marks the sink as paused and stores the flush flag
 	{
+		cellRec.notice("Pausing video sink. flush=%d", flush);
+
 		std::lock_guard lock(m_mtx);
+		m_flush = flush;
+		m_paused = true; // Cleared again by resume() and stop()
+	}
 
-		if (m_flush)
-			return;
+	void resume() override // Clears both the paused and the flush flag
+	{
+		cellRec.notice("Resuming video sink");
 
-		m_frames_to_encode.emplace_back(timestamp_ms, pitch, width, height, pixel_format, std::move(frame));
+		std::lock_guard lock(m_mtx);
+		m_flush = false; // Undo the flush flag that pause() or stop() may have set
+		m_paused = false;
 	}
 
 	encoder_frame get_frame()
@@ -181,6 +215,20 @@ class rec_image_sink : public utils::image_sink
 
 		return {};
 	}
+
+	encoder_sample get_sample() // Pops the front of the audio sample queue; yields an empty sample if nothing is queued
+	{
+		std::lock_guard lock(m_mtx);
+
+		if (m_samples_to_encode.empty())
+		{
+			return {};
+		}
+
+		encoder_sample front_sample = std::move(m_samples_to_encode.front());
+		m_samples_to_encode.pop_front();
+		return front_sample;
+	}
 };
 
 struct rec_info
@@ -196,11 +244,21 @@ struct rec_info
 	vm::bptr<u8> video_input_buffer{}; // Used by the game to inject a frame right before it would render a frame to the screen.
 	vm::bptr<u8> audio_input_buffer{}; // Used by the game to inject audio: 2-channel interleaved (left-right) * 256 samples * sizeof(f32) at 48000 kHz
 
-	std::vector<utils::image_sink::encoder_frame> video_ringbuffer;
-	std::vector<u8> audio_ringbuffer;
+	// Wrapper for our audio data: one fixed-size block of raw sample bytes together with its audio pts
+	struct audio_block
+	{
+		// 2-channel interleaved (left-right), 256 samples, float
+		static constexpr usz block_size = rec_channels * CELL_REC_AUDIO_BLOCK_SAMPLES * sizeof(f32);
+		std::array<u8, block_size> block{}; // Raw bytes of the samples in this block
+		s64 pts{}; // Audio pts assigned to this block
+	};
+
+	std::vector<utils::video_sink::encoder_frame> video_ringbuffer;
+	std::vector<audio_block> audio_ringbuffer;
 	usz video_ring_pos = 0;
+	usz audio_ring_pos = 0;
 	usz video_ring_frame_count = 0;
-	usz audio_ring_step = 0;
+	usz audio_ring_block_count = 0;
 
 	usz next_video_ring_pos()
 	{
@@ -209,11 +267,17 @@ struct rec_info
 		return pos;
 	}
 
-	std::shared_ptr<rec_image_sink> image_sink;
+	usz next_audio_ring_pos() // Returns the current audio ring slot and moves the position to the next one
+	{
+		const usz current = audio_ring_pos; // Slot handed out to the caller
+		audio_ring_pos = (current + 1) % audio_ringbuffer.size(); // Advance with wrap-around
+		return current;
+	}
+
+	std::shared_ptr<rec_video_sink> sink;
 	std::shared_ptr<utils::video_encoder> encoder;
-	std::unique_ptr<named_thread<std::function<void()>>> image_provider_thread;
+	std::unique_ptr<named_thread<std::function<void()>>> video_provider_thread;
 	atomic_t<bool> paused = false;
-	s64 last_pts = -1;
 
 	// Video parameters
 	utils::video_encoder::frame_format output_format{};
@@ -221,13 +285,13 @@ struct rec_info
 	u32 video_bps = 512000;
 	s32 video_codec_id = 12; // AV_CODEC_ID_MPEG4
 	s32 max_b_frames = 2;
-	const u32 fps = rec_framerate; // Always 30 fps
+	static constexpr u32 fps = rec_framerate; // Always 30 fps
 
 	// Audio parameters
 	u32 sample_rate = 48000;
 	u32 audio_bps = 64000;
 	s32 audio_codec_id = 86018; // AV_CODEC_ID_AAC
-	const u32 channels = 2; // Always 2 channels
+	static constexpr u32 channels = rec_channels; // Always 2 channels
 
 	// Recording duration
 	atomic_t<u64> recording_time_start = 0;
@@ -240,9 +304,9 @@ struct rec_info
 	void set_video_params(s32 video_format);
 	void set_audio_params(s32 audio_format);
 
-	void start_image_provider();
-	void pause_image_provider();
-	void stop_image_provider(bool flush);
+	void start_video_provider();
+	void pause_video_provider();
+	void stop_video_provider(bool flush);
 };
 
 void rec_info::set_video_params(s32 video_format)
@@ -507,37 +571,42 @@ void rec_info::set_audio_params(s32 audio_format)
 	cellRec.notice("set_audio_params: audio_format=0x%x, audio_codec_id=%d, sample_rate=%d, audio_bps=%d", audio_format, audio_codec_id, sample_rate, audio_bps);
 }
 
-void rec_info::start_image_provider()
+void rec_info::start_video_provider()
 {
 	const bool was_paused = paused.exchange(false);
 	utils::video_provider& video_provider = g_fxo->get<utils::video_provider>();
 
-	if (image_provider_thread && was_paused)
+	if (video_provider_thread && was_paused)
 	{
 		// Resume
 		const u64 pause_time_end = get_system_time();
 		ensure(pause_time_end > pause_time_start);
 		pause_time_total += (pause_time_end - pause_time_start);
-		video_provider.set_pause_time(pause_time_total / 1000);
-		cellRec.notice("Resuming image provider.");
+		video_provider.set_pause_time_us(pause_time_total);
+		cellRec.notice("Resuming video provider.");
 		return;
 	}
 
-	cellRec.notice("Starting image provider.");
+	cellRec.notice("Starting video provider.");
 
 	recording_time_start = get_system_time();
+	pause_time_start = 0;
 	pause_time_total = 0;
-	video_provider.set_pause_time(0);
+	video_provider.set_pause_time_us(0);
 
-	image_provider_thread = std::make_unique<named_thread<std::function<void()>>>("cellRec Image Provider", [this]()
+	video_provider_thread = std::make_unique<named_thread<std::function<void()>>>("cellRec video provider", [this]()
 	{
-		const bool use_internal_audio = param.audio_input == CELL_REC_PARAM_AUDIO_INPUT_DISABLE || param.audio_input_mix_vol < 100;
-		const bool use_external_audio = param.audio_input != CELL_REC_PARAM_AUDIO_INPUT_DISABLE && param.audio_input_mix_vol > 0;
-		const bool use_external_video = param.video_input != CELL_REC_PARAM_VIDEO_INPUT_DISABLE;
+		const bool use_internal_audio = param.use_internal_audio();
+		const bool use_external_audio = param.use_external_audio();
+		const bool use_external_video = !param.use_internal_video();
 		const bool use_ring_buffer = param.ring_sec > 0;
 		const usz frame_size = input_format.pitch * input_format.height;
+		audio_block buffer_external{}; // for cellRec input
+		audio_block buffer_internal{}; // for cellAudio input
+		s64 last_video_pts = -1;
+		s64 last_audio_pts = -1;
 
-		cellRec.notice("image_provider_thread: use_ring_buffer=%d, video_ringbuffer_size=%d, audio_ringbuffer_size=%d, ring_sec=%d, frame_size=%d, use_external_video=%d, use_external_audio=%d, use_internal_audio=%d", use_ring_buffer, video_ringbuffer.size(), audio_ringbuffer.size(), param.ring_sec, frame_size, use_external_video, use_external_audio, use_internal_audio);
+		cellRec.notice("video_provider_thread: use_ring_buffer=%d, video_ringbuffer_size=%d, audio_ringbuffer_size=%d, ring_sec=%d, frame_size=%d, use_internal_video=%d, use_external_audio=%d, use_internal_audio=%d", use_ring_buffer, video_ringbuffer.size(), audio_ringbuffer.size(), param.ring_sec, frame_size, !use_external_video, use_external_audio, use_internal_audio); // Log the flags this thread acts on; the encoder is not dereferenced here because it is only null-checked by the loop below
 
 		while (thread_ctrl::state() != thread_state::aborting && encoder)
 		{
@@ -563,19 +632,25 @@ void rec_info::start_image_provider()
 				continue;
 			}
 
+			// We only care for new video frames or audio samples that can be properly encoded, so we check the timestamps and pts.
 			const usz timestamp_ms = (get_system_time() - recording_time_start - pause_time_total) / 1000;
 
-			// We only care for new video frames that can be properly encoded
+			/////////////////
+			//    VIDEO    //
+			/////////////////
+
 			// TODO: wait for flip before adding a frame
 			if (use_external_video)
 			{
-				if (const s64 pts = encoder->get_pts(timestamp_ms); pts > last_pts)
+				// The video frames originate from cellRec instead of our render pipeline.
+				if (const s64 pts = encoder->get_pts(timestamp_ms); pts > last_video_pts)
 				{
 					if (video_input_buffer)
 					{
 						if (use_ring_buffer)
 						{
-							utils::image_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
+							// The video frames originate from cellRec and are stored in a ringbuffer.
+							utils::video_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
 							frame_data.pts = pts;
 							frame_data.width = input_format.width;
 							frame_data.height = input_format.height;
@@ -586,107 +661,225 @@ void rec_info::start_image_provider()
 						}
 						else
 						{
+							// The video frames originate from cellRec and are pushed to the encoder immediately.
 							std::vector<u8> frame(frame_size);
 							std::memcpy(frame.data(), video_input_buffer.get_ptr(), frame.size());
 							encoder->add_frame(frame, input_format.pitch, input_format.width, input_format.height, input_format.av_pixel_format, timestamp_ms);
 						}
 					}
 
-					last_pts = pts;
+					last_video_pts = pts;
 				}
 			}
-			else if (use_ring_buffer && image_sink)
+			else if (sink)
 			{
-				utils::image_sink::encoder_frame frame = image_sink->get_frame();
+				// The video frames originate from our render pipeline.
+				utils::video_sink::encoder_frame frame = sink->get_frame();
 
-				if (const s64 pts = encoder->get_pts(frame.timestamp_ms); pts > last_pts && frame.data.size() > 0)
+				if (const s64 pts = encoder->get_pts(frame.timestamp_ms); pts > last_video_pts && !frame.data.empty())
 				{
 					ensure(frame.data.size() == frame_size);
-					utils::image_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
-					frame_data = std::move(frame);
-					frame_data.pts = pts;
-					last_pts = pts;
-					video_ring_frame_count++;
+
+					if (use_ring_buffer)
+					{
+						// The video frames originate from our render pipeline and are stored in a ringbuffer.
+						frame.pts = pts;
+						video_ringbuffer[next_video_ring_pos()] = std::move(frame);
+						video_ring_frame_count++;
+					}
+					else
+					{
+						// The video frames originate from our render pipeline and are directly encoded by the encoder.
+						encoder->add_frame(frame.data, frame.pitch, frame.width, frame.height, frame.av_pixel_format, frame.timestamp_ms);
+					}
+
+					last_video_pts = pts;
 				}
 			}
 
-			if (use_internal_audio)
-			{
-				// TODO: fetch audio
-			}
+			/////////////////
+			//    AUDIO    //
+			/////////////////
 
-			if (use_external_audio && audio_input_buffer)
+			const usz timestamp_us = get_system_time() - recording_time_start - pause_time_total;
+			bool got_new_samples = false;
+
+			if (use_external_audio)
 			{
-				// 2-channel interleaved (left-right), 256 samples, float
-				std::array<f32, 2 * 256> audio_data{};
-				std::memcpy(audio_data.data(), audio_input_buffer.get_ptr(), audio_data.size() * sizeof(f32));
+				if (const s64 pts = encoder->get_audio_pts(timestamp_us); pts > last_audio_pts)
+				{
+					if (audio_input_buffer)
+					{
+						// The audio samples originate from cellRec instead of our render pipeline.
+						// TODO: This needs to be synchronized with the game somehow if possible.
+						std::memcpy(buffer_external.block.data(), audio_input_buffer.get_ptr(), buffer_external.block.size());
+						buffer_external.pts = pts;
+						got_new_samples = true;
+					}
 
-				// TODO: mix audio with param.audio_input_mix_vol
+					last_audio_pts = pts;
+				}
 			}
 
-			if (use_ring_buffer)
+			if (sink && use_internal_audio)
 			{
-				// TODO: add audio properly
-				//std::memcpy(&ringbuffer[get_ring_pos(pts) + ring_audio_offset], audio_data.data(), audio_data.size());
+				// The audio samples originate from cellAudio and are stored in a ringbuffer.
+				utils::video_sink::encoder_sample sample = sink->get_sample();
+
+				if (!sample.data.empty() && sample.channels >= channels && sample.sample_count >= CELL_REC_AUDIO_BLOCK_SAMPLES)
+				{
+					s64 pts = encoder->get_audio_pts(sample.timestamp_us);
+
+					// Each encoder_sample can have more than one block
+					for (u32 i = 0; i < sample.sample_count; i += CELL_REC_AUDIO_BLOCK_SAMPLES)
+					{
+						if (pts > last_audio_pts)
+						{
+							const f32* src = reinterpret_cast<const f32*>(&sample.data[i * sample.channels * sizeof(f32)]);
+
+							// Copy the new samples to the internal buffer if we need them for volume mixing below.
+							// Otherwise copy them directly to the external buffer which is used for output later.
+							audio_block& dst_buffer = got_new_samples ? buffer_internal : buffer_external;
+
+							if (sample.channels > channels)
+							{
+								// Downmix channels
+								AudioBackend::downmix(CELL_REC_AUDIO_BLOCK_SAMPLES * sample.channels, sample.channels, channels, src, reinterpret_cast<f32*>(dst_buffer.block.data()));
+							}
+							else
+							{
+								std::memcpy(dst_buffer.block.data(), src, audio_block::block_size);
+							}
+
+							// Mix external and internal audio with param.audio_input_mix_vol if we already got samples from cellRec.
+							if (got_new_samples)
+							{
+								const float volume = std::clamp(param.audio_input_mix_vol / 100.0f, 0.0f, 1.0f);
+								const f32* src = reinterpret_cast<const f32*>(buffer_internal.block.data());
+								f32* dst = reinterpret_cast<f32*>(buffer_external.block.data());
+
+								for (u32 sample = 0; sample < (CELL_REC_AUDIO_BLOCK_SAMPLES * channels); sample++)
+								{
+									*dst = std::clamp(*dst + (*src++ * volume), -1.0f, 1.0f);
+									++dst;
+								}
+							}
+
+							last_audio_pts = std::max(pts, last_audio_pts); // The cellAudio pts may be older than the pts from cellRec
+							buffer_external.pts = last_audio_pts;
+							got_new_samples = true;
+						}
+
+						// We only take the first sample for simplicity for now
+						break;
+
+						// Increase pts for each sample block
+						//pts++;
+					}
+				}
 			}
-			else
+
+			if (got_new_samples)
 			{
-				// TODO: add audio to encoder
+				if (use_ring_buffer)
+				{
+					// Copy new sample to ringbuffer
+					audio_ringbuffer[next_audio_ring_pos()] = buffer_external;
+					audio_ring_block_count++;
+				}
+				else
+				{
+					// Push new sample to encoder
+					encoder->add_audio_samples(buffer_external.block.data(), CELL_REC_AUDIO_BLOCK_SAMPLES, channels, timestamp_us);
+				}
 			}
 
 			// Update recording time
-			recording_time_total = encoder->get_timestamp_ms(encoder->last_pts());
+			recording_time_total = encoder->get_timestamp_ms(encoder->last_video_pts());
 
-			thread_ctrl::wait_for(100);
+			thread_ctrl::wait_for(1);
 		}
 	});
 }
 
-void rec_info::pause_image_provider()
+void rec_info::pause_video_provider() // Flags the recording as paused and remembers when the pause began
 {
-	cellRec.notice("Pausing image provider.");
+	cellRec.notice("Pausing video provider.");
 
-	if (image_provider_thread)
+	if (video_provider_thread) // Nothing to pause if the provider thread was never created
 	{
 		paused = true;
 		pause_time_start = get_system_time();
 	}
 }
 
-void rec_info::stop_image_provider(bool flush)
+void rec_info::stop_video_provider(bool flush) // Stops the provider thread and, if requested, feeds the buffered ring data to the encoder
 {
-	cellRec.notice("Stopping image provider.");
+	cellRec.notice("Stopping video provider.");
 
-	if (image_provider_thread)
+	if (video_provider_thread)
 	{
-		auto& thread = *image_provider_thread;
+		auto& thread = *video_provider_thread;
 		thread = thread_state::aborting;
 		thread();
-		image_provider_thread.reset();
+		video_provider_thread.reset();
 	}
 
-	if (flush && param.ring_sec > 0 && !video_ringbuffer.empty())
+	// Flush the ringbuffer if necessary.
+	// This should only happen if the video sink is not the encoder itself.
+	// In this case the encoder should have been idle until now.
+	if (flush && param.ring_sec > 0 && (!video_ringbuffer.empty() || !audio_ringbuffer.empty()))
 	{
 		cellRec.notice("Flushing video ringbuffer.");
 
 		// Fill encoder with data from ringbuffer
 		// TODO: ideally the encoder should do this on the fly and overwrite old frames in the file.
 		ensure(encoder);
+		encoder->encode(); // cellRecStart skips encode() in ringbuffer mode, so it is called here before any data is added
 
 		const usz frame_count = std::min(video_ringbuffer.size(), video_ring_frame_count);
-		const usz start_offset = video_ring_frame_count < video_ringbuffer.size() ? 0 : video_ring_frame_count;
-		const s64 start_pts = video_ringbuffer[start_offset % video_ringbuffer.size()].pts;
+		const usz video_start_offset = video_ring_frame_count < video_ringbuffer.size() ? 0 : video_ring_frame_count; // 0 until the ring has wrapped; afterwards the running frame count, used modulo the ring size
+		const s64 video_start_pts = video_ringbuffer.empty() ? 0 : video_ringbuffer[video_start_offset % video_ringbuffer.size()].pts; // pts of the start slot; frame timestamps below are relative to it
+
+		const usz block_count = std::min(audio_ringbuffer.size(), audio_ring_block_count);
+		const usz audio_start_offset = audio_ring_block_count < audio_ringbuffer.size() ? 0 : audio_ring_block_count; // Same scheme as for the video ring
+		const s64 audio_start_pts = audio_ringbuffer.empty() ? 0 : audio_ringbuffer[audio_start_offset % audio_ringbuffer.size()].pts; // pts of the start slot; block timestamps below are relative to it
 
-		for (usz i = 0; i < frame_count; i++)
+		// Interleave frames and sample blocks: after each frame, add the blocks whose timestamp does not exceed that frame's timestamp
+		for (usz sync_timestamp_us = 0, frame = 0, block = 0; frame < frame_count || block < block_count; frame++)
 		{
-			const usz pos = (start_offset + i) % video_ringbuffer.size();
-			utils::image_sink::encoder_frame& frame_data = video_ringbuffer[pos];
-			encoder->add_frame(frame_data.data, frame_data.pitch, frame_data.width, frame_data.height, frame_data.av_pixel_format, encoder->get_timestamp_ms(frame_data.pts - start_pts));
+			// Add one frame (skipped once all frames were added and only sample blocks remain)
+			if (frame < frame_count)
+			{
+				const usz pos = (video_start_offset + frame) % video_ringbuffer.size();
+				utils::video_sink::encoder_frame& frame_data = video_ringbuffer[pos];
+				const usz timestamp_ms = encoder->get_timestamp_ms(frame_data.pts - video_start_pts);
+				encoder->add_frame(frame_data.data, frame_data.pitch, frame_data.width, frame_data.height, frame_data.av_pixel_format, timestamp_ms);
 
-			// TODO: add audio data to encoder
+				// Move the sync timestamp to this frame (converted from milliseconds to microseconds)
+				sync_timestamp_us = timestamp_ms * 1000;
+			}
+
+			// Add all sample blocks up to the sync timestamp (or all remaining blocks once the frames are used up)
+			for (usz i = block; i < block_count; i++)
+			{
+				const usz pos = (audio_start_offset + i) % audio_ringbuffer.size();
+				const audio_block& sample_block = audio_ringbuffer[pos];
+				const usz timestamp_us = encoder->get_audio_timestamp_us(sample_block.pts - audio_start_pts);
+
+				// Stop adding new samples if the sync timestamp is exceeded, unless we already added all the frames.
+				if (timestamp_us > sync_timestamp_us && frame < frame_count)
+				{
+					break;
+				}
+
+				encoder->add_audio_samples(sample_block.block.data(), CELL_REC_AUDIO_BLOCK_SAMPLES, channels, timestamp_us);
+				block++; // The next outer iteration resumes after the blocks consumed so far
+			}
 		}
 
 		video_ringbuffer.clear();
+		audio_ringbuffer.clear(); // Both rings are emptied after their content was handed to the encoder
 	}
 }
 
@@ -927,11 +1120,11 @@ error_code cellRecOpen(vm::cptr<char> pDirName, vm::cptr<char> pFileName, vm::cp
 
 			if (opt.value.audio_input == CELL_REC_PARAM_AUDIO_INPUT_DISABLE)
 			{
-				rec.param.audio_input_mix_vol = 0;
+				rec.param.audio_input_mix_vol = CELL_REC_PARAM_AUDIO_INPUT_MIX_VOL_MIN;
 			}
 			else
 			{
-				rec.param.audio_input_mix_vol = 100;
+				rec.param.audio_input_mix_vol = CELL_REC_PARAM_AUDIO_INPUT_MIX_VOL_MAX;
 			}
 			break;
 		}
@@ -1051,11 +1244,16 @@ error_code cellRecOpen(vm::cptr<char> pDirName, vm::cptr<char> pFileName, vm::cp
 
 	rec.cb = cb;
 	rec.cbUserData = cbUserData;
-	rec.last_pts = -1;
-	rec.audio_ringbuffer.clear();
 	rec.video_ringbuffer.clear();
-	rec.video_ring_frame_count = 0;
+	rec.audio_ringbuffer.clear();
 	rec.video_ring_pos = 0;
+	rec.audio_ring_pos = 0;
+	rec.video_ring_frame_count = 0;
+	rec.audio_ring_block_count = 0;
+	rec.recording_time_start = 0;
+	rec.recording_time_total = 0;
+	rec.pause_time_start = 0;
+	rec.pause_time_total = 0;
 	rec.paused = false;
 
 	rec.set_video_params(pParam->videoFmt);
@@ -1063,25 +1261,32 @@ error_code cellRecOpen(vm::cptr<char> pDirName, vm::cptr<char> pFileName, vm::cp
 
 	if (rec.param.ring_sec > 0)
 	{
-		const u32 audio_size_per_sample = rec.channels * sizeof(float);
-		const u32 audio_size_per_second = rec.sample_rate * audio_size_per_sample;
-		const usz audio_ring_buffer_size = rec.param.ring_sec * audio_size_per_second;
+		const usz audio_ring_buffer_size = (static_cast<usz>(rec.param.ring_sec) * rec.sample_rate + CELL_REC_AUDIO_BLOCK_SAMPLES - 1) / CELL_REC_AUDIO_BLOCK_SAMPLES; // Number of audio blocks covering ring_sec seconds, rounded up with integer math to avoid f32 rounding
 		const usz video_ring_buffer_size = rec.param.ring_sec * rec.fps;
 
 		cellRec.notice("Preparing ringbuffer for %d seconds. video_ring_buffer_size=%d, audio_ring_buffer_size=%d, pitch=%d, width=%d, height=%d", rec.param.ring_sec, video_ring_buffer_size, audio_ring_buffer_size, rec.input_format.pitch, rec.input_format.width, rec.input_format.height);
 
 		rec.audio_ringbuffer.resize(audio_ring_buffer_size);
-		rec.audio_ring_step = audio_size_per_sample;
-		rec.video_ringbuffer.resize(video_ring_buffer_size, {});
-		rec.image_sink = std::make_shared<rec_image_sink>();
+		rec.video_ringbuffer.resize(video_ring_buffer_size);
+	}
+
+	if (rec.param.use_internal_audio() || rec.param.use_internal_video())
+	{
+		rec.sink = std::make_shared<rec_video_sink>();
+		rec.sink->use_internal_audio = rec.param.use_internal_audio();
+		rec.sink->use_internal_video = rec.param.use_internal_video();
+		rec.sink->set_sample_rate(rec.sample_rate);
 	}
 
 	rec.encoder = std::make_shared<utils::video_encoder>();
+	rec.encoder->use_internal_audio = false; // We use the other sink
+	rec.encoder->use_internal_video = false; // We use the other sink
 	rec.encoder->set_path(vfs::get(rec.param.filename));
 	rec.encoder->set_framerate(rec.fps);
 	rec.encoder->set_video_bitrate(rec.video_bps);
 	rec.encoder->set_video_codec(rec.video_codec_id);
 	rec.encoder->set_sample_rate(rec.sample_rate);
+	rec.encoder->set_audio_channels(rec.channels);
 	rec.encoder->set_audio_bitrate(rec.audio_bps);
 	rec.encoder->set_audio_codec(rec.audio_codec_id);
 	rec.encoder->set_output_format(rec.output_format);
@@ -1113,13 +1318,13 @@ error_code cellRecClose(s32 isDiscard)
 
 		if (isDiscard)
 		{
-			// No need to flush
-			rec.stop_image_provider(false);
+			// No need to flush the encoder
+			rec.stop_video_provider(false);
 			rec.encoder->stop(false);
 
-			if (rec.image_sink)
+			if (rec.sink)
 			{
-				rec.image_sink->stop(false);
+				rec.sink->stop(true);
 			}
 
 			if (fs::is_file(rec.param.filename))
@@ -1135,18 +1340,18 @@ error_code cellRecClose(s32 isDiscard)
 		else
 		{
 			// Flush to make sure we encode all remaining frames
-			rec.stop_image_provider(true);
+			rec.stop_video_provider(true);
 			rec.encoder->stop(true);
-			rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_pts());
+			rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_video_pts());
 
-			if (rec.image_sink)
+			if (rec.sink)
 			{
-				rec.image_sink->stop(true);
+				rec.sink->stop(true);
 			}
 
 			const s64 start_pts = rec.encoder->get_pts(rec.param.scene_metadata.start_time);
 			const s64 end_pts = rec.encoder->get_pts(rec.param.scene_metadata.end_time);
-			const s64 last_pts = rec.encoder->last_pts();
+			const s64 last_pts = rec.encoder->last_video_pts();
 
 			is_valid_range = start_pts >= 0 && end_pts <= last_pts;
 		}
@@ -1157,8 +1362,8 @@ error_code cellRecClose(s32 isDiscard)
 		g_fxo->need<utils::video_provider>();
 		utils::video_provider& video_provider = g_fxo->get<utils::video_provider>();
 
-		// Release the image sink if it was used
-		if (rec.param.video_input == CELL_REC_PARAM_VIDEO_INPUT_DISABLE)
+		// Release the video sink if it was used
+		if (rec.param.use_internal_video() || rec.param.use_internal_audio())
 		{
 			const recording_mode old_mode = g_recording_mode.exchange(recording_mode::stopped);
 
@@ -1167,15 +1372,15 @@ error_code cellRecClose(s32 isDiscard)
 				cellRec.error("cellRecClose: Unexpected recording mode %s found while stopping video capture.", old_mode);
 			}
 
-			if (!video_provider.set_image_sink(nullptr, recording_mode::cell))
+			if (!video_provider.set_video_sink(nullptr, recording_mode::cell))
 			{
-				cellRec.error("cellRecClose failed to release image sink");
+				cellRec.error("cellRecClose failed to release video sink");
 			}
 		}
 
 		rec.param = {};
 		rec.encoder.reset();
-		rec.image_sink.reset();
+		rec.sink.reset();
 		rec.audio_ringbuffer.clear();
 		rec.video_ringbuffer.clear();
 		rec.state = rec_state::closed;
@@ -1207,24 +1412,18 @@ error_code cellRecStop()
 
 	sysutil_register_cb([&rec](ppu_thread& ppu) -> s32
 	{
-		// Disable image sink if it was used
-		if (rec.param.video_input == CELL_REC_PARAM_VIDEO_INPUT_DISABLE)
-		{
-			const recording_mode old_mode = g_recording_mode.exchange(recording_mode::stopped);
+		// cellRecStop actually just pauses the recording
+		rec.pause_video_provider();
 
-			if (old_mode != recording_mode::cell && old_mode != recording_mode::stopped)
-			{
-				cellRec.error("cellRecStop: Unexpected recording mode %s found while stopping video capture. (ring_sec=%d)", old_mode, rec.param.ring_sec);
-			}
+		if (rec.sink)
+		{
+			rec.sink->pause(true);
 		}
 
-		// cellRecStop actually just pauses the recording
-		rec.pause_image_provider();
-
 		ensure(!!rec.encoder);
 		rec.encoder->pause(true);
 
-		rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_pts());
+		rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_video_pts());
 		rec.state = rec_state::stopped;
 
 		rec.cb(ppu, CELL_REC_STATUS_STOP, CELL_OK, rec.cbUserData);
@@ -1249,33 +1448,23 @@ error_code cellRecStart()
 	{
 		// Start/resume the recording
 		ensure(!!rec.encoder);
-		rec.encoder->encode();
+
+		if (rec.param.ring_sec == 0)
+		{
+			rec.encoder->encode();
+		}
 
 		g_fxo->need<utils::video_provider>();
 		utils::video_provider& video_provider = g_fxo->get<utils::video_provider>();
 
-		// Setup an image sink if it is needed
-		if (rec.param.video_input == CELL_REC_PARAM_VIDEO_INPUT_DISABLE)
+		// Setup a video sink if it is needed
+		if (rec.param.use_internal_video() || rec.param.use_internal_audio())
 		{
-			if (rec.param.ring_sec <= 0)
+			if (rec.sink && !video_provider.set_video_sink(rec.sink, recording_mode::cell))
 			{
-				// Regular recording
-				if (!video_provider.set_image_sink(rec.encoder, recording_mode::cell))
-				{
-					cellRec.error("Failed to set image sink");
-					rec.cb(ppu, CELL_REC_STATUS_ERR, CELL_REC_ERROR_FATAL, rec.cbUserData);
-					return CELL_OK;
-				}
-			}
-			else
-			{
-				// Ringbuffer recording
-				if (!video_provider.set_image_sink(rec.image_sink, recording_mode::cell))
-				{
-					cellRec.error("Failed to set image sink");
-					rec.cb(ppu, CELL_REC_STATUS_ERR, CELL_REC_ERROR_FATAL, rec.cbUserData);
-					return CELL_OK;
-				}
+				cellRec.error("Failed to set video sink");
+				rec.cb(ppu, CELL_REC_STATUS_ERR, CELL_REC_ERROR_FATAL, rec.cbUserData);
+				return CELL_OK;
 			}
 
 			// Force rsx recording
@@ -1287,7 +1476,12 @@ error_code cellRecStart()
 			g_recording_mode = recording_mode::stopped;
 		}
 
-		rec.start_image_provider();
+		rec.start_video_provider();
+
+		if (rec.sink)
+		{
+			rec.sink->resume();
+		}
 
 		if (rec.encoder->has_error)
 		{
diff --git a/rpcs3/Emu/Cell/Modules/cellSail.cpp b/rpcs3/Emu/Cell/Modules/cellSail.cpp
index 3fd558beb6dc..76a6d3c6fda0 100644
--- a/rpcs3/Emu/Cell/Modules/cellSail.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSail.cpp
@@ -639,7 +639,7 @@ error_code cellSailPlayerInitialize2(ppu_thread& ppu,
 	pSelf->paused = true;
 
 	{
-		CellSailEvent event;
+		CellSailEvent event{};
 		event.u32x2.major = CELL_SAIL_EVENT_PLAYER_STATE_CHANGED;
 		event.u32x2.minor = 0;
 		pSelf->callback(ppu, pSelf->callbackArg, event, CELL_SAIL_PLAYER_STATE_INITIALIZED, 0);
@@ -778,7 +778,7 @@ error_code cellSailPlayerBoot(ppu_thread& ppu, vm::ptr<CellSailPlayer> pSelf, u6
 	cellSail.warning("cellSailPlayerBoot(pSelf=*0x%x, userParam=%d)", pSelf, userParam);
 
 	{
-		CellSailEvent event;
+		CellSailEvent event{};
 		event.u32x2.major = CELL_SAIL_EVENT_PLAYER_STATE_CHANGED;
 		event.u32x2.minor = 0;
 		pSelf->callback(ppu, pSelf->callbackArg, event, CELL_SAIL_PLAYER_STATE_BOOT_TRANSITION, 0);
@@ -788,7 +788,7 @@ error_code cellSailPlayerBoot(ppu_thread& ppu, vm::ptr<CellSailPlayer> pSelf, u6
 	pSelf->booted = true;
 
 	{
-		CellSailEvent event;
+		CellSailEvent event{};
 		event.u32x2.major = CELL_SAIL_EVENT_PLAYER_CALL_COMPLETED;
 		event.u32x2.minor = CELL_SAIL_PLAYER_CALL_BOOT;
 		pSelf->callback(ppu, pSelf->callbackArg, event, 0, 0);
diff --git a/rpcs3/Emu/Cell/lv2/sys_rsxaudio.cpp b/rpcs3/Emu/Cell/lv2/sys_rsxaudio.cpp
index 701d741341e4..b11065748b1b 100644
--- a/rpcs3/Emu/Cell/lv2/sys_rsxaudio.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_rsxaudio.cpp
@@ -1842,39 +1842,8 @@ u32 rsxaudio_backend_thread::write_data_callback(u32 bytes, void* buf)
 			return bytes;
 		}
 
-		if (cb_cfg.input_ch_cnt > cb_cfg.output_ch_cnt)
-		{
-			if (cb_cfg.input_ch_cnt == static_cast<u32>(AudioChannelCnt::SURROUND_7_1))
-			{
-				if (cb_cfg.output_ch_cnt == static_cast<u32>(AudioChannelCnt::SURROUND_5_1))
-				{
-					AudioBackend::downmix<AudioChannelCnt::SURROUND_7_1, AudioChannelCnt::SURROUND_5_1>(sample_cnt, callback_tmp_buf.data(), callback_tmp_buf.data());
-				}
-				else if (cb_cfg.output_ch_cnt == static_cast<u32>(AudioChannelCnt::STEREO))
-				{
-					AudioBackend::downmix<AudioChannelCnt::SURROUND_7_1, AudioChannelCnt::STEREO>(sample_cnt, callback_tmp_buf.data(), callback_tmp_buf.data());
-				}
-				else
-				{
-					fmt::throw_exception("Invalid downmix combination: %u -> %u", cb_cfg.input_ch_cnt, cb_cfg.output_ch_cnt);
-				}
-			}
-			else if (cb_cfg.input_ch_cnt == static_cast<u32>(AudioChannelCnt::SURROUND_5_1))
-			{
-				if (cb_cfg.output_ch_cnt == static_cast<u32>(AudioChannelCnt::STEREO))
-				{
-					AudioBackend::downmix<AudioChannelCnt::SURROUND_5_1, AudioChannelCnt::STEREO>(sample_cnt, callback_tmp_buf.data(), callback_tmp_buf.data());
-				}
-				else
-				{
-					fmt::throw_exception("Invalid downmix combination: %u -> %u", cb_cfg.input_ch_cnt, cb_cfg.output_ch_cnt);
-				}
-			}
-			else
-			{
-				fmt::throw_exception("Invalid downmix combination: %u -> %u", cb_cfg.input_ch_cnt, cb_cfg.output_ch_cnt);
-			}
-		}
+		// Downmix if necessary
+		AudioBackend::downmix(sample_cnt, cb_cfg.input_ch_cnt, cb_cfg.output_ch_cnt, callback_tmp_buf.data(), callback_tmp_buf.data());
 
 		if (cb_cfg.target_volume != cb_cfg.current_volume)
 		{
diff --git a/rpcs3/Emu/Io/recording_config.h b/rpcs3/Emu/Io/recording_config.h
index f1e2e58242ae..127d24015fd5 100644
--- a/rpcs3/Emu/Io/recording_config.h
+++ b/rpcs3/Emu/Io/recording_config.h
@@ -8,14 +8,29 @@ struct cfg_recording final : cfg::node
 	bool load();
 	void save() const;
 
-	cfg::uint<0, 60> framerate{this, "Framerate", 30};
-	cfg::uint<0, 7680> width{this, "Width", 1280};
-	cfg::uint<0, 4320> height{this, "Height", 720};
-	cfg::uint<0, 192> pixel_format{this, "AVPixelFormat", 0}; // AVPixelFormat::AV_PIX_FMT_YUV420P
-	cfg::uint<0, 32813> video_codec{this, "AVCodecID", 12}; // AVCodecID::AV_CODEC_ID_MPEG4
-	cfg::uint<0, 25000000> video_bps{this, "Video Bitrate", 4000000};
-	cfg::uint<0, 5> max_b_frames{this, "Max B-Frames", 2};
-	cfg::uint<0, 20> gop_size{this, "Group of Pictures Size", 12};
+	struct node_video : cfg::node // Video recording settings (child config node named "Video")
+	{
+		node_video(cfg::node* _this) : cfg::node(_this, "Video") {}
+
+		cfg::uint<0, 60> framerate{this, "Framerate", 30};
+		cfg::uint<0, 7680> width{this, "Width", 1280};
+		cfg::uint<0, 4320> height{this, "Height", 720};
+		cfg::uint<0, 192> pixel_format{this, "AVPixelFormat", 0}; // AVPixelFormat::AV_PIX_FMT_YUV420P
+		cfg::uint<0, 0xFFFF> video_codec{this, "AVCodecID", 12}; // AVCodecID::AV_CODEC_ID_MPEG4
+		cfg::uint<0, 25000000> video_bps{this, "Video Bitrate", 4000000};
+		cfg::uint<0, 5> max_b_frames{this, "Max B-Frames", 2};
+		cfg::uint<0, 20> gop_size{this, "Group of Pictures Size", 12};
+
+	} video{ this };
+
+	struct node_audio : cfg::node // Audio recording settings (child config node named "Audio")
+	{
+		node_audio(cfg::node* _this) : cfg::node(_this, "Audio") {}
+		
+		cfg::uint<0x10000, 0x17000> audio_codec{this, "AVCodecID", 86018}; // AVCodecID::AV_CODEC_ID_AAC
+		cfg::uint<0, 25000000> audio_bps{this, "Audio Bitrate", 320000};
+
+	} audio{ this };
 
 	const std::string path;
 };
diff --git a/rpcs3/Loader/PSF.cpp b/rpcs3/Loader/PSF.cpp
index 7d7e878bee5c..e455a71404bd 100644
--- a/rpcs3/Loader/PSF.cpp
+++ b/rpcs3/Loader/PSF.cpp
@@ -295,7 +295,7 @@ namespace psf
 
 		for (const auto& entry : psf)
 		{
-			def_table_t index;
+			def_table_t index{};
 			index.key_off = ::narrow<u32>(key_offset);
 			index.param_fmt = entry.second.type();
 			index.param_len = entry.second.size();
@@ -313,7 +313,7 @@ namespace psf
 		key_offset = utils::align(key_offset, 4);
 
 		// Generate header
-		header_t header;
+		header_t header{};
 		header.magic = "\0PSF"_u32;
 		header.version = 0x101;
 		header.off_key_table = ::narrow<u32>(sizeof(header_t) + sizeof(def_table_t) * psf.size());
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index cc7c60f51d01..0d7a9a74ea1d 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -618,7 +618,7 @@
     <ClInclude Include="Loader\mself.hpp" />
     <ClInclude Include="util\atomic.hpp" />
     <ClInclude Include="util\bless.hpp" />
-    <ClInclude Include="util\image_sink.h" />
+    <ClInclude Include="util\video_sink.h" />
     <ClInclude Include="util\video_provider.h" />
     <ClInclude Include="util\media_utils.h" />
     <ClInclude Include="util\serialization.hpp" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index 5a11ef535faf..447430681ef8 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -2275,7 +2275,7 @@
     <ClInclude Include="util\video_provider.h">
       <Filter>Utilities</Filter>
     </ClInclude>
-    <ClInclude Include="util\image_sink.h">
+    <ClInclude Include="util\video_sink.h">
       <Filter>Utilities</Filter>
     </ClInclude>
     <ClInclude Include="Emu\Io\recording_config.h">
diff --git a/rpcs3/rpcs3qt/gs_frame.cpp b/rpcs3/rpcs3qt/gs_frame.cpp
index f14213cb5275..498e4ac3f544 100644
--- a/rpcs3/rpcs3qt/gs_frame.cpp
+++ b/rpcs3/rpcs3qt/gs_frame.cpp
@@ -12,6 +12,7 @@
 #include "Emu/IdManager.h"
 #include "Emu/Cell/Modules/cellScreenshot.h"
 #include "Emu/Cell/Modules/cellVideoOut.h"
+#include "Emu/Cell/Modules/cellAudio.h"
 #include "Emu/RSX/rsx_utils.h"
 #include "Emu/RSX/Overlays/overlay_message.h"
 #include "Emu/Io/recording_config.h"
@@ -445,9 +446,9 @@ void gs_frame::toggle_recording()
 	{
 		m_video_encoder->stop();
 
-		if (!video_provider.set_image_sink(nullptr, recording_mode::rpcs3))
+		if (!video_provider.set_video_sink(nullptr, recording_mode::rpcs3))
 		{
-			gui_log.warning("The video provider could not release the image sink. A sink with higher priority must have been set.");
+			gui_log.warning("The video provider could not release the video sink. A sink with higher priority must have been set.");
 		}
 
 		// Play a sound
@@ -489,21 +490,24 @@ void gs_frame::toggle_recording()
 		video_path += "recording_" + date_time::current_time_narrow<'_'>() + ".mp4";
 
 		utils::video_encoder::frame_format output_format{};
-		output_format.av_pixel_format = static_cast<AVPixelFormat>(g_cfg_recording.pixel_format.get());
-		output_format.width = g_cfg_recording.width;
-		output_format.height = g_cfg_recording.height;
-		output_format.pitch = g_cfg_recording.width * 4;
+		output_format.av_pixel_format = static_cast<AVPixelFormat>(g_cfg_recording.video.pixel_format.get());
+		output_format.width = g_cfg_recording.video.width;
+		output_format.height = g_cfg_recording.video.height;
+		output_format.pitch = g_cfg_recording.video.width * 4;
 
+		m_video_encoder->use_internal_audio = true;
+		m_video_encoder->use_internal_video = true;
 		m_video_encoder->set_path(video_path);
-		m_video_encoder->set_framerate(g_cfg_recording.framerate);
-		m_video_encoder->set_video_bitrate(g_cfg_recording.video_bps);
-		m_video_encoder->set_video_codec(g_cfg_recording.video_codec);
-		m_video_encoder->set_max_b_frames(g_cfg_recording.max_b_frames);
-		m_video_encoder->set_gop_size(g_cfg_recording.gop_size);
+		m_video_encoder->set_framerate(g_cfg_recording.video.framerate);
+		m_video_encoder->set_video_bitrate(g_cfg_recording.video.video_bps);
+		m_video_encoder->set_video_codec(g_cfg_recording.video.video_codec);
+		m_video_encoder->set_max_b_frames(g_cfg_recording.video.max_b_frames);
+		m_video_encoder->set_gop_size(g_cfg_recording.video.gop_size);
 		m_video_encoder->set_output_format(output_format);
-		m_video_encoder->set_sample_rate(0);   // TODO
-		m_video_encoder->set_audio_bitrate(0); // TODO
-		m_video_encoder->set_audio_codec(0);   // TODO
+		m_video_encoder->set_sample_rate(g_fxo->get<cell_audio>().cfg.audio_sampling_rate);
+		m_video_encoder->set_audio_channels(static_cast<u32>(g_fxo->get<cell_audio>().cfg.audio_channels));
+		m_video_encoder->set_audio_bitrate(g_cfg_recording.audio.audio_bps);
+		m_video_encoder->set_audio_codec(g_cfg_recording.audio.audio_codec);
 		m_video_encoder->encode();
 
 		if (m_video_encoder->has_error)
@@ -513,15 +517,15 @@ void gs_frame::toggle_recording()
 			return;
 		}
 
-		if (!video_provider.set_image_sink(m_video_encoder, recording_mode::rpcs3))
+		if (!video_provider.set_video_sink(m_video_encoder, recording_mode::rpcs3))
 		{
-			gui_log.warning("The video provider could not set the image sink. A sink with higher priority must have been set.");
+			gui_log.warning("The video provider could not set the video sink. A sink with higher priority must have been set.");
 			rsx::overlays::queue_message(tr("Recording not possible").toStdString());
 			m_video_encoder->stop();
 			return;
 		}
 
-		video_provider.set_pause_time(0);
+		video_provider.set_pause_time_us(0);
 
 		g_recording_mode = recording_mode::rpcs3;
 
diff --git a/rpcs3/rpcs3qt/gs_frame.h b/rpcs3/rpcs3qt/gs_frame.h
index 4872de1dad43..a5e69fa958b8 100644
--- a/rpcs3/rpcs3qt/gs_frame.h
+++ b/rpcs3/rpcs3qt/gs_frame.h
@@ -5,7 +5,6 @@
 #include "util/types.hpp"
 #include "util/atomic.hpp"
 #include "util/media_utils.h"
-#include "util/video_provider.h"
 #include "Emu/RSX/GSFrameBase.h"
 
 #include <QWindow>
diff --git a/rpcs3/util/image_sink.h b/rpcs3/util/image_sink.h
deleted file mode 100644
index 3c23eca514ce..000000000000
--- a/rpcs3/util/image_sink.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#pragma once
-
-#include "util/types.hpp"
-#include "util/atomic.hpp"
-#include "Utilities/mutex.h"
-
-#include <deque>
-#include <cmath>
-
-namespace utils
-{
-	class image_sink
-	{
-	public:
-		image_sink() = default;
-
-		virtual void stop(bool flush = true) = 0;
-		virtual void add_frame(std::vector<u8>& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms) = 0;
-
-		s64 get_pts(usz timestamp_ms) const
-		{
-			return static_cast<s64>(std::round((timestamp_ms * m_framerate) / 1000.f));
-		}
-
-		usz get_timestamp_ms(s64 pts) const
-		{
-			return static_cast<usz>(std::round((pts * 1000) / static_cast<float>(m_framerate)));
-		}
-
-		atomic_t<bool> has_error{false};
-
-		struct encoder_frame
-		{
-			encoder_frame() = default;
-			encoder_frame(usz timestamp_ms, u32 pitch, u32 width, u32 height, s32 av_pixel_format, std::vector<u8>&& data)
-				: timestamp_ms(timestamp_ms), pitch(pitch), width(width), height(height), av_pixel_format(av_pixel_format), data(std::move(data))
-			{}
-
-			s64 pts = -1; // Optional
-			usz timestamp_ms = 0;
-			u32 pitch = 0;
-			u32 width = 0;
-			u32 height = 0;
-			s32 av_pixel_format = 0; // NOTE: Make sure this is a valid AVPixelFormat
-			std::vector<u8> data;
-		};
-
-	protected:
-		shared_mutex m_mtx;
-		std::deque<encoder_frame> m_frames_to_encode;
-		atomic_t<bool> m_flush = false;
-		u32 m_framerate = 0;
-	};
-}
diff --git a/rpcs3/util/media_utils.cpp b/rpcs3/util/media_utils.cpp
index 3dddd5b0f872..9e21298dabf1 100644
--- a/rpcs3/util/media_utils.cpp
+++ b/rpcs3/util/media_utils.cpp
@@ -32,6 +32,28 @@ LOG_CHANNEL(media_log, "Media");
 
 namespace utils
 {
+	template <typename T>
+	static inline void write_byteswapped(const u8* src, u8* dst) // Stores one T at dst, read from src through be_t<T>
+	{
+		*reinterpret_cast<T*>(dst) = *reinterpret_cast<const be_t<T>*>(src); // NOTE(review): presumably be_t<T> converts from big-endian to native on read - confirm
+	}
+
+	template <typename T>
+	static inline void copy_samples(const u8* src, u8* dst, usz sample_count, bool swap_endianness) // Copies sample_count values of type T from src to dst
+	{
+		if (swap_endianness) // Convert each sample individually via write_byteswapped
+		{
+			for (usz i = 0; i < sample_count; i++)
+			{
+				write_byteswapped<T>(src + i * sizeof(T), dst + i * sizeof(T));
+			}
+		}
+		else
+		{
+			std::memcpy(dst, src, sample_count * sizeof(T)); // No conversion requested: copy the whole range at once
+		}
+	}
+
 	template <>
 	std::string media_info::get_metadata(const std::string& key, const std::string& def) const
 	{
@@ -204,11 +226,19 @@ namespace utils
 
 	struct scoped_av
 	{
-		AVFormatContext* format = nullptr;
-		const AVCodec* codec = nullptr;
-		AVCodecContext* context = nullptr;
-		AVFrame* frame = nullptr;
-		AVStream* stream = nullptr;
+		struct ctx
+		{
+			const AVCodec* codec = nullptr;
+			AVCodecContext* context = nullptr;
+			AVStream* stream = nullptr;
+			AVPacket* packet = nullptr;
+			AVFrame* frame = nullptr;
+		};
+
+		ctx audio{};
+		ctx video{};
+
+		AVFormatContext* format_context = nullptr;
 		SwrContext* swr = nullptr;
 		SwsContext* sws = nullptr;
 		std::function<void()> kill_callback = nullptr;
@@ -216,21 +246,38 @@ namespace utils
 		~scoped_av()
 		{
 			// Clean up
-			if (frame)
+			if (audio.frame)
+			{
+				av_frame_unref(audio.frame);
+				av_frame_free(&audio.frame);
+			}
+			if (video.frame)
+			{
+				av_frame_unref(video.frame);
+				av_frame_free(&video.frame);
+			}
+			if (audio.packet)
+			{
+				av_packet_unref(audio.packet);
+				av_packet_free(&audio.packet);
+			}
+			if (video.packet)
 			{
-				av_frame_unref(frame);
-				av_frame_free(&frame);
+				av_packet_unref(video.packet);
+				av_packet_free(&video.packet);
 			}
 			if (swr)
 				swr_free(&swr);
 			if (sws)
 				sws_freeContext(sws);
-			if (context)
-				avcodec_close(context);
+			if (audio.context)
+				avcodec_close(audio.context);
+			if (video.context)
+				avcodec_close(video.context);
 			// AVCodec is managed by libavformat, no need to free it
 			// see: https://stackoverflow.com/a/18047320
-			if (format)
-				avformat_free_context(format);
+			if (format_context)
+				avformat_free_context(format_context);
 			//if (stream)
 			//	av_free(stream);
 			if (kill_callback)
@@ -238,6 +285,107 @@ namespace utils
 		}
 	};
 
+	static std::string channel_layout_name(const AVChannelLayout& ch_layout) // Returns a textual description of ch_layout, or an empty string on failure
+	{
+		std::vector<char> ch_layout_buf(64); // Initial guess for the description length
+		int len = av_channel_layout_describe(&ch_layout, ch_layout_buf.data(), ch_layout_buf.size());
+		if (len < 0) // Negative results are treated as error codes
+		{
+			media_log.error("av_channel_layout_describe failed. Error: %d='%s'", len, av_error_to_string(len));
+			return {};
+		}
+
+		if (len > static_cast<int>(ch_layout_buf.size()))
+		{
+			// The description did not fit: try again with a buffer of the reported size
+			media_log.notice("av_channel_layout_describe needs a bigger buffer: len=%d", len);
+			ch_layout_buf.clear();
+			ch_layout_buf.resize(len);
+
+			len = av_channel_layout_describe(&ch_layout, ch_layout_buf.data(), ch_layout_buf.size());
+			if (len < 0)
+			{
+				media_log.error("av_channel_layout_describe failed. Error: %d='%s'", len, av_error_to_string(len));
+				return {};
+			}
+		}
+
+		return ch_layout_buf.data(); // Builds the std::string from the buffer up to its first null terminator
+	}
+
+	// Returns true if sample_fmt is listed in codec->sample_fmts (false for a null codec or a null list)
+	static bool check_sample_fmt(const AVCodec* codec, enum AVSampleFormat sample_fmt)
+	{
+		if (!codec) return false;
+
+		for (const AVSampleFormat* p = codec->sample_fmts; p && *p != AV_SAMPLE_FMT_NONE; p++) // The list ends at AV_SAMPLE_FMT_NONE
+		{
+			if (*p == sample_fmt)
+			{
+				return true;
+			}
+		}
+		return false;
+	}
+
+	// Pick the supported samplerate that is closest to 48000 Hz (returns 48000 if the codec or its samplerate list is null)
+	static int select_sample_rate(const AVCodec* codec)
+	{
+		if (!codec || !codec->supported_samplerates)
+			return 48000;
+
+		int best_samplerate = 0;
+		for (const int* samplerate = codec->supported_samplerates; samplerate && *samplerate != 0; samplerate++) // The list ends at 0
+		{
+			if (!best_samplerate || abs(48000 - *samplerate) < abs(48000 - best_samplerate)) // First entry is always taken, later ones only if closer to 48000
+			{
+				best_samplerate = *samplerate;
+			}
+		}
+		return best_samplerate;
+	}
+
+	AVChannelLayout get_preferred_channel_layout(int channels) // Maps a channel count to a default layout (2 = stereo, 6 = 5.1, 8 = 7.1)
+	{
+		switch (channels)
+		{
+		case 2:
+			return AV_CHANNEL_LAYOUT_STEREO;
+		case 6:
+			return AV_CHANNEL_LAYOUT_5POINT1;
+		case 8:
+			return AV_CHANNEL_LAYOUT_7POINT1;
+		default:
+			break;
+		}
+		return {}; // Any other channel count yields a value-initialized (zeroed) layout
+	}
+
+	static constexpr AVChannelLayout empty_ch_layout = {}; // Zeroed layout, used below to detect the end of codec->ch_layouts
+
+	// Select the codec's layout that has the given channel count and equals the preferred layout for it (nullptr if none matches)
+	static const AVChannelLayout* select_channel_layout(const AVCodec* codec, int channels)
+	{
+		if (!codec) return nullptr;
+
+		const AVChannelLayout preferred_ch_layout = get_preferred_channel_layout(channels);
+		const AVChannelLayout* found_ch_layout = nullptr;
+
+		for (const AVChannelLayout* ch_layout = codec->ch_layouts;
+			 ch_layout && memcmp(ch_layout, &empty_ch_layout, sizeof(AVChannelLayout)) != 0;
+			 ch_layout++)
+		{
+			media_log.notice("select_channel_layout: listing channel layout '%s' with %d channels", channel_layout_name(*ch_layout), ch_layout->nb_channels);
+
+			if (ch_layout->nb_channels == channels && memcmp(ch_layout, &preferred_ch_layout, sizeof(AVChannelLayout)) == 0) // Bytewise comparison of the whole struct
+			{
+				found_ch_layout = ch_layout; // No break here, so the remaining layouts are still logged
+			}
+		}
+
+		return found_ch_layout;
+	}
+
 	audio_decoder::audio_decoder()
 	{
 	}
@@ -263,7 +411,6 @@ namespace utils
 		track_fully_consumed = 0;
 		has_error = false;
 		m_size = 0;
-		duration_ms = 0;
 		timestamps_ms.clear();
 		data.clear();
 	}
@@ -295,14 +442,14 @@ namespace utils
 			scoped_av av;
 
 			// Get format from audio file
-			av.format = avformat_alloc_context();
-			if (int err = avformat_open_input(&av.format, path.c_str(), nullptr, nullptr); err < 0)
+			av.format_context = avformat_alloc_context();
+			if (int err = avformat_open_input(&av.format_context, path.c_str(), nullptr, nullptr); err < 0)
 			{
 				media_log.error("audio_decoder: Could not open file '%s'. Error: %d='%s'", path, err, av_error_to_string(err));
 				has_error = true;
 				return;
 			}
-			if (int err = avformat_find_stream_info(av.format, nullptr); err < 0)
+			if (int err = avformat_find_stream_info(av.format_context, nullptr); err < 0)
 			{
 				media_log.error("audio_decoder: Could not retrieve stream info from file '%s'. Error: %d='%s'", path, err, av_error_to_string(err));
 				has_error = true;
@@ -312,11 +459,11 @@ namespace utils
 			// Find the first audio stream
 			AVStream* stream = nullptr;
 			unsigned int stream_index;
-			for (stream_index = 0; stream_index < av.format->nb_streams; stream_index++)
+			for (stream_index = 0; stream_index < av.format_context->nb_streams; stream_index++)
 			{
-				if (av.format->streams[stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
+				if (av.format_context->streams[stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
 				{
-					stream = av.format->streams[stream_index];
+					stream = av.format_context->streams[stream_index];
 					break;
 				}
 			}
@@ -328,8 +475,8 @@ namespace utils
 			}
 
 			// Find decoder
-			av.codec = avcodec_find_decoder(stream->codecpar->codec_id);
-			if (!av.codec)
+			av.audio.codec = avcodec_find_decoder(stream->codecpar->codec_id);
+			if (!av.audio.codec)
 			{
 				media_log.error("audio_decoder: Failed to find decoder for stream #%u in file '%s'", stream_index, path);
 				has_error = true;
@@ -337,8 +484,8 @@ namespace utils
 			}
 
 			// Allocate context
-			av.context = avcodec_alloc_context3(av.codec);
-			if (!av.context)
+			av.audio.context = avcodec_alloc_context3(av.audio.codec);
+			if (!av.audio.context)
 			{
 				media_log.error("audio_decoder: Failed to allocate context for stream #%u in file '%s'", stream_index, path);
 				has_error = true;
@@ -346,7 +493,7 @@ namespace utils
 			}
 
 			// Open decoder
-			if (int err = avcodec_open2(av.context, av.codec, nullptr); err < 0)
+			if (int err = avcodec_open2(av.audio.context, av.audio.codec, nullptr); err < 0)
 			{
 				media_log.error("audio_decoder: Failed to open decoder for stream #%u in file '%s'. Error: %d='%s'", stream_index, path, err, av_error_to_string(err));
 				has_error = true;
@@ -389,23 +536,21 @@ namespace utils
 			}
 
 			// Prepare to read data
-			av.frame = av_frame_alloc();
-			if (!av.frame)
+			av.audio.frame = av_frame_alloc();
+			if (!av.audio.frame)
 			{
 				media_log.error("audio_decoder: Error allocating the frame");
 				has_error = true;
 				return;
 			}
 
-			duration_ms = stream->duration / 1000;
-
 			AVPacket* packet = av_packet_alloc();
 			std::unique_ptr<AVPacket, decltype([](AVPacket* p){av_packet_unref(p);})> packet_(packet);
 
 			// Iterate through frames
-			while (thread_ctrl::state() != thread_state::aborting && av_read_frame(av.format, packet) >= 0)
+			while (thread_ctrl::state() != thread_state::aborting && av_read_frame(av.format_context, packet) >= 0)
 			{
-				if (int err = avcodec_send_packet(av.context, packet); err < 0)
+				if (int err = avcodec_send_packet(av.audio.context, packet); err < 0)
 				{
 					media_log.error("audio_decoder: Queuing error: %d='%s'", err, av_error_to_string(err));
 					has_error = true;
@@ -414,7 +559,7 @@ namespace utils
 
 				while (thread_ctrl::state() != thread_state::aborting)
 				{
-					if (int err = avcodec_receive_frame(av.context, av.frame); err < 0)
+					if (int err = avcodec_receive_frame(av.audio.context, av.audio.frame); err < 0)
 					{
 						if (err == AVERROR(EAGAIN) || err == averror_eof)
 							break;
@@ -427,7 +572,7 @@ namespace utils
 					// Resample frames
 					u8* buffer;
 					const int align = 1;
-					const int buffer_size = av_samples_alloc(&buffer, nullptr, dst_channels, av.frame->nb_samples, dst_format, align);
+					const int buffer_size = av_samples_alloc(&buffer, nullptr, dst_channels, av.audio.frame->nb_samples, dst_format, align);
 					if (buffer_size < 0)
 					{
 						media_log.error("audio_decoder: Error allocating buffer: %d='%s'", buffer_size, av_error_to_string(buffer_size));
@@ -435,7 +580,7 @@ namespace utils
 						return;
 					}
 
-					const int frame_count = swr_convert(av.swr, &buffer, av.frame->nb_samples, const_cast<const uint8_t**>(av.frame->data), av.frame->nb_samples);
+					const int frame_count = swr_convert(av.swr, &buffer, av.audio.frame->nb_samples, const_cast<const uint8_t**>(av.audio.frame->data), av.audio.frame->nb_samples);
 					if (frame_count < 0)
 					{
 						media_log.error("audio_decoder: Error converting frame: %d='%s'", frame_count, av_error_to_string(frame_count));
@@ -450,25 +595,10 @@ namespace utils
 						std::scoped_lock lock(m_mtx);
 						data.resize(m_size + buffer_size);
 
-						if (m_swap_endianness)
-						{
-							// The format is float 32bit per channel.
-							const auto write_byteswapped = [](const void* src, void* dst) -> void
-							{
-								*static_cast<f32*>(dst) = *static_cast<const be_t<f32>*>(src);
-							};
+						// The format is float 32bit per channel.
+						copy_samples<f32>(buffer, &data[m_size], buffer_size / sizeof(f32), m_swap_endianness);
 
-							for (size_t i = 0; i < (buffer_size - sizeof(f32)); i += sizeof(f32))
-							{
-								write_byteswapped(buffer + i, data.data() + m_size + i);
-							}
-						}
-						else
-						{
-							memcpy(&data[m_size], buffer, buffer_size);
-						}
-
-						const s64 timestamp_ms = stream->time_base.den ? (1000 * av.frame->best_effort_timestamp * stream->time_base.num) / stream->time_base.den : 0;
+						const s64 timestamp_ms = stream->time_base.den ? (1000 * av.audio.frame->best_effort_timestamp * stream->time_base.num) / stream->time_base.den : 0;
 						timestamps_ms.push_back({m_size, timestamp_ms});
 						m_size += buffer_size;
 					}
@@ -476,7 +606,7 @@ namespace utils
 					if (buffer)
 						av_free(buffer);
 
-					media_log.notice("audio_decoder: decoded frame_count=%d buffer_size=%d timestamp_us=%d", frame_count, buffer_size, av.frame->best_effort_timestamp);
+					media_log.notice("audio_decoder: decoded frame_count=%d buffer_size=%d timestamp_us=%d", frame_count, buffer_size, av.audio.frame->best_effort_timestamp);
 				}
 			}
 		};
@@ -535,7 +665,7 @@ namespace utils
 	}
 
 	video_encoder::video_encoder()
-		: utils::image_sink()
+		: utils::video_sink()
 	{
 	}
 
@@ -549,9 +679,9 @@ namespace utils
 		return m_path;
 	}
 
-	s64 video_encoder::last_pts() const
+	s64 video_encoder::last_video_pts() const
 	{
-		return m_last_pts;
+		return m_last_video_pts;
 	}
 
 	void video_encoder::set_path(const std::string& path)
@@ -594,6 +724,11 @@ namespace utils
 		m_sample_rate = sample_rate;
 	}
 
+	void video_encoder::set_audio_channels(u32 channels) // Stores the audio channel count in m_channels
+	{
+		m_channels = channels;
+	}
+
 	void video_encoder::set_audio_bitrate(u32 bitrate)
 	{
 		m_audio_bitrate_bps = bitrate;
@@ -604,16 +739,6 @@ namespace utils
 		m_audio_codec_id = codec_id;
 	}
 
-	void video_encoder::add_frame(std::vector<u8>& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms)
-	{
-		// Do not allow new frames while flushing
-		if (m_flush)
-			return;
-
-		std::lock_guard lock(m_mtx);
-		m_frames_to_encode.emplace_back(timestamp_ms, pitch, width, height, pixel_format, std::move(frame));
-	}
-
 	void video_encoder::pause(bool flush)
 	{
 		if (m_thread)
@@ -658,24 +783,33 @@ namespace utils
 
 		std::lock_guard lock(m_mtx);
 		m_frames_to_encode.clear();
+		m_samples_to_encode.clear();
 		has_error = false;
 		m_flush = false;
 		m_paused = false;
 		m_running = false;
 	}
 
+	void video_encoder::resume() // Logs the resume and clears the flush and pause flags
+	{
+		media_log.notice("video_encoder: Resuming video encoder");
+
+		m_flush = false;
+		m_paused = false;
+	}
+
 	void video_encoder::encode()
 	{
 		if (m_running)
 		{
 			// Resume
-			m_flush = false;
-			m_paused = false;
+			resume();
 			media_log.success("video_encoder: resuming recording of '%s'", m_path);
 			return;
 		}
 
-		m_last_pts = 0;
+		m_last_audio_pts = 0;
+		m_last_video_pts = 0;
 
 		stop();
 
@@ -692,7 +826,34 @@ namespace utils
 		{
 			m_running = true;
 
-			// TODO: audio encoding
+			av_log_set_callback([](void* avcl, int level, const char* fmt, va_list vl) -> void
+			{
+				if (level > av_log_get_level())
+				{
+					return;
+				}
+
+				constexpr int line_size = 1024;
+				char line[line_size]{};
+				int print_prefix = 1;
+
+				if (int err = av_log_format_line2(avcl, level, fmt, vl, line, line_size, &print_prefix); err < 0)
+				{
+					media_log.error("av_log: av_log_format_line2 failed. Error: %d='%s'", err, av_error_to_string(err));
+					return;
+				}
+
+				std::string msg = line;
+				fmt::trim_back(msg, "\n\r\t ");
+
+				if (level <= AV_LOG_ERROR)
+					media_log.error("av_log: %s", msg);
+				else if (level <= AV_LOG_WARNING)
+					media_log.warning("av_log: %s", msg);
+				else
+					media_log.notice("av_log: %s", msg);
+			});
+			av_log_set_level(AV_LOG_ERROR);
 
 			// Reset variables at all costs
 			scoped_av av;
@@ -702,140 +863,183 @@ namespace utils
 				m_running = false;
 			};
 
-			const AVPixelFormat out_format = static_cast<AVPixelFormat>(m_out_format.av_pixel_format);
-			const char* av_output_format = nullptr;
-
-			const auto find_format = [&](const AVCodec* codec) -> const char*
+			// Let's list the encoders first
+			std::vector<const AVCodec*> audio_codecs;
+			std::vector<const AVCodec*> video_codecs;
+			void* opaque = nullptr;
+			while (const AVCodec* codec = av_codec_iterate(&opaque))
 			{
-				if (!codec)
-					return nullptr;
+				if (codec->type == AVMediaType::AVMEDIA_TYPE_AUDIO)
+				{
+					media_log.notice("video_encoder: Found audio codec %d = %s", static_cast<int>(codec->id), codec->name);
+					audio_codecs.push_back(codec);
+				}
+				else if (codec->type == AVMediaType::AVMEDIA_TYPE_VIDEO)
+				{
+					media_log.notice("video_encoder: Found video codec %d = %s", static_cast<int>(codec->id), codec->name);
+					video_codecs.push_back(codec);
+				}
+			}
 
+			const AVPixelFormat out_pix_format = static_cast<AVPixelFormat>(m_out_format.av_pixel_format);
+
+			const auto find_format = [&](AVCodecID video_codec, AVCodecID audio_codec) -> const AVOutputFormat*
+			{
 				// Try to find a preferable output format
 				std::vector<const AVOutputFormat*> oformats;
 
 				void* opaque = nullptr;
 				for (const AVOutputFormat* oformat = av_muxer_iterate(&opaque); !!oformat; oformat = av_muxer_iterate(&opaque))
 				{
-					if (avformat_query_codec(oformat, codec->id, FF_COMPLIANCE_STRICT) == 1)
+					media_log.notice("video_encoder: Listing output format '%s' (video_codec=%d, audio_codec=%d)", oformat->name, static_cast<int>(oformat->video_codec), static_cast<int>(oformat->audio_codec));
+					if (avformat_query_codec(oformat, video_codec, FF_COMPLIANCE_NORMAL) == 1 &&
+						avformat_query_codec(oformat, audio_codec, FF_COMPLIANCE_NORMAL) == 1)
 					{
-						media_log.notice("video_encoder: Found output format '%s'", oformat->name);
+						oformats.push_back(oformat);
+					}
+				}
 
-						switch (codec->id)
-						{
-						case AV_CODEC_ID_MPEG4:
-							if (strcmp(oformat->name, "avi") == 0)
-								return oformat->name;
-							break;
-						case AV_CODEC_ID_H264:
-						case AV_CODEC_ID_MJPEG:
-							// TODO
-							break;
-						default:
-							break;
-						}
+				for (const AVOutputFormat* oformat : oformats)
+				{
+					if (!oformat) continue;
+					media_log.notice("video_encoder: Found compatible output format '%s' (video_codec=%d, audio_codec=%d)", oformat->name, static_cast<int>(oformat->video_codec), static_cast<int>(oformat->audio_codec));
+				}
 
-						oformats.push_back(oformat);
+				// Select best match
+				for (const AVOutputFormat* oformat : oformats)
+				{
+					if (oformat && oformat->video_codec == video_codec && oformat->audio_codec == audio_codec)
+					{
+						media_log.notice("video_encoder: Using matching output format '%s' (video_codec=%d, audio_codec=%d)", oformat->name, static_cast<int>(oformat->video_codec), static_cast<int>(oformat->audio_codec));
+						return oformat;
 					}
 				}
 
 				// Fallback to first found format
-				if (!oformats.empty() && oformats.front())
+				if (const AVOutputFormat* oformat = oformats.empty() ? nullptr : oformats.front())
 				{
-					const AVOutputFormat* oformat = oformats.front();
-					media_log.notice("video_encoder: Falling back to output format '%s'", oformat->name);
-					return oformat->name;
+					media_log.notice("video_encoder: Using suboptimal output format '%s' (video_codec=%d, audio_codec=%d)", oformat->name, static_cast<int>(oformat->video_codec), static_cast<int>(oformat->audio_codec));
+					return oformat;
 				}
 
 				return nullptr;
 			};
 
-			AVCodecID used_codec = static_cast<AVCodecID>(m_video_codec_id);
+			const AVCodecID video_codec = static_cast<AVCodecID>(m_video_codec_id);
+			const AVCodecID audio_codec = static_cast<AVCodecID>(m_audio_codec_id);
+			const AVOutputFormat* out_format = find_format(video_codec, audio_codec);
 
-			// Find specified codec first
-			if (const AVCodec* encoder = avcodec_find_encoder(used_codec); !!encoder)
+			if (out_format)
 			{
-				media_log.success("video_encoder: Found requested video_codec %d = %s", static_cast<int>(used_codec), encoder->name);
-				av_output_format = find_format(encoder);
-
-				if (av_output_format)
-				{
-					media_log.success("video_encoder: Found requested output format '%s'", av_output_format);
-				}
-				else
-				{
-					media_log.error("video_encoder: Could not find a format for the requested video_codec %d = %s", static_cast<int>(used_codec), encoder->name);
-				}
+				media_log.success("video_encoder: Found requested output format '%s'", out_format->name);
 			}
 			else
 			{
-				media_log.error("video_encoder: Could not find requested video_codec %d", static_cast<int>(used_codec));
-			}
+				media_log.error("video_encoder: Could not find a format for the requested video_codec %d and audio_codec %d", m_video_codec_id, m_audio_codec_id);
 
-			// Fallback to some other codec
-			if (!av_output_format)
-			{
-				void* opaque = nullptr;
-				for (const AVCodec* codec = av_codec_iterate(&opaque); !!codec; codec = av_codec_iterate(&opaque))
+				// Fallback to some other codec
+				for (const AVCodec* video_codec : video_codecs)
 				{
-					if (av_codec_is_encoder(codec))
+					for (const AVCodec* audio_codec : audio_codecs)
 					{
-						media_log.notice("video_encoder: Found video_codec %d = %s", static_cast<int>(codec->id), codec->name);
-						av_output_format = find_format(codec);
+						out_format = find_format(video_codec->id, audio_codec->id);
 
-						if (av_output_format)
+						if (out_format)
 						{
-							media_log.success("video_encoder: Found fallback output format '%s'", av_output_format);
+							media_log.success("video_encoder: Found fallback output format '%s'", out_format->name);
 							break;
 						}
 					}
+
+					if (out_format)
+					{
+						break;
+					}
 				}
 			}
 
-			if (!av_output_format)
+			if (!out_format)
 			{
 				media_log.error("video_encoder: Could not find any output format");
 				has_error = true;
 				return;
 			}
 
-			if (int err = avformat_alloc_output_context2(&av.format, nullptr, av_output_format, path.c_str()); err < 0)
+			if (int err = avformat_alloc_output_context2(&av.format_context, out_format, nullptr, nullptr); err < 0)
 			{
-				media_log.error("video_encoder: avformat_alloc_output_context2 failed. Error: %d='%s'", err, av_error_to_string(err));
+				media_log.error("video_encoder: avformat_alloc_output_context2 for '%s' failed. Error: %d='%s'", out_format->name, err, av_error_to_string(err));
 				has_error = true;
 				return;
 			}
 
-			if (!av.format)
+			if (!av.format_context)
 			{
 				media_log.error("video_encoder: avformat_alloc_output_context2 failed");
 				has_error = true;
 				return;
 			}
 
-			if (!(av.codec = avcodec_find_encoder(av.format->oformat->video_codec)))
+			const auto create_context = [this, &av](bool is_video) -> bool
 			{
-				media_log.error("video_encoder: avcodec_find_encoder failed");
-				has_error = true;
-				return;
-			}
+				const std::string type = is_video ? "video" : "audio";
+				scoped_av::ctx& ctx = is_video ? av.video : av.audio;
+
+				if (is_video)
+				{
+					if (!(ctx.codec = avcodec_find_encoder(av.format_context->oformat->video_codec)))
+					{
+						media_log.error("video_encoder: avcodec_find_encoder for video failed. video_codec=%d", static_cast<int>(av.format_context->oformat->video_codec));
+						return false;
+					}
+				}
+				else
+				{
+					if (!(ctx.codec = avcodec_find_encoder(av.format_context->oformat->audio_codec)))
+					{
+						media_log.error("video_encoder: avcodec_find_encoder for audio failed. audio_codec=%d", static_cast<int>(av.format_context->oformat->audio_codec));
+						return false;
+					}
+				}
+
+				if (!(ctx.stream = avformat_new_stream(av.format_context, nullptr)))
+				{
+					media_log.error("video_encoder: avformat_new_stream for %s failed", type);
+					return false;
+				}
+
+				ctx.stream->id = is_video ? 0 : 1;
+
+				if (!(ctx.context = avcodec_alloc_context3(ctx.codec)))
+				{
+					media_log.error("video_encoder: avcodec_alloc_context3 for %s failed", type);
+					return false;
+				}
+
+				if (av.format_context->oformat->flags & AVFMT_GLOBALHEADER)
+				{
+					ctx.context->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+				}
+
+				return true;
+			};
 
-			if (!(av.stream = avformat_new_stream(av.format, nullptr)))
+			if (!create_context(true))
 			{
-				media_log.error("video_encoder: avformat_new_stream failed");
 				has_error = true;
 				return;
 			}
 
-			av.stream->id = static_cast<int>(av.format->nb_streams - 1);
-
-			if (!(av.context = avcodec_alloc_context3(av.codec)))
+			if (!create_context(false))
 			{
-				media_log.error("video_encoder: avcodec_alloc_context3 failed");
 				has_error = true;
 				return;
 			}
 
-			media_log.notice("video_encoder: using video_codec = %d", static_cast<int>(av.format->oformat->video_codec));
+			media_log.notice("video_encoder: using audio_codec = %d", static_cast<int>(av.format_context->oformat->audio_codec));
+			media_log.notice("video_encoder: using sample_rate = %d", m_sample_rate);
+			media_log.notice("video_encoder: using audio_bitrate = %d", m_audio_bitrate_bps);
+			media_log.notice("video_encoder: using audio channels = %d", m_channels);
+			media_log.notice("video_encoder: using video_codec = %d", static_cast<int>(av.format_context->oformat->video_codec));
 			media_log.notice("video_encoder: using video_bitrate = %d", m_video_bitrate_bps);
 			media_log.notice("video_encoder: using out width = %d", m_out_format.width);
 			media_log.notice("video_encoder: using out height = %d", m_out_format.height);
@@ -843,67 +1047,186 @@ namespace utils
 			media_log.notice("video_encoder: using gop_size = %d", m_gop_size);
 			media_log.notice("video_encoder: using max_b_frames = %d", m_max_b_frames);
 
-			av.context->codec_id = av.format->oformat->video_codec;
-			av.context->bit_rate = m_video_bitrate_bps;
-			av.context->width = static_cast<int>(m_out_format.width);
-			av.context->height = static_cast<int>(m_out_format.height);
-			av.context->time_base = {.num = 1, .den = static_cast<int>(m_framerate)};
-			av.context->framerate = {.num = static_cast<int>(m_framerate), .den = 1};
-			av.context->pix_fmt = out_format;
-			av.context->gop_size = m_gop_size;
-			av.context->max_b_frames = m_max_b_frames;
-
-			if (av.format->oformat->flags & AVFMT_GLOBALHEADER)
+			// select audio parameters supported by the encoder
+			if (av.audio.context)
 			{
-				av.context->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
-			}
+				if (const AVChannelLayout* ch_layout = select_channel_layout(av.audio.codec, m_channels))
+				{
+					media_log.notice("video_encoder: found channel layout '%s' with %d channels", channel_layout_name(*ch_layout), ch_layout->nb_channels);
 
-			if (int err = avcodec_open2(av.context, av.codec, nullptr); err != 0)
-			{
-				media_log.error("video_encoder: avcodec_open2 failed. Error: %d='%s'", err, av_error_to_string(err));
-				has_error = true;
-				return;
+					if (int err = av_channel_layout_copy(&av.audio.context->ch_layout, ch_layout); err != 0)
+					{
+						media_log.error("video_encoder: av_channel_layout_copy failed. Error: %d='%s'", err, av_error_to_string(err));
+						has_error = true;
+						return;
+					}
+				}
+				else
+				{
+					media_log.notice("video_encoder: select_channel_layout returned nullptr, trying with own layout...");
+
+					const AVChannelLayout new_ch_layout = get_preferred_channel_layout(m_channels);
+
+					if (memcmp(&new_ch_layout, &empty_ch_layout, sizeof(AVChannelLayout)) == 0)
+					{
+						media_log.error("video_encoder: unsupported audio channel count: %d", m_channels);
+						has_error = true;
+						return;
+					}
+
+					if (int err = av_channel_layout_copy(&av.audio.context->ch_layout, &new_ch_layout); err != 0)
+					{
+						media_log.error("video_encoder: av_channel_layout_copy failed. Error: %d='%s'", err, av_error_to_string(err));
+						has_error = true;
+						return;
+					}
+				}
+
+				m_sample_rate = select_sample_rate(av.audio.codec);
+
+				av.audio.context->codec_id = av.format_context->oformat->audio_codec;
+				av.audio.context->codec_type = AVMEDIA_TYPE_AUDIO;
+				av.audio.context->bit_rate = m_audio_bitrate_bps;
+				av.audio.context->sample_rate = m_sample_rate;
+				av.audio.context->time_base = {.num = 1, .den = av.audio.context->sample_rate};
+				av.audio.context->sample_fmt = AV_SAMPLE_FMT_FLTP; // AV_SAMPLE_FMT_FLT is not supported in regular AC3
+				av.audio.stream->time_base = av.audio.context->time_base;
+
+				// check that the encoder supports the format
+				if (!check_sample_fmt(av.audio.codec, av.audio.context->sample_fmt))
+				{
+					media_log.error("video_encoder: Audio encoder does not support sample format %s", av_get_sample_fmt_name(av.audio.context->sample_fmt));
+					has_error = true;
+					return;
+				}
+
+				if (int err = avcodec_open2(av.audio.context, av.audio.codec, nullptr); err != 0)
+				{
+					media_log.error("video_encoder: avcodec_open2 for audio failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
+
+				if (!(av.audio.packet = av_packet_alloc()))
+				{
+					media_log.error("video_encoder: av_packet_alloc for audio packet failed");
+					has_error = true;
+					return;
+				}
+
+				if (!(av.audio.frame = av_frame_alloc()))
+				{
+					media_log.error("video_encoder: av_frame_alloc for audio frame failed");
+					has_error = true;
+					return;
+				}
+
+				av.audio.frame->format = AV_SAMPLE_FMT_FLTP;
+				av.audio.frame->nb_samples = av.audio.context->frame_size;
+
+				if (int err = av_channel_layout_copy(&av.audio.frame->ch_layout, &av.audio.context->ch_layout); err < 0)
+				{
+					media_log.error("video_encoder: av_channel_layout_copy for audio frame failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
+
+				if (int err = av_frame_get_buffer(av.audio.frame, 0); err < 0)
+				{
+					media_log.error("video_encoder: av_frame_get_buffer for audio frame failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
+
+				if (int err = avcodec_parameters_from_context(av.audio.stream->codecpar, av.audio.context); err < 0)
+				{
+					media_log.error("video_encoder: avcodec_parameters_from_context for audio failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
+
+				// Log channel layout
+				media_log.notice("video_encoder: av_channel_layout='%s'", channel_layout_name(av.audio.frame->ch_layout));
 			}
 
-			if (!(av.frame = av_frame_alloc()))
+			// select video parameters supported by the encoder
+			if (av.video.context)
 			{
-				media_log.error("video_encoder: av_frame_alloc failed");
-				has_error = true;
-				return;
-			}
+				av.video.context->codec_id = av.format_context->oformat->video_codec;
+				av.video.context->codec_type = AVMEDIA_TYPE_VIDEO;
+				av.video.context->frame_number = 0;
+				av.video.context->bit_rate = m_video_bitrate_bps;
+				av.video.context->width = static_cast<int>(m_out_format.width);
+				av.video.context->height = static_cast<int>(m_out_format.height);
+				av.video.context->time_base = {.num = 1, .den = static_cast<int>(m_framerate)};
+				av.video.context->framerate = {.num = static_cast<int>(m_framerate), .den = 1};
+				av.video.context->pix_fmt = out_pix_format;
+				av.video.context->gop_size = m_gop_size;
+				av.video.context->max_b_frames = m_max_b_frames;
+				av.video.stream->time_base = av.video.context->time_base;
+
+				if (int err = avcodec_open2(av.video.context, av.video.codec, nullptr); err != 0)
+				{
+					media_log.error("video_encoder: avcodec_open2 for video failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
 
-			av.frame->format = av.context->pix_fmt;
-			av.frame->width = av.context->width;
-			av.frame->height = av.context->height;
+				if (!(av.video.packet = av_packet_alloc()))
+				{
+					media_log.error("video_encoder: av_packet_alloc for video packet failed");
+					has_error = true;
+					return;
+				}
 
-			if (int err = av_frame_get_buffer(av.frame, 32); err < 0)
-			{
-				media_log.error("video_encoder: av_frame_get_buffer failed. Error: %d='%s'", err, av_error_to_string(err));
-				has_error = true;
-				return;
+				if (!(av.video.frame = av_frame_alloc()))
+				{
+					media_log.error("video_encoder: av_frame_alloc for video frame failed");
+					has_error = true;
+					return;
+				}
+
+				av.video.frame->format = av.video.context->pix_fmt;
+				av.video.frame->width = av.video.context->width;
+				av.video.frame->height = av.video.context->height;
+
+				if (int err = av_frame_get_buffer(av.video.frame, 0); err < 0)
+				{
+					media_log.error("video_encoder: av_frame_get_buffer for video frame failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
+
+				if (int err = avcodec_parameters_from_context(av.video.stream->codecpar, av.video.context); err < 0)
+				{
+					media_log.error("video_encoder: avcodec_parameters_from_context for video failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
 			}
 
-			if (int err = avcodec_parameters_from_context(av.stream->codecpar, av.context); err < 0)
+			media_log.notice("video_encoder: av_dump_format");
+			for (u32 i = 0; i < av.format_context->nb_streams; i++)
 			{
-				media_log.error("video_encoder: avcodec_parameters_from_context failed. Error: %d='%s'", err, av_error_to_string(err));
-				has_error = true;
-				return;
+				av_dump_format(av.format_context, i, path.c_str(), 1);
 			}
 
-			av_dump_format(av.format, 0, path.c_str(), 1);
-
-			if (int err = avio_open(&av.format->pb, path.c_str(), AVIO_FLAG_WRITE); err != 0)
+			// Open the output file unless the muxer does its own I/O (AVFMT_NOFILE is an AVOutputFormat flag, not an AVFormatContext flag)
+			if (!(av.format_context->oformat->flags & AVFMT_NOFILE))
 			{
-				media_log.error("video_encoder: avio_open failed. Error: %d='%s'", err, av_error_to_string(err));
-				has_error = true;
-				return;
+				if (int err = avio_open(&av.format_context->pb, path.c_str(), AVIO_FLAG_WRITE); err != 0)
+				{
+					media_log.error("video_encoder: avio_open failed. Error: %d='%s'", err, av_error_to_string(err));
+					has_error = true;
+					return;
+				}
 			}
 
-			if (int err = avformat_write_header(av.format, nullptr); err < 0)
+			if (int err = avformat_write_header(av.format_context, nullptr); err < 0)
 			{
 				media_log.error("video_encoder: avformat_write_header failed. Error: %d='%s'", err, av_error_to_string(err));
 
-				if (int err = avio_close(av.format->pb); err != 0)
+				if (int err = avio_close(av.format_context->pb); err != 0)
 				{
 					media_log.error("video_encoder: avio_close failed. Error: %d='%s'", err, av_error_to_string(err));
 				}
@@ -912,21 +1235,11 @@ namespace utils
 				return;
 			}
 
-			const auto flush = [&]()
+			const auto flush = [&](scoped_av::ctx& ctx)
 			{
-				while ((thread_ctrl::state() != thread_state::aborting || m_flush) && !has_error)
+				while ((thread_ctrl::state() != thread_state::aborting || m_flush) && !has_error && ctx.context)
 				{
-					AVPacket* packet = av_packet_alloc();
-					std::unique_ptr<AVPacket, decltype([](AVPacket* p){ if (p) av_packet_unref(p); })> packet_(packet);
-
-					if (!packet)
-					{
-						media_log.error("video_encoder: av_packet_alloc failed");
-						has_error = true;
-						return;
-					}
-
-					if (int err = avcodec_receive_packet(av.context, packet); err < 0)
+					if (int err = avcodec_receive_packet(ctx.context, ctx.packet); err < 0)
 					{
 						if (err == AVERROR(EAGAIN) || err == averror_eof)
 							break;
@@ -936,133 +1249,363 @@ namespace utils
 						return;
 					}
 
-					av_packet_rescale_ts(packet, av.context->time_base, av.stream->time_base);
-					packet->stream_index = av.stream->index;
+					av_packet_rescale_ts(ctx.packet, ctx.context->time_base, ctx.stream->time_base);
+					ctx.packet->stream_index = ctx.stream->index;
 
-					if (int err = av_interleaved_write_frame(av.format, packet); err < 0)
+					if (int err = av_interleaved_write_frame(av.format_context, ctx.packet); err < 0)
 					{
-						media_log.error("video_encoder: av_interleaved_write_frame failed. Error: %d='%s'", err, av_error_to_string(err));
+						media_log.error("video_encoder: av_interleaved_write_frame failed. Error: %d='%s'", err, av_error_to_string(err));
 						has_error = true;
 						return;
 					}
 				}
 			};
 
-			s64 last_pts = -1;
+			u32 audio_sample_remainder = 0;
+			s64 last_audio_pts = -1;
+			s64 last_audio_frame_pts = 0;
+			s64 last_video_pts = -1;
+
+			// Allocate audio buffer for our audio frame
+			std::vector<u8> audio_frame;
+			u32 audio_frame_sample_count = 0;
+			const bool sample_fmt_is_planar = av.audio.context && av_sample_fmt_is_planar(av.audio.context->sample_fmt) != 0;
+			const int sample_fmt_bytes = av.audio.context ? av_get_bytes_per_sample(av.audio.context->sample_fmt) : 0;
+			ensure(sample_fmt_bytes == sizeof(f32)); // We only support FLT or FLTP for now
+
+			if (av.audio.frame)
+			{
+				audio_frame.resize(av.audio.frame->nb_samples * av.audio.frame->ch_layout.nb_channels * sizeof(f32));
+				last_audio_frame_pts -= av.audio.frame->nb_samples;
+			}
+
+			encoder_sample last_samples;
+			u32 leftover_sample_count = 0;
 
 			while ((thread_ctrl::state() != thread_state::aborting || m_flush) && !has_error)
 			{
+				// Fetch video frame
 				encoder_frame frame_data;
+				bool got_frame = false;
 				{
 					m_mtx.lock();
 
 					if (m_frames_to_encode.empty())
 					{
 						m_mtx.unlock();
+					}
+					else
+					{
+						frame_data = std::move(m_frames_to_encode.front());
+						m_frames_to_encode.pop_front();
+						m_mtx.unlock();
+
+						got_frame = true;
+
+						// Calculate presentation timestamp.
+						const s64 pts = get_pts(frame_data.timestamp_ms);
 
-						if (m_flush)
+						// We need to skip this frame if it has the same timestamp.
+						if (pts <= last_video_pts)
 						{
-							m_flush = false;
+							media_log.trace("video_encoder: skipping frame. last_pts=%d, pts=%d, timestamp_ms=%d", last_video_pts, pts, frame_data.timestamp_ms);
+						}
+						else if (av.video.context)
+						{
+							media_log.trace("video_encoder: adding new frame. timestamp_ms=%d", frame_data.timestamp_ms);
 
-							if (!m_paused)
+							if (int err = av_frame_make_writable(av.video.frame); err < 0)
 							{
-								// We only stop the thread after a flush if we are not paused
+								media_log.error("video_encoder: av_frame_make_writable failed. Error: %d='%s'", err, av_error_to_string(err));
+								has_error = true;
 								break;
 							}
-						}
 
-						// We only actually pause after we process all frames
-						const u64 sleeptime = m_paused ? 10000 : 1;
-						thread_ctrl::wait_for(sleeptime);
-						continue;
-					}
+							u8* in_data[4]{};
+							int in_line[4]{};
 
-					frame_data = std::move(m_frames_to_encode.front());
-					m_frames_to_encode.pop_front();
+							const AVPixelFormat in_format = static_cast<AVPixelFormat>(frame_data.av_pixel_format);
 
-					m_mtx.unlock();
+							if (int ret = av_image_fill_linesizes(in_line, in_format, frame_data.width); ret < 0)
+							{
+								fmt::throw_exception("video_encoder: av_image_fill_linesizes failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
+							}
 
-					media_log.trace("video_encoder: adding new frame. timestamp=%d", frame_data.timestamp_ms);
-				}
+							if (int ret = av_image_fill_pointers(in_data, in_format, frame_data.height, frame_data.data.data(), in_line); ret < 0)
+							{
+								fmt::throw_exception("video_encoder: av_image_fill_pointers failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
+							}
 
-				// Calculate presentation timestamp.
-				const s64 pts = get_pts(frame_data.timestamp_ms);
+							// Update the context in case the frame format has changed
+							av.sws = sws_getCachedContext(av.sws, frame_data.width, frame_data.height, in_format,
+							                              av.video.context->width, av.video.context->height, out_pix_format, SWS_BICUBIC, nullptr, nullptr, nullptr);
+							if (!av.sws)
+							{
+								media_log.error("video_encoder: sws_getCachedContext failed");
+								has_error = true;
+								break;
+							}
 
-				// We need to skip this frame if it has the same timestamp.
-				if (pts <= last_pts)
-				{
-					media_log.notice("video_encoder: skipping frame. last_pts=%d, pts=%d", last_pts, pts);
-					continue;
-				}
+							if (int err = sws_scale(av.sws, in_data, in_line, 0, frame_data.height, av.video.frame->data, av.video.frame->linesize); err < 0)
+							{
+								media_log.error("video_encoder: sws_scale failed. Error: %d='%s'", err, av_error_to_string(err));
+								has_error = true;
+								break;
+							}
 
-				if (int err = av_frame_make_writable(av.frame); err < 0)
-				{
-					media_log.error("video_encoder: av_frame_make_writable failed. Error: %d='%s'", err, av_error_to_string(err));
-					has_error = true;
-					break;
-				}
+							av.video.frame->pts = pts;
 
-				u8* in_data[4]{};
-				int in_line[4]{};
+							if (int err = avcodec_send_frame(av.video.context, av.video.frame); err < 0)
+							{
+								media_log.error("video_encoder: avcodec_send_frame for video failed. Error: %d='%s'", err, av_error_to_string(err));
+								has_error = true;
+								break;
+							}
 
-				const AVPixelFormat in_format = static_cast<AVPixelFormat>(frame_data.av_pixel_format);
+							flush(av.video);
 
-				if (int ret = av_image_fill_linesizes(in_line, in_format, frame_data.width); ret < 0)
-				{
-					fmt::throw_exception("video_encoder: av_image_fill_linesizes failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
+							last_video_pts = av.video.frame->pts;
+							m_last_video_pts = last_video_pts;
+						}
+					}
 				}
 
-				if (int ret = av_image_fill_pointers(in_data, in_format, frame_data.height, frame_data.data.data(), in_line); ret < 0)
+				// Fetch audio sample
+				encoder_sample sample_data;
+				bool got_sample = false;
 				{
-					fmt::throw_exception("video_encoder: av_image_fill_pointers failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
-				}
+					m_audio_mtx.lock();
 
-				// Update the context in case the frame format has changed
-				av.sws = sws_getCachedContext(av.sws, frame_data.width, frame_data.height, in_format,
-				                              av.context->width, av.context->height, out_format, SWS_BICUBIC, nullptr, nullptr, nullptr);
-				if (!av.sws)
-				{
-					media_log.error("video_encoder: sws_getCachedContext failed");
-					has_error = true;
-					break;
-				}
+					if (m_samples_to_encode.empty())
+					{
+						m_audio_mtx.unlock();
+					}
+					else
+					{
+						sample_data = std::move(m_samples_to_encode.front());
+						m_samples_to_encode.pop_front();
+						m_audio_mtx.unlock();
 
-				if (int err = sws_scale(av.sws, in_data, in_line, 0, frame_data.height, av.frame->data, av.frame->linesize); err < 0)
-				{
-					media_log.error("video_encoder: sws_scale failed. Error: %d='%s'", err, av_error_to_string(err));
-					has_error = true;
-					break;
-				}
+						got_sample = true;
 
-				av.frame->pts = pts;
+						if (sample_data.channels != av.audio.frame->ch_layout.nb_channels)
+						{
+							fmt::throw_exception("video_encoder: Audio sample channel count %d does not match frame channel count %d", sample_data.channels, av.audio.frame->ch_layout.nb_channels);
+						}
 
-				if (int err = avcodec_send_frame(av.context, av.frame); err < 0)
-				{
-					media_log.error("video_encoder: avcodec_send_frame failed. Error: %d='%s'", err, av_error_to_string(err));
-					has_error = true;
-					break;
+						// Calculate presentation timestamp.
+						const s64 pts = get_audio_pts(sample_data.timestamp_us);
+
+						// We need to skip this frame if it has the same timestamp.
+						if (pts <= last_audio_pts)
+						{
+							media_log.trace("video_encoder: skipping sample. last_pts=%d, pts=%d, timestamp_us=%d", last_audio_pts, pts, sample_data.timestamp_us);
+						}
+						else if (av.audio.context)
+						{
+							media_log.trace("video_encoder: adding new sample. timestamp_us=%d", sample_data.timestamp_us);
+
+							static constexpr bool swap_endianness = false;
+
+							const auto send_frame = [&]()
+							{
+								if (audio_frame_sample_count < static_cast<u32>(av.audio.frame->nb_samples))
+								{
+									return;
+								}
+
+								audio_frame_sample_count = 0;
+
+								if (int err = av_frame_make_writable(av.audio.frame); err < 0)
+								{
+									media_log.error("video_encoder: av_frame_make_writable failed. Error: %d='%s'", err, av_error_to_string(err));
+									has_error = true;
+									return;
+								}
+
+								// NOTE: The ffmpeg channel layout should match our downmix channel layout
+								if (sample_fmt_is_planar)
+								{
+									const int channels = av.audio.frame->ch_layout.nb_channels;
+									const int samples = av.audio.frame->nb_samples;
+
+									for (int ch = 0; ch < channels; ch++)
+									{
+										f32* dst = reinterpret_cast<f32*>(av.audio.frame->data[ch]);
+
+										for (int sample = 0; sample < samples; sample++)
+										{
+											dst[sample] = *reinterpret_cast<f32*>(&audio_frame[(sample * channels + ch) * sizeof(f32)]);
+										}
+									}
+								}
+								else
+								{
+									std::memcpy(av.audio.frame->data[0], audio_frame.data(), audio_frame.size());
+								}
+
+								av.audio.frame->pts = last_audio_frame_pts + av.audio.frame->nb_samples;
+
+								if (int err = avcodec_send_frame(av.audio.context, av.audio.frame); err < 0)
+								{
+									media_log.error("video_encoder: avcodec_send_frame failed: %d='%s'", err, av_error_to_string(err));
+									has_error = true;
+									return;
+								}
+
+								flush(av.audio);
+
+								last_audio_frame_pts = av.audio.frame->pts;
+							};
+
+							const auto add_encoder_sample = [&](bool add_new_sample, u32 silence_to_add = 0)
+							{
+								const auto update_last_pts = [&](u32 samples_to_add)
+								{
+									const u32 sample_count = audio_sample_remainder + samples_to_add;
+									const u32 pts_to_add = sample_count / m_samples_per_block;
+									audio_sample_remainder = sample_count % m_samples_per_block;
+									last_audio_pts += pts_to_add;
+								};
+
+								// Copy as many old samples to our audio frame as possible
+								if (leftover_sample_count > 0)
+								{
+									const u32 samples_to_add = std::min(leftover_sample_count, av.audio.frame->nb_samples - audio_frame_sample_count);
+
+									if (samples_to_add > 0)
+									{
+										const u8* src = &last_samples.data[(last_samples.sample_count - leftover_sample_count) * last_samples.channels * sizeof(f32)];
+										u8* dst = &audio_frame[audio_frame_sample_count * last_samples.channels * sizeof(f32)];
+										copy_samples<f32>(src, dst, samples_to_add * last_samples.channels, swap_endianness);
+										audio_frame_sample_count += samples_to_add;
+										leftover_sample_count -= samples_to_add;
+										update_last_pts(samples_to_add);
+									}
+
+									if (samples_to_add < leftover_sample_count)
+									{
+										media_log.error("video_encoder: audio frame buffer is already filled entirely by last sample package...");
+									}
+								}
+								else if (silence_to_add > 0)
+								{
+									const u32 samples_to_add = std::min<s32>(silence_to_add, av.audio.frame->nb_samples - audio_frame_sample_count);
+
+									if (samples_to_add > 0)
+									{
+										u8* dst = &audio_frame[audio_frame_sample_count * av.audio.frame->ch_layout.nb_channels * sizeof(f32)];
+										std::memset(dst, 0, samples_to_add * sample_data.channels * sizeof(f32));
+										audio_frame_sample_count += samples_to_add;
+										update_last_pts(samples_to_add);
+									}
+								}
+								else if (add_new_sample)
+								{
+									// Copy as many new samples to our audio frame as possible
+									const u32 samples_to_add = std::min<s32>(sample_data.sample_count, av.audio.frame->nb_samples - audio_frame_sample_count);
+
+									if (samples_to_add > 0)
+									{
+										const u8* src = sample_data.data.data();
+										u8* dst = &audio_frame[audio_frame_sample_count * sample_data.channels * sizeof(f32)];
+										copy_samples<f32>(src, dst, samples_to_add * sample_data.channels, swap_endianness);
+										audio_frame_sample_count += samples_to_add;
+										update_last_pts(samples_to_add);
+									}
+
+									if (samples_to_add < sample_data.sample_count)
+									{
+										// Save this sample package for the next loop if it wasn't fully used.
+										leftover_sample_count = sample_data.sample_count - samples_to_add;
+									}
+									else
+									{
+										// Mark this sample package as fully used.
+										leftover_sample_count = 0;
+									}
+
+									last_samples = std::move(sample_data);
+								}
+
+								send_frame();
+							};
+
+							for (u32 sample = 0; !has_error;)
+							{
+								if (leftover_sample_count > 0)
+								{
+									// Add leftover samples
+									add_encoder_sample(false);
+								}
+								else if (pts > (last_audio_pts + 1))
+								{
+									// Add silence to fill the gap
+									const u32 silence_to_add = static_cast<u32>(pts - (last_audio_pts + 1));
+									add_encoder_sample(false, silence_to_add);
+								}
+								else if (sample == 0)
+								{
+									// Add new samples
+									add_encoder_sample(true);
+									sample++;
+								}
+								else
+								{
+									break;
+								}
+							}
+
+							m_last_audio_pts = last_audio_pts;
+						}
+					}
 				}
 
-				flush();
+				if (!got_frame && !got_sample)
+				{
+					if (m_flush)
+					{
+						m_flush = false;
+
+						if (!m_paused)
+						{
+							// We only stop the thread after a flush if we are not paused
+							break;
+						}
+					}
 
-				last_pts = av.frame->pts;
+					// We only actually pause after we process all frames
+					const u64 sleeptime_us = m_paused ? 10000 : 1;
+					thread_ctrl::wait_for(sleeptime_us);
+					continue;
+				}
+			}
 
-				m_last_pts = last_pts;
+			if (av.video.context)
+			{
+				if (int err = avcodec_send_frame(av.video.context, nullptr); err != 0)
+				{
+					media_log.error("video_encoder: final avcodec_send_frame failed. Error: %d='%s'", err, av_error_to_string(err));
+				}
 			}
 
-			if (int err = avcodec_send_frame(av.context, nullptr); err != 0)
+			if (av.audio.context)
 			{
-				media_log.error("video_encoder: final avcodec_send_frame failed. Error: %d='%s'", err, av_error_to_string(err));
+				if (int err = avcodec_send_frame(av.audio.context, nullptr); err != 0)
+				{
+					media_log.error("video_encoder: final avcodec_send_frame failed. Error: %d='%s'", err, av_error_to_string(err));
+				}
 			}
 
-			flush();
+			flush(av.video);
+			flush(av.audio);
 
-			if (int err = av_write_trailer(av.format); err != 0)
+			if (int err = av_write_trailer(av.format_context); err != 0)
 			{
 				media_log.error("video_encoder: av_write_trailer failed. Error: %d='%s'", err, av_error_to_string(err));
 			}
 
-			if (int err = avio_close(av.format->pb); err != 0)
+			if (int err = avio_close(av.format_context->pb); err != 0)
 			{
 				media_log.error("video_encoder: avio_close failed. Error: %d='%s'", err, av_error_to_string(err));
 			}
diff --git a/rpcs3/util/media_utils.h b/rpcs3/util/media_utils.h
index 2718a8061765..9666e53d8644 100644
--- a/rpcs3/util/media_utils.h
+++ b/rpcs3/util/media_utils.h
@@ -73,10 +73,9 @@ namespace utils
 		u32 set_next_index(bool next);
 
 		shared_mutex m_mtx;
-		const s32 sample_rate = 48000;
+		static constexpr s32 sample_rate = 48000;
 		std::vector<u8> data;
 		atomic_t<u64> m_size = 0;
-		atomic_t<u64> duration_ms = 0;
 		atomic_t<u32> track_fully_decoded{0};
 		atomic_t<u32> track_fully_consumed{0};
 		atomic_t<bool> has_error{false};
@@ -88,7 +87,7 @@ namespace utils
 		std::unique_ptr<named_thread<std::function<void()>>> m_thread;
 	};
 
-	class video_encoder : public utils::image_sink
+	class video_encoder : public utils::video_sink
 	{
 	public:
 		video_encoder();
@@ -108,7 +107,7 @@ namespace utils
 		};
 
 		std::string path() const;
-		s64 last_pts() const;
+		s64 last_video_pts() const;
 
 		void set_path(const std::string& path);
 		void set_framerate(u32 framerate);
@@ -118,32 +117,33 @@ namespace utils
 		void set_max_b_frames(s32 max_b_frames);
 		void set_gop_size(s32 gop_size);
 		void set_sample_rate(u32 sample_rate);
+		void set_audio_channels(u32 channels);
 		void set_audio_bitrate(u32 bitrate);
 		void set_audio_codec(s32 codec_id);
-		void add_frame(std::vector<u8>& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms) override;
-		void pause(bool flush = true);
+		void pause(bool flush = true) override;
 		void stop(bool flush = true) override;
+		void resume() override;
 		void encode();
 
 	private:
 		std::string m_path;
-		s64 m_last_pts = 0;
+		s64 m_last_audio_pts = 0;
+		s64 m_last_video_pts = 0;
 
 		// Thread control
 		std::unique_ptr<named_thread<std::function<void()>>> m_thread;
 		atomic_t<bool> m_running = false;
-		atomic_t<bool> m_paused = false;
 
 		// Video parameters
 		u32 m_video_bitrate_bps = 0;
-		s32 m_video_codec_id = 12; // AV_CODEC_ID_MPEG4;
+		s32 m_video_codec_id = 12; // AV_CODEC_ID_MPEG4
 		s32 m_max_b_frames = 2;
 		s32 m_gop_size = 12;
 		frame_format m_out_format{};
 
 		// Audio parameters
-		u32 m_sample_rate = 48000;
-		u32 m_audio_bitrate_bps = 96000;
+		u32 m_channels = 2;
+		u32 m_audio_bitrate_bps = 320000;
 		s32 m_audio_codec_id = 86018; // AV_CODEC_ID_AAC
 	};
 }
diff --git a/rpcs3/util/video_provider.cpp b/rpcs3/util/video_provider.cpp
index d919137733b6..a5888daddef2 100644
--- a/rpcs3/util/video_provider.cpp
+++ b/rpcs3/util/video_provider.cpp
@@ -34,37 +34,37 @@ namespace utils
 		g_recording_mode = recording_mode::stopped;
 	}
 
-	bool video_provider::set_image_sink(std::shared_ptr<image_sink> sink, recording_mode type)
+	bool video_provider::set_video_sink(std::shared_ptr<video_sink> sink, recording_mode type)
 	{
-		media_log.notice("video_provider: setting new image sink. sink=%d, type=%s", !!sink, type);
+		media_log.notice("video_provider: setting new video sink. sink=%d, type=%s", !!sink, type);
 
 		if (type == recording_mode::stopped)
 		{
 			// Prevent misuse. type is supposed to be a valid state.
-			media_log.error("video_provider: cannot set image sink with type %s", type);
+			media_log.error("video_provider: cannot set video sink with type %s", type);
 			return false;
 		}
 
 		std::lock_guard lock(m_mutex);
 
-		if (m_image_sink)
+		if (m_video_sink)
 		{
 			// cell has preference
 			if (m_type == recording_mode::cell && m_type != type)
 			{
-				media_log.warning("video_provider: cannot set image sink with type %s if type %s is active", type, m_type);
+				media_log.warning("video_provider: cannot set video sink with type %s if type %s is active", type, m_type);
 				return false;
 			}
 
-			if (m_type != type || m_image_sink != sink)
+			if (m_type != type || m_video_sink != sink)
 			{
-				media_log.warning("video_provider: stopping current image sink of type %s", m_type);
-				m_image_sink->stop();
+				media_log.warning("video_provider: stopping current video sink of type %s", m_type);
+				m_video_sink->stop();
 			}
 		}
 
 		m_type = sink ? type : recording_mode::stopped;
-		m_image_sink = sink;
+		m_video_sink = sink;
 
 		if (m_type == recording_mode::stopped)
 		{
@@ -74,64 +74,132 @@ namespace utils
 		return true;
 	}
 
-	void video_provider::set_pause_time(usz pause_time_ms)
+	void video_provider::set_pause_time_us(usz pause_time_us) // Stores the pause time in microseconds; the timestamp calculations in this file subtract it from the elapsed time.
 	{
 		std::lock_guard lock(m_mutex);
-		m_pause_time_ms = pause_time_ms;
+		m_pause_time_us = pause_time_us; // Written while holding m_mutex
+	}
+
+	recording_mode video_provider::check_mode() // Updates the recording state and returns the resulting g_recording_mode. Takes no lock itself; present_frame() and present_samples() call it while holding m_mutex.
+	{
+		if (!m_video_sink || m_video_sink->has_error) // Missing or failed sink: stop the recording and queue the "aborted" message
+		{
+			g_recording_mode = recording_mode::stopped;
+			rsx::overlays::queue_message(localized_string_id::RECORDING_ABORTED);
+		}
+
+		if (g_recording_mode == recording_mode::stopped)
+		{
+			m_active = false; // Makes the next non-stopped call reset the counters below
+			return g_recording_mode;
+		}
+
+		if (!m_active.exchange(true)) // True only on the first call after having been inactive
+		{
+			m_current_encoder_frame = 0;
+			m_current_encoder_sample = 0;
+			m_last_video_pts_incoming = -1;
+			m_last_audio_pts_incoming = -1;
+		}
+
+		if (m_current_encoder_frame == 0 && m_current_encoder_sample == 0) // Nothing counted yet: keep moving the start time to "now"
+		{
+			m_encoder_start = steady_clock::now();
+		}
+
+		return g_recording_mode;
 	}
 
 	bool video_provider::can_consume_frame()
 	{
 		std::lock_guard lock(m_mutex);
 
-		if (!m_image_sink)
+		if (!m_video_sink || !m_video_sink->use_internal_video) // No frames are wanted without a sink that has use_internal_video set
 			return false;
 
-		const usz timestamp_ms = std::chrono::duration_cast<std::chrono::milliseconds>(steady_clock::now() - m_encoder_start).count() - m_pause_time_ms;
-		const s64 pts = m_image_sink->get_pts(timestamp_ms);
-		return pts > m_last_pts_incoming;
+		const usz elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(steady_clock::now() - m_encoder_start).count(); // Time since m_encoder_start
+		ensure(elapsed_us >= m_pause_time_us); // The unsigned subtraction below must not wrap around
+
+		const usz timestamp_ms = (elapsed_us - m_pause_time_us) / 1000;
+		const s64 pts = m_video_sink->get_pts(timestamp_ms);
+		return pts > m_last_video_pts_incoming; // True if "now" maps to a newer pts than the last recorded video pts
 	}
 
 	void video_provider::present_frame(std::vector<u8>& data, u32 pitch, u32 width, u32 height, bool is_bgra)
 	{
 		std::lock_guard lock(m_mutex);
 
-		if (!m_image_sink || m_image_sink->has_error)
+		if (check_mode() == recording_mode::stopped) // check_mode() also handles a missing/failed sink and the encoder start time
 		{
-			g_recording_mode = recording_mode::stopped;
-			rsx::overlays::queue_message(localized_string_id::RECORDING_ABORTED);
+			return;
 		}
 
-		if (g_recording_mode == recording_mode::stopped)
+		// Calculate presentation timestamp from the time since m_encoder_start minus the pause time.
+		const usz elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(steady_clock::now() - m_encoder_start).count();
+		ensure(elapsed_us >= m_pause_time_us); // The unsigned subtraction below must not wrap around
+
+		const usz timestamp_ms = (elapsed_us - m_pause_time_us) / 1000;
+		const s64 pts = m_video_sink->get_pts(timestamp_ms);
+
+		// We can just skip this frame if its pts is not newer than the last recorded one.
+		if (pts <= m_last_video_pts_incoming)
 		{
-			m_active = false;
 			return;
 		}
 
-		if (!m_active.exchange(true))
+		if (m_video_sink->add_frame(data, pitch, width, height, is_bgra ? AVPixelFormat::AV_PIX_FMT_BGRA : AVPixelFormat::AV_PIX_FMT_RGBA, timestamp_ms)) // Only update the bookkeeping if add_frame() returned true
 		{
-			m_current_encoder_frame = 0;
-			m_last_pts_incoming = -1;
+			m_last_video_pts_incoming = pts;
+			m_current_encoder_frame++;
+		}
+	}
+
+	bool video_provider::can_consume_sample() // Audio counterpart of can_consume_frame(): true if "now" maps to a newer audio pts than the last recorded one
+	{
+		std::lock_guard lock(m_mutex);
+
+		if (!m_video_sink || !m_video_sink->use_internal_audio) // No samples are wanted without a sink that has use_internal_audio set
+			return false;
+
+		const usz elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(steady_clock::now() - m_encoder_start).count(); // Time since m_encoder_start
+		ensure(elapsed_us >= m_pause_time_us); // The unsigned subtraction below must not wrap around
+
+		const usz timestamp_us = elapsed_us - m_pause_time_us;
+		const s64 pts = m_video_sink->get_audio_pts(timestamp_us);
+		return pts > m_last_audio_pts_incoming;
+	}
+
+	void video_provider::present_samples(u8* buf, u32 sample_count, u16 channels) // Hands audio data to the current sink, stamped with the time since m_encoder_start minus the pause time
+	{
+		if (!buf || !sample_count || !channels) // Reject empty input before taking the lock
+		{
+			return;
 		}
 
-		if (m_current_encoder_frame == 0)
+		std::lock_guard lock(m_mutex);
+
+		if (check_mode() == recording_mode::stopped) // check_mode() also handles a missing/failed sink and the encoder start time
 		{
-			m_encoder_start = steady_clock::now();
+			return;
 		}
 
 		// Calculate presentation timestamp.
-		const usz timestamp_ms = std::chrono::duration_cast<std::chrono::milliseconds>(steady_clock::now() - m_encoder_start).count() - m_pause_time_ms;
-		const s64 pts = m_image_sink->get_pts(timestamp_ms);
+		const usz elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(steady_clock::now() - m_encoder_start).count();
+		ensure(elapsed_us >= m_pause_time_us); // The unsigned subtraction below must not wrap around
 
-		// We can just skip this frame if it has the same timestamp.
-		if (pts <= m_last_pts_incoming)
+		const usz timestamp_us = elapsed_us - m_pause_time_us;
+		const s64 pts = m_video_sink->get_audio_pts(timestamp_us);
+
+		// We can just skip these samples if their pts is not newer than the last recorded one.
+		if (pts <= m_last_audio_pts_incoming)
 		{
 			return;
 		}
 
-		m_last_pts_incoming = pts;
-
-		m_current_encoder_frame++;
-		m_image_sink->add_frame(data, pitch, width, height, is_bgra ? AVPixelFormat::AV_PIX_FMT_BGRA : AVPixelFormat::AV_PIX_FMT_RGBA, timestamp_ms);
+		if (m_video_sink->add_audio_samples(buf, sample_count, channels, timestamp_us)) // Only update the bookkeeping if add_audio_samples() returned true
+		{
+			m_last_audio_pts_incoming = pts;
+			m_current_encoder_sample += sample_count;
+		}
 	}
 }
diff --git a/rpcs3/util/video_provider.h b/rpcs3/util/video_provider.h
index 31a051a11283..0e30b01f7ec1 100644
--- a/rpcs3/util/video_provider.h
+++ b/rpcs3/util/video_provider.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "image_sink.h"
+#include "video_sink.h"
 
 enum class recording_mode
 {
@@ -17,20 +17,28 @@ namespace utils
 		video_provider() = default;
 		~video_provider();
 
-		bool set_image_sink(std::shared_ptr<image_sink> sink, recording_mode type);
-		void set_pause_time(usz pause_time_ms);
+		bool set_video_sink(std::shared_ptr<video_sink> sink, recording_mode type);
+		void set_pause_time_us(usz pause_time_us);
+
 		bool can_consume_frame();
 		void present_frame(std::vector<u8>& data, u32 pitch, u32 width, u32 height, bool is_bgra);
 
+		bool can_consume_sample();
+		void present_samples(u8* buf, u32 sample_count, u16 channels);
+
 	private:
+		recording_mode check_mode();
+
 		recording_mode m_type = recording_mode::stopped;
-		std::shared_ptr<image_sink> m_image_sink;
+		std::shared_ptr<video_sink> m_video_sink;
 		shared_mutex m_mutex{};
 		atomic_t<bool> m_active{false};
 		atomic_t<usz> m_current_encoder_frame{0};
+		atomic_t<usz> m_current_encoder_sample{0};
 		steady_clock::time_point m_encoder_start{};
-		s64 m_last_pts_incoming = -1;
-		usz m_pause_time_ms = 0;
+		s64 m_last_video_pts_incoming = -1;
+		s64 m_last_audio_pts_incoming = -1;
+		usz m_pause_time_us = 0;
 	};
 
 } // namespace utils
diff --git a/rpcs3/util/video_sink.h b/rpcs3/util/video_sink.h
new file mode 100644
index 000000000000..9f1aadd65e21
--- /dev/null
+++ b/rpcs3/util/video_sink.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include "util/types.hpp"
+#include "util/atomic.hpp"
+#include "Utilities/mutex.h"
+
+#include <deque>
+#include <cmath>
+#include <vector>
+
+namespace utils
+{
+	// Base class for sinks that receive video frames and audio samples for encoding.
+	// Incoming data is queued by add_frame() and add_audio_samples(); the queues and
+	// state flags are protected so that derived sinks can consume them.
+	class video_sink
+	{
+	public:
+		video_sink() = default;
+
+		// This is a polymorphic base class, so destroying a sink through a
+		// base class pointer has to run the derived destructor.
+		virtual ~video_sink() = default;
+
+		virtual void stop(bool flush = true) = 0;
+		virtual void pause(bool flush = true) = 0;
+		virtual void resume() = 0;
+
+		// Queues a video frame for encoding.
+		// Returns false without touching the frame while the sink is flushing or paused.
+		// On success the pixel data is moved out of 'frame' into the queue.
+		bool add_frame(std::vector<u8>& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms)
+		{
+			// Do not allow new frames while flushing or paused
+			if (m_flush || m_paused)
+				return false;
+
+			std::lock_guard lock(m_mtx);
+			m_frames_to_encode.emplace_back(timestamp_ms, pitch, width, height, pixel_format, std::move(frame));
+			return true;
+		}
+
+		// Queues a copy of the audio data in 'buf' for encoding.
+		// 'buf' has to hold sample_count * channels values of sizeof(f32) bytes each.
+		// Returns false if the sink is flushing or paused, or if any argument is null/zero.
+		bool add_audio_samples(const u8* buf, u32 sample_count, u16 channels, usz timestamp_us)
+		{
+			// Do not allow new samples while flushing or paused
+			if (m_flush || m_paused || !buf || !sample_count || !channels)
+				return false;
+
+			// Compute the byte count in usz so that the product cannot wrap around in 32 bits.
+			const usz size_in_bytes = static_cast<usz>(sample_count) * channels * sizeof(f32);
+			std::vector<u8> sample(buf, buf + size_in_bytes);
+
+			std::lock_guard lock(m_audio_mtx);
+			m_samples_to_encode.emplace_back(timestamp_us, sample_count, channels, std::move(sample));
+			return true;
+		}
+
+		// Converts a timestamp in milliseconds to the nearest video frame index at m_framerate.
+		s64 get_pts(usz timestamp_ms) const
+		{
+			return static_cast<s64>(std::round((timestamp_ms * m_framerate) / 1000.0));
+		}
+
+		// Converts a timestamp in microseconds to the nearest audio block index,
+		// where one block is m_samples_per_block samples at m_sample_rate.
+		s64 get_audio_pts(usz timestamp_us) const
+		{
+			static constexpr f64 us_per_sec = 1000000.0;
+			const f64 us_per_block = us_per_sec / (m_sample_rate / static_cast<f64>(m_samples_per_block));
+			return static_cast<s64>(std::round(timestamp_us / us_per_block));
+		}
+
+		// Converts a video frame index back to a timestamp in milliseconds (rounded to nearest).
+		usz get_timestamp_ms(s64 pts) const
+		{
+			return static_cast<usz>(std::round((pts * 1000) / static_cast<f64>(m_framerate)));
+		}
+
+		// Converts an audio block index back to a timestamp in microseconds (truncated, not rounded).
+		usz get_audio_timestamp_us(s64 pts) const
+		{
+			static constexpr f64 us_per_sec = 1000000.0;
+			const f64 us_per_block = us_per_sec / (m_sample_rate / static_cast<f64>(m_samples_per_block));
+			return static_cast<usz>(pts * us_per_block);
+		}
+
+		atomic_t<bool> has_error{false};
+
+		// A single queued video frame.
+		struct encoder_frame
+		{
+			encoder_frame() = default;
+			encoder_frame(usz timestamp_ms, u32 pitch, u32 width, u32 height, s32 av_pixel_format, std::vector<u8>&& data)
+				: timestamp_ms(timestamp_ms), pitch(pitch), width(width), height(height), av_pixel_format(av_pixel_format), data(std::move(data))
+			{}
+
+			s64 pts = -1; // Optional
+			usz timestamp_ms = 0;
+			u32 pitch = 0;
+			u32 width = 0;
+			u32 height = 0;
+			s32 av_pixel_format = 0; // NOTE: Make sure this is a valid AVPixelFormat
+			std::vector<u8> data;
+		};
+
+		// A single queued chunk of audio data.
+		struct encoder_sample
+		{
+			encoder_sample() = default;
+			encoder_sample(usz timestamp_us, u32 sample_count, u16 channels, std::vector<u8>&& data)
+				: timestamp_us(timestamp_us), sample_count(sample_count), channels(channels), data(std::move(data))
+			{
+			}
+
+			usz timestamp_us = 0;
+			u32 sample_count = 0;
+			u16 channels = 0;
+			std::vector<u8> data;
+		};
+
+		// These two variables should only be set once before we start encoding, so we don't need mutexes or atomics.
+		bool use_internal_audio = false; // True if we want to fetch samples from cellAudio
+		bool use_internal_video = false; // True if we want to fetch frames from rsx
+
+	protected:
+		shared_mutex m_mtx; // Guards m_frames_to_encode
+		std::deque<encoder_frame> m_frames_to_encode;
+		shared_mutex m_audio_mtx; // Guards m_samples_to_encode
+		std::deque<encoder_sample> m_samples_to_encode;
+		atomic_t<bool> m_paused = false;
+		atomic_t<bool> m_flush = false;
+		u32 m_framerate = 30;
+		u32 m_sample_rate = 48000;
+		static constexpr u32 m_samples_per_block = 256;
+	};
+}