Skip to content

Commit

Permalink
rsx/gl/vk: Fixes and optimizations
Browse files Browse the repository at this point in the history
- OpenGL driver optimization for NVIDIA: on NVIDIA drivers, glTextureBufferRange performance is horrendous
-- Initialize the texture buffer to cover the whole buffer at startup and use absolute offsets to read data instead
-- Over 2x performance in some cases (Resogun, TNT Racers)
- gl/vk: Do not flip non-existent display buffers. Fixes a spec violation at boot in the TNT Racers demo
- Whitespace fixes for sys_rsx
  • Loading branch information
kd-11 committed Jan 21, 2018
1 parent 15089a0 commit ccc62e3
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 130 deletions.
27 changes: 19 additions & 8 deletions rpcs3/Emu/Cell/lv2/sys_rsx.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

struct RsxDriverInfo {
struct RsxDriverInfo
{
be_t<u32> version_driver; // 0x0
be_t<u32> version_gpu; // 0x4
be_t<u32> memory_size; // 0x8
Expand All @@ -15,7 +16,9 @@ struct RsxDriverInfo {
be_t<u32> unk3[6]; // 0x38-0x54
be_t<u32> systemModeFlags; // 0x54
u8 unk4[0x1064]; // 0x10B8
struct Head {

struct Head
{
be_t<u64> lastFlipTime; // 0x0 last flip time
be_t<u32> flipFlags; // 0x8 flags to handle flip/queue
be_t<u32> unk1; // 0xC
Expand All @@ -29,6 +32,7 @@ struct RsxDriverInfo {
be_t<u32> unk; // 0x38 possible u32, 'flip field', top/bottom for interlaced
be_t<u32> unk5; // 0x3C possible high bits of time stamp? used in getlastVBlankTime
} head[8]; // size = 0x40, 0x200

be_t<u32> unk7; // 0x12B8
be_t<u32> unk8; // 0x12BC
be_t<u32> handlers; // 0x12C0 -- flags showing which handlers are set
Expand All @@ -46,10 +50,12 @@ struct RsxDriverInfo {
be_t<u32> lastError; // 0x12F4 error param for cellGcmSetGraphicsHandler
// todo: there's more to this
};

static_assert(sizeof(RsxDriverInfo) == 0x12F8, "rsxSizeTest");
static_assert(sizeof(RsxDriverInfo::Head) == 0x40, "rsxHeadSizeTest");

struct RsxDmaControl {
struct RsxDmaControl
{
u8 resv[0x40];
atomic_be_t<u32> put;
atomic_be_t<u32> get;
Expand All @@ -58,30 +64,35 @@ struct RsxDmaControl {
be_t<u32> unk1;
};

// One semaphore slot: a u32 value, a u32 pad word and a u64 timestamp.
// RsxReports holds an array of 0x100 of these.
// NOTE(review): be_t<> is presumably the project's big-endian wrapper — confirm.
struct RsxSemaphore {
struct RsxSemaphore
{
be_t<u32> val; // semaphore value
be_t<u32> pad; // presumably padding only (per the name) — TODO confirm
be_t<u64> timestamp; // time stamp; clock source/units are not visible here
};

// One notify slot: two u64 fields, a timestamp followed by a field named `zero`.
// RsxReports holds an array of 64 of these.
// NOTE(review): be_t<> is presumably the project's big-endian wrapper — confirm.
struct RsxNotify
{
struct RsxNotify
{
be_t<u64> timestamp; // time stamp; clock source/units are not visible here
be_t<u64> zero; // presumably always written as 0 (per the name) — TODO confirm
};

// One report slot: a u64 timestamp followed by a u32 value and a u32 pad word.
// Note the field order differs from RsxSemaphore (timestamp comes first here).
// RsxReports holds an array of 2048 of these.
struct RsxReport {
struct RsxReport
{
be_t<u64> timestamp; // time stamp; clock source/units are not visible here
be_t<u32> val; // report value
be_t<u32> pad; // presumably padding only (per the name) — TODO confirm
};

// Aggregate of the three record tables, laid out back to back in this order:
// semaphores, then notifies, then reports.
struct RsxReports {
struct RsxReports
{
RsxSemaphore semaphore[0x100]; // 0x100 (256) semaphore slots
RsxNotify notify[64]; // 64 notify slots
RsxReport report[2048]; // 2048 report slots
};

struct RsxDisplayInfo {
struct RsxDisplayInfo
{
be_t<u32> offset;
be_t<u32> pitch;
be_t<u32> width;
Expand Down
186 changes: 99 additions & 87 deletions rpcs3/Emu/RSX/GL/GLGSRender.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,7 @@ void GLGSRender::end()
}

//Do vertex upload before RTT prep / texture lookups to give the driver time to push data
u32 vertex_draw_count;
u32 actual_vertex_count;
u32 vertex_base;
std::optional<std::tuple<GLenum, u32> > indexed_draw_info;
std::tie(vertex_draw_count, actual_vertex_count, vertex_base, indexed_draw_info) = set_vertex_buffer();
auto upload_info = set_vertex_buffer();

//Load textures
{
Expand Down Expand Up @@ -294,7 +290,7 @@ void GLGSRender::end()
std::chrono::time_point<steady_clock> program_start = steady_clock::now();
//Load program here since it is dependent on vertex state

load_program(vertex_base, actual_vertex_count);
load_program(upload_info);

std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
m_begin_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
Expand Down Expand Up @@ -492,10 +488,10 @@ void GLGSRender::end()
const GLenum draw_mode = gl::draw_mode(rsx::method_registers.current_draw_clause.primitive);
bool single_draw = !supports_multidraw || (rsx::method_registers.current_draw_clause.first_count_commands.size() <= 1 || rsx::method_registers.current_draw_clause.is_disjoint_primitive);

if (indexed_draw_info)
if (upload_info.index_info)
{
const GLenum index_type = std::get<0>(indexed_draw_info.value());
const u32 index_offset = std::get<1>(indexed_draw_info.value());
const GLenum index_type = std::get<0>(upload_info.index_info.value());
const u32 index_offset = std::get<1>(upload_info.index_info.value());
const bool restarts_valid = gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive) && !rsx::method_registers.current_draw_clause.is_disjoint_primitive;

if (gl_state.enable(restarts_valid && rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART))
Expand All @@ -505,7 +501,7 @@ void GLGSRender::end()

if (single_draw)
{
glDrawElements(draw_mode, vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset);
glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset);
}
else
{
Expand Down Expand Up @@ -535,7 +531,7 @@ void GLGSRender::end()
{
if (single_draw)
{
glDrawArrays(draw_mode, 0, vertex_draw_count);
glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count);
}
else
{
Expand Down Expand Up @@ -652,16 +648,25 @@ void GLGSRender::on_init_thread()
//Use industry standard resource alignment values as defaults
m_uniform_buffer_offset_align = 256;
m_min_texbuffer_alignment = 256;
m_max_texbuffer_size = 0;

glEnable(GL_VERTEX_PROGRAM_POINT_SIZE);
glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &m_uniform_buffer_offset_align);
glGetIntegerv(GL_TEXTURE_BUFFER_OFFSET_ALIGNMENT, &m_min_texbuffer_alignment);
glGetIntegerv(GL_MAX_TEXTURE_BUFFER_SIZE, &m_max_texbuffer_size);
m_vao.create();

//Set min alignment to 16-bytes for SSE optimizations with aligned addresses to work
m_min_texbuffer_alignment = std::max(m_min_texbuffer_alignment, 16);
m_uniform_buffer_offset_align = std::max(m_uniform_buffer_offset_align, 16);

LOG_NOTICE(RSX, "Supported texel buffer size reported: %d bytes", m_max_texbuffer_size);
if (m_max_texbuffer_size < (16 * 0x100000))
{
LOG_ERROR(RSX, "Max texture buffer size supported is less than 16M which is useless. Expect undefined behaviour.");
m_max_texbuffer_size = (16 * 0x100000);
}

const u32 texture_index_offset = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;

//Array stream buffer
Expand Down Expand Up @@ -709,11 +714,14 @@ void GLGSRender::on_init_thread()
m_index_ring_buffer.reset(new gl::ring_buffer());
}

m_attrib_ring_buffer->create(gl::buffer::target::texture, 256 * 0x100000);
m_index_ring_buffer->create(gl::buffer::target::element_array, 64 * 0x100000);
m_transform_constants_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_fragment_constants_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_vertex_state_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_attrib_ring_buffer->create(gl::buffer::target::texture, std::min<GLsizeiptr>(m_max_texbuffer_size, 256 * 0x100000));
m_index_ring_buffer->create(gl::buffer::target::element_array, std::min<GLsizeiptr>(m_max_texbuffer_size, 64 * 0x100000));
m_transform_constants_buffer->create(gl::buffer::target::uniform, std::min<GLsizeiptr>(m_max_texbuffer_size, 16 * 0x100000));
m_fragment_constants_buffer->create(gl::buffer::target::uniform, std::min<GLsizeiptr>(m_max_texbuffer_size, 16 * 0x100000));
m_vertex_state_buffer->create(gl::buffer::target::uniform, std::min<GLsizeiptr>(m_max_texbuffer_size, 16 * 0x100000));

m_gl_persistent_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, 0, (u32)m_attrib_ring_buffer->size());
m_gl_volatile_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, 0, (u32)m_attrib_ring_buffer->size());

m_vao.element_array_buffer = *m_index_ring_buffer;

Expand Down Expand Up @@ -999,7 +1007,7 @@ bool GLGSRender::check_program_state()
return (rsx::method_registers.shader_program_address() != 0);
}

void GLGSRender::load_program(u32 vertex_base, u32 vertex_count)
void GLGSRender::load_program(const vertex_upload_info& upload_info)
{
get_current_fragment_program(fs_sampler_state);
verify(HERE), current_fragment_program.valid;
Expand Down Expand Up @@ -1055,11 +1063,11 @@ void GLGSRender::load_program(u32 vertex_base, u32 vertex_count)
fill_scale_offset_data(buf, false);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<u32*>(buf + 132)) = vertex_base;
*(reinterpret_cast<u32*>(buf + 132)) = upload_info.vertex_index_base;
*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.point_size();
*(reinterpret_cast<f32*>(buf + 140)) = rsx::method_registers.clip_min();
*(reinterpret_cast<f32*>(buf + 144)) = rsx::method_registers.clip_max();
fill_vertex_layout_state(m_vertex_layout, vertex_count, reinterpret_cast<s32*>(buf + 160));
fill_vertex_layout_state(m_vertex_layout, upload_info.allocated_vertex_count, reinterpret_cast<s32*>(buf + 160), upload_info.persistent_mapping_offset, upload_info.volatile_mapping_offset);

if (m_transform_constants_dirty)
{
Expand Down Expand Up @@ -1223,96 +1231,100 @@ void GLGSRender::flip(int buffer)
return;
}

gl::screen.clear(gl::buffers::color);

u32 buffer_width = display_buffers[buffer].width;
u32 buffer_height = display_buffers[buffer].height;
u32 buffer_pitch = display_buffers[buffer].pitch;

// Calculate blit coordinates
coordi aspect_ratio;
sizei csize(m_frame->client_width(), m_frame->client_height());
sizei new_size = csize;

if (!g_cfg.video.stretch_to_display_area)
if (buffer < display_buffers_count && buffer_width && buffer_height && buffer_pitch)
{
const double aq = (double)buffer_width / buffer_height;
const double rq = (double)new_size.width / new_size.height;
const double q = aq / rq;
// Calculate blit coordinates
coordi aspect_ratio;
sizei csize(m_frame->client_width(), m_frame->client_height());
sizei new_size = csize;

if (q > 1.0)
{
new_size.height = int(new_size.height / q);
aspect_ratio.y = (csize.height - new_size.height) / 2;
}
else if (q < 1.0)
if (!g_cfg.video.stretch_to_display_area)
{
new_size.width = int(new_size.width * q);
aspect_ratio.x = (csize.width - new_size.width) / 2;
}
}
const double aq = (double)buffer_width / buffer_height;
const double rq = (double)new_size.width / new_size.height;
const double q = aq / rq;

aspect_ratio.size = new_size;

// Find the source image
rsx::tiled_region buffer_region = get_tiled_address(display_buffers[buffer].offset, CELL_GCM_LOCATION_LOCAL);
u32 absolute_address = buffer_region.address + buffer_region.base;
if (q > 1.0)
{
new_size.height = int(new_size.height / q);
aspect_ratio.y = (csize.height - new_size.height) / 2;
}
else if (q < 1.0)
{
new_size.width = int(new_size.width * q);
aspect_ratio.x = (csize.width - new_size.width) / 2;
}
}

m_flip_fbo.recreate();
m_flip_fbo.bind();
aspect_ratio.size = new_size;

const u32 size = buffer_pitch * buffer_height;
if (auto render_target_texture = m_rtts.get_texture_from_render_target_if_applicable(absolute_address))
{
buffer_width = render_target_texture->width();
buffer_height = render_target_texture->height();
// Find the source image
rsx::tiled_region buffer_region = get_tiled_address(display_buffers[buffer].offset, CELL_GCM_LOCATION_LOCAL);
u32 absolute_address = buffer_region.address + buffer_region.base;

m_flip_fbo.color = *render_target_texture;
m_flip_fbo.read_buffer(m_flip_fbo.color);
}
else if (auto surface = m_gl_texture_cache.find_texture_from_dimensions(absolute_address))
{
//Hack - this should be the first location to check for output
//The render might have been done offscreen or in software and a blit used to display
m_flip_fbo.color = surface->get_raw_view();
m_flip_fbo.read_buffer(m_flip_fbo.color);
}
else
{
LOG_WARNING(RSX, "Flip texture was not found in cache. Uploading surface from CPU");
m_flip_fbo.recreate();
m_flip_fbo.bind();

if (!m_flip_tex_color || m_flip_tex_color.size() != sizei{ (int)buffer_width, (int)buffer_height })
const u32 size = buffer_pitch * buffer_height;
if (auto render_target_texture = m_rtts.get_texture_from_render_target_if_applicable(absolute_address))
{
m_flip_tex_color.recreate(gl::texture::target::texture2D);

m_flip_tex_color.config()
.size({ (int)buffer_width, (int)buffer_height })
.type(gl::texture::type::uint_8_8_8_8)
.format(gl::texture::format::bgra);
buffer_width = render_target_texture->width();
buffer_height = render_target_texture->height();

m_flip_tex_color.pixel_unpack_settings().aligment(1).row_length(buffer_pitch / 4);
m_flip_fbo.color = *render_target_texture;
m_flip_fbo.read_buffer(m_flip_fbo.color);
}

if (buffer_region.tile)
else if (auto surface = m_gl_texture_cache.find_texture_from_dimensions(absolute_address))
{
std::unique_ptr<u8[]> temp(new u8[buffer_height * buffer_pitch]);
buffer_region.read(temp.get(), buffer_width, buffer_height, buffer_pitch);
m_flip_tex_color.copy_from(temp.get(), gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8);
//Hack - this should be the first location to check for output
//The render might have been done offscreen or in software and a blit used to display
m_flip_fbo.color = surface->get_raw_view();
m_flip_fbo.read_buffer(m_flip_fbo.color);
}
else
{
m_flip_tex_color.copy_from(buffer_region.ptr, gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8);
}
LOG_WARNING(RSX, "Flip texture was not found in cache. Uploading surface from CPU");

m_flip_fbo.color = m_flip_tex_color;
m_flip_fbo.read_buffer(m_flip_fbo.color);
}
if (!m_flip_tex_color || m_flip_tex_color.size() != sizei{ (int)buffer_width, (int)buffer_height })
{
m_flip_tex_color.recreate(gl::texture::target::texture2D);

// Blit source image to the screen
// Disable scissor test (affects blit)
glDisable(GL_SCISSOR_TEST);
m_flip_tex_color.config()
.size({ (int)buffer_width, (int)buffer_height })
.type(gl::texture::type::uint_8_8_8_8)
.format(gl::texture::format::bgra);

areai screen_area = coordi({}, { (int)buffer_width, (int)buffer_height });
gl::screen.clear(gl::buffers::color);
m_flip_fbo.blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear);
m_flip_tex_color.pixel_unpack_settings().aligment(1).row_length(buffer_pitch / 4);
}

if (buffer_region.tile)
{
std::unique_ptr<u8[]> temp(new u8[buffer_height * buffer_pitch]);
buffer_region.read(temp.get(), buffer_width, buffer_height, buffer_pitch);
m_flip_tex_color.copy_from(temp.get(), gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8);
}
else
{
m_flip_tex_color.copy_from(buffer_region.ptr, gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8);
}

m_flip_fbo.color = m_flip_tex_color;
m_flip_fbo.read_buffer(m_flip_fbo.color);
}

// Blit source image to the screen
// Disable scissor test (affects blit)
glDisable(GL_SCISSOR_TEST);

areai screen_area = coordi({}, { (int)buffer_width, (int)buffer_height });
m_flip_fbo.blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear);
}

if (m_custom_ui)
{
Expand Down
Loading

0 comments on commit ccc62e3

Please sign in to comment.