Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bunch of microoptimizations #1600

Open
wants to merge 8 commits into
base: dev
Choose a base branch
from
35 changes: 18 additions & 17 deletions src/Layers/xrRender_R2/r2_R_lights.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ void CRender::render_lights(light_Package& LP)
L->vis_update();
if (!L->vis.visible)
{
source.erase(source.begin() + it);
std::swap(source[it], source.back());
source.pop_back();
it--;
}
else
Expand All @@ -42,25 +43,27 @@ void CRender::render_lights(light_Package& LP)
}
}

// 2. refactor - infact we could go from the backside and sort in ascending order
olefirenque marked this conversation as resolved.
Show resolved Hide resolved
// 2. refactor
{
xr_vector<light*>& source = LP.v_shadowed;
xr_vector<light*> refactored;
refactored.reserve(source.size());
const size_t total = source.size();

std::sort(source.begin(), source.end(), [](light* l1, light* l2)
{
const u32 a0 = l1->X.S.size;
const u32 a1 = l2->X.S.size;
return a0 < a1; // ascending order
});

for (u16 smap_ID = 0; refactored.size() != total; ++smap_ID)
{
LP_smap_pool.initialize(RImplementation.o.smapsize);
std::sort(source.begin(), source.end(), [](light* l1, light* l2)
{
const u32 a0 = l1->X.S.size;
const u32 a1 = l2->X.S.size;
return a0 > a1; // reverse -> descending
});
for (size_t test = 0; test < source.size(); ++test)
for (auto& L : source)
{
light* L = source[test];
if (!L)
continue;
SMAP_Rect R{};
if (LP_smap_pool.push(R, L->X.S.size))
{
Expand All @@ -69,14 +72,12 @@ void CRender::render_lights(light_Package& LP)
L->X.S.posY = R.min.y;
L->vis.smap_ID = smap_ID;
refactored.push_back(L);
source.erase(source.begin() + test);
--test;
L = nullptr;
}
}
}

// save (lights are popped from back)
std::reverse(refactored.begin(), refactored.end());
LP.v_shadowed = std::move(refactored);
}

Expand Down Expand Up @@ -129,7 +130,7 @@ void CRender::render_lights(light_Package& LP)
}
};

const auto& flush_lights = [&]()
const auto& flush_lights = [this] (xr_vector<task_data_t>& lights_queue)
{
for (const auto& [L, task, batch_id] : lights_queue)
{
Expand Down Expand Up @@ -178,7 +179,7 @@ void CRender::render_lights(light_Package& LP)
}

L->svis[batch_id].end(); // NOTE(DX11): occqs are fetched here, this should be done on the imm context only
RImplementation.release_context(batch_id);
release_context(batch_id);
}

lights_queue.clear();
Expand Down Expand Up @@ -206,7 +207,7 @@ void CRender::render_lights(light_Package& LP)
if (batch_id == R_dsgraph_structure::INVALID_CONTEXT_ID)
{
VERIFY(!lights_queue.empty());
flush_lights();
flush_lights(lights_queue);
continue;
}

Expand All @@ -228,7 +229,7 @@ void CRender::render_lights(light_Package& LP)
}
lights_queue.emplace_back(data);
}
flush_lights(); // in case if something left
flush_lights(lights_queue); // in case if something left

cmd_list.Invalidate();

Expand Down
1 change: 1 addition & 0 deletions src/xrAICore/Navigation/level_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ class XRAICORE_API CLevelGraph
IC void assign_y_values(xr_vector<T>& path);
template <typename P>
IC void iterate_vertices(const Fvector& min_position, const Fvector& max_position, const P& predicate) const;
IC std::pair<CLevelVertex*, CLevelVertex*> get_range(const Fvector& min_position, const Fvector& max_position) const;
IC bool check_vertex_in_direction(u32 start_vertex_id, const Fvector2& start_position, u32 finish_vertex_id) const;
IC u32 check_position_in_direction(
u32 start_vertex_id, const Fvector2& start_position, const Fvector2& finish_position) const;
Expand Down
31 changes: 19 additions & 12 deletions src/xrAICore/Navigation/level_graph_inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -572,27 +572,34 @@ template <typename P>
IC void CLevelGraph::iterate_vertices(
const Fvector& min_position, const Fvector& max_position, const P& predicate) const
{
const auto begin = m_nodes->begin(), end = m_nodes->end();
auto [begin, end] = get_range(min_position, max_position);

CLevelVertex *I, *E;
for (; begin != end; ++begin)
predicate(*begin);
}

IC std::pair<CLevelGraph::CLevelVertex*, CLevelGraph::CLevelVertex*> CLevelGraph::get_range(const Fvector& min_position, const Fvector& max_position) const
{
const auto _begin = m_nodes->begin(), _end = m_nodes->end();

CLevelVertex *begin, *end;
if (valid_vertex_position(min_position))
I = std::lower_bound(
begin, end, vertex_position(min_position).xz(), &vertex::predicate2);
begin = std::lower_bound(
_begin, _end, vertex_position(min_position).xz(), &vertex::predicate2);
else
I = begin;
begin = _begin;

if (valid_vertex_position(max_position))
{
E = std::upper_bound(
begin, end, vertex_position(max_position).xz(), &vertex::predicate);
if (E != (end))
++E;
end = std::upper_bound(
_begin, _end, vertex_position(max_position).xz(), &vertex::predicate);
if (end != (_end))
++end;
}
else
E = end;
end = _end;

for (; I != E; ++I)
predicate(*I);
return { begin, end };
}

IC u32 CLevelGraph::max_x() const { return (m_max_x); }
Expand Down
3 changes: 1 addition & 2 deletions src/xrGame/Level.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,8 +443,7 @@ void CLevel::OnFrame()
if (g_mt_config.test(mtMap))
{
R_ASSERT(m_map_manager);
Device.seqParallel.push_back(
fastdelegate::FastDelegate0<>(m_map_manager, &CMapManager::Update));
Device.seqParallel.emplace_back(m_map_manager, &CMapManager::Update);
}
else
MapManager().Update();
Expand Down
39 changes: 22 additions & 17 deletions src/xrGame/cover_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,37 +74,42 @@ void CCoverManager::compute_static_cover()
{
clear();
xr_delete(m_covers);
m_covers = xr_new<CPointQuadTree>(
ai().level_graph().header().box(), ai().level_graph().header().cell_size() * .5f, 8 * 65536, 4 * 65536);
m_temp.resize(ai().level_graph().header().vertex_count());

const CLevelGraph& graph = ai().level_graph();
const u32 levelVertexCount = ai().level_graph().header().vertex_count();
const LevelGraph::CHeader& levelGraphHeader = graph.header();
const u32 levelVertexCount = levelGraphHeader.vertex_count();

m_covers = xr_new<CPointQuadTree>(levelGraphHeader.box(), levelGraphHeader.cell_size() * .5f, 8 * 65536, 4 * 65536);

// avoiding extra allocations with a static storage for m_covers
static xr_vector<std::optional<CCoverPoint>> quadTreeStaticStorage;
Copy link
Contributor Author

@olefirenque olefirenque Feb 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this is a good trade-off between allocs/RSS. It would be nice to hear an opinion on this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not recommended to use global/static objects which allocate memory dynamically.

quadTreeStaticStorage.resize(levelVertexCount);
m_temp.resize(levelVertexCount);

xr_parallel_for(TaskRange<u32>(0, levelVertexCount), [&](const TaskRange<u32>& range)
{
for (u32 i = range.begin(); i != range.end(); ++i)
{
m_temp[i] = false;
quadTreeStaticStorage[i] = std::nullopt;

const CLevelGraph::CLevelVertex& vertex = *graph.vertex(i);
if (vertex.high_cover(0) + vertex.high_cover(1) + vertex.high_cover(2) + vertex.high_cover(3))
{
m_temp[i] = edge_vertex(i);
continue;
}
const int highCover = vertex.high_cover(0) + vertex.high_cover(1) + vertex.high_cover(2) + vertex.high_cover(3);
const int lowCover = vertex.low_cover(0) + vertex.low_cover(1) + vertex.low_cover(2) + vertex.low_cover(3);

if (vertex.low_cover(0) + vertex.low_cover(1) + vertex.low_cover(2) + vertex.low_cover(3))
if (highCover || lowCover)
{
m_temp[i] = edge_vertex(i);
continue;
if (m_temp[i] && critical_cover(i))
{
quadTreeStaticStorage[i] = CCoverPoint(graph.vertex_position(graph.vertex(i)), i);
}
}

m_temp[i] = false;
}
});

for (u32 i = 0; i < levelVertexCount; ++i)
if (m_temp[i] && critical_cover(i))
m_covers->insert(xr_new<CCoverPoint>(ai().level_graph().vertex_position(ai().level_graph().vertex(i)), i));
for (auto& p : quadTreeStaticStorage)
if (p.has_value())
Copy link
Contributor Author

@olefirenque olefirenque Feb 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seemed that accessing m_temp and calling critical_cover took some time here

for (u32 i = 0; i < levelVertexCount; ++i)
        if (m_temp[i] && critical_cover(i))
            m_covers->insert(xr_new<CCoverPoint>(ai().level_graph().vertex_position(ai().level_graph().vertex(i)), i));

image

m_covers->insert(&p.value());

VERIFY(!m_smart_covers_storage);
m_smart_covers_storage = xr_new<smart_cover::storage>();
Expand Down
4 changes: 3 additions & 1 deletion src/xrGame/cover_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ class CCoverManager

protected:
CPointQuadTree* m_covers;
xr_vector<bool> m_temp;
// vector<bool> is not applicable for `m_temp`
// since it is filled in parallel_for (https://timsong-cpp.github.io/cppwp/container.requirements.dataraces).
xr_vector<int> m_temp;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose it wasn't thread-safe before

mutable PointVector m_nearest;

private:
Expand Down
40 changes: 38 additions & 2 deletions src/xrGame/space_restriction_shape.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "xrAICore/Navigation/level_graph.h"
#include "space_restrictor.h"
#include "xrAICore/Navigation/graph_engine.h"
#include "xrCore/Threading/ParallelFor.hpp"

struct CBorderMergePredicate
{
Expand Down Expand Up @@ -79,10 +80,45 @@ void CSpaceRestrictionShape::fill_shape(const CCF_Shape::shape_def& shape)
}
default: NODEFAULT;
}
ai().level_graph().iterate_vertices(start, dest, CBorderMergePredicate(this));

CLevelGraph& graph = ai().level_graph();

std::mutex mergeMutex;

auto [begin, end] = graph.get_range(start, dest);
xr_parallel_for(TaskRange(begin, end), [this, &mergeMutex, &graph] (const auto& range)
{
xr_vector<u32> m_border_chunk;
m_border_chunk.reserve(range.size());
for (const auto& vertex : range)
{
if (inside(graph.vertex_id(&vertex), true) &&
!inside(graph.vertex_id(&vertex), false))
m_border_chunk.push_back(graph.vertex_id(&vertex));
}
std::lock_guard lock(mergeMutex);
if (m_border.capacity() < m_border.size() + m_border_chunk.size())
m_border.reserve(m_border.size() + m_border_chunk.size());
for (auto x : m_border_chunk)
m_border.push_back(x);
Comment on lines +94 to +103
Copy link
Contributor Author

@olefirenque olefirenque Feb 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did a little research on this, and it seemed to me that the body of CBorderMergePredicate::operator(...) is a bigger problem than merging a certain number of chunks under lock
image

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, it might be irrelevant since I accidentally used UI Freeze option.
I'm sorry for the misinformation. Apparently, this flame graph represents only those samples that were sampled when the NPC's spawning was lagging.

The UI Freeze event indicates time intervals where the application was unable to respond to user input. More specifically, these are time intervals where window messages were not pumped for more than 200 ms or processing of a particular message took more than 200 ms.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@olefirenque, what profiler did you use?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To me this strongly looks like tracy but I might be wrong there

});

#ifdef DEBUG
ai().level_graph().iterate_vertices(start, dest, CShapeTestPredicate(this));
xr_parallel_for(TaskRange(begin, end), [this, &mergeMutex, &graph] (const auto& range)
{
xr_vector<u32> m_test_storage_chunk;
m_test_storage_chunk.reserve(range.size());
for (const auto& vertex : range)
{
if (inside(graph.vertex_id(&vertex), false))
m_test_storage_chunk.push_back(graph.vertex_id(&vertex));
}
std::lock_guard lock(mergeMutex);
if (m_test_storage.capacity() < m_test_storage.size() + m_test_storage_chunk.size())
m_test_storage.reserve(m_test_storage.size() + m_test_storage_chunk.size());
for (auto x : m_test_storage_chunk)
m_test_storage.push_back(x);
});
#endif
}

Expand Down
9 changes: 8 additions & 1 deletion src/xrGame/space_restrictor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,14 @@ void CSpaceRestrictor::net_Destroy()
bool CSpaceRestrictor::inside(const Fsphere& sphere) const
{
if (!actual())
prepare();
{
static std::mutex prepareMutex;
std::lock_guard lock(prepareMutex);

// Double-checked locking
if (!actual())
prepare();
}
Comment on lines 102 to +110
Copy link
Contributor Author

@olefirenque olefirenque Feb 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it was only non-concurrent part for xr_parallel_for in CSpaceRestrictionShape::fill_shape. Since it is called only once for a sequence of CSpaceRestrictor::inside calls, maybe it would be better to extract this to reduce contention


if (!m_selfbounds.intersect(sphere))
return (false);
Expand Down
Loading