rsx: Emit simpler fragment program code

- Optimize clamp16
- Use bfe (bitfield extract) instead of shift-and
RPCS3 · Sep 26, 2020 · b929ad0 · b929ad0
1 parent bef9555
commit b929ad0
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 63 deletions.
diff --git a/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp
@@ -191,7 +191,7 @@ std::string FragmentProgramDecompiler::AddReg(u32 index, bool fp16)
 	const std::string type_name = (fp16 && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4);
 	const std::string reg_name = std::string(fp16 ? "h" : "r") + std::to_string(index);
 
-	return m_parr.AddParam(PF_PARAM_NONE, type_name, reg_name, type_name + "(0., 0., 0., 0.)");
+	return m_parr.AddParam(PF_PARAM_NONE, type_name, reg_name, type_name + "(0.)");
 }
 
 bool FragmentProgramDecompiler::HasReg(u32 index, bool fp16)
@@ -255,12 +255,12 @@ std::string FragmentProgramDecompiler::AddTex()
 
 std::string FragmentProgramDecompiler::AddType3()
 {
-	return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "src3", getFloatTypeName(4) + "(1., 1., 1., 1.)");
+	return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "src3", getFloatTypeName(4) + "(1.)");
 }
 
 std::string FragmentProgramDecompiler::AddX2d()
 {
-	return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "x2d", getFloatTypeName(4) + "(0., 0., 0., 0.)");
+	return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "x2d", getFloatTypeName(4) + "(0.)");
 }
 
 std::string FragmentProgramDecompiler::ClampValue(const std::string& code, u32 precision)
@@ -366,6 +366,7 @@ std::string FragmentProgramDecompiler::Format(const std::string& code, bool igno
 std::string FragmentProgramDecompiler::GetRawCond()
 {
 	static constexpr std::string_view f = "xyzw";
+	const auto zero = getFloatTypeName(4) + "(0.)";
 
 	std::string swizzle, cond;
 	swizzle.reserve(5);
@@ -381,17 +382,17 @@ std::string FragmentProgramDecompiler::GetRawCond()
 	}
 
 	if (src0.exec_if_gr && src0.exec_if_eq)
-		cond = compareFunction(COMPARE::FUNCTION_SGE, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)");
+		cond = compareFunction(COMPARE::FUNCTION_SGE, AddCond() + swizzle, zero);
 	else if (src0.exec_if_lt && src0.exec_if_eq)
-		cond = compareFunction(COMPARE::FUNCTION_SLE, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)");
+		cond = compareFunction(COMPARE::FUNCTION_SLE, AddCond() + swizzle, zero);
 	else if (src0.exec_if_gr && src0.exec_if_lt)
-		cond = compareFunction(COMPARE::FUNCTION_SNE, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)");
+		cond = compareFunction(COMPARE::FUNCTION_SNE, AddCond() + swizzle, zero);
 	else if (src0.exec_if_gr)
-		cond = compareFunction(COMPARE::FUNCTION_SGT, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)");
+		cond = compareFunction(COMPARE::FUNCTION_SGT, AddCond() + swizzle, zero);
 	else if (src0.exec_if_lt)
-		cond = compareFunction(COMPARE::FUNCTION_SLT, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)");
+		cond = compareFunction(COMPARE::FUNCTION_SLT, AddCond() + swizzle, zero);
 	else //if(src0.exec_if_eq)
-		cond = compareFunction(COMPARE::FUNCTION_SEQ, AddCond() + swizzle, getFloatTypeName(4) + "(0., 0., 0., 0.)");
+		cond = compareFunction(COMPARE::FUNCTION_SEQ, AddCond() + swizzle, zero);
 
 	return cond;
 }
@@ -698,7 +699,7 @@ std::string FragmentProgramDecompiler::BuildCode()
 
 	const bool fp16_out = !(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
 	const std::string float4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4);
-	const std::string init_value = float4_type + "(0., 0., 0., 0.)";
+	const std::string init_value = float4_type + "(0.)";
 	std::array<std::string, 4> output_register_names;
 	std::array<u32, 4> ouput_register_indices = { 0, 2, 3, 4 };
 	bool shader_is_valid = false;
@@ -769,7 +770,7 @@ std::string FragmentProgramDecompiler::BuildCode()
 		"{\n"
 		"	// Treat NaNs as 0\n"
 		"	bvec4 nans = isnan(x);\n"
-		"	x = _select(x, $float4(0., 0., 0., 0.), nans);\n"
+		"	x = _select(x, $float4(0.), nans);\n"
 		"	return clamp(x, _min, _max);\n"
 		"}\n\n";
 
@@ -780,7 +781,7 @@ std::string FragmentProgramDecompiler::BuildCode()
 			"{\n"
 			"	// Treat NaNs as 0\n"
 			"	bvec4 nans = isnan(x);\n"
-			"	x = _select(x, $half4(0., 0., 0., 0.), nans);\n"
+			"	x = _select(x, $half4(0.), nans);\n"
 			"	return clamp(x, $half_t(_min), $half_t(_max));\n"
 			"}\n\n";
 		}
@@ -791,32 +792,26 @@ std::string FragmentProgramDecompiler::BuildCode()
 	if (!device_props.has_native_half_support)
 	{
 		// Accurate float to half clamping (preserves IEEE-754 NaN)
-		std::string clamp_func =
-		"$float4 clamp16($float4 x)\n"
-		"{\n";
-
+		std::string clamp_func;
 		if (glsl)
 		{
 			clamp_func +=
-			"	uvec4 bits = floatBitsToUint(x);\n"
-			"	uvec4 extend = uvec4(0x7f800000);\n"
-			"	bvec4 test = equal(bits & extend, extend);\n"
-			"	vec4 clamped = clamp(x, -65504., +65504.);\n"
-			"	return _select(clamped, x, test);\n";
+			"vec2 clamp16(vec2 val){ return unpackHalf2x16(packHalf2x16(val)); }\n"
+			"vec4 clamp16(vec4 val){ return vec4(clamp16(val.xy), clamp16(val.zw)); }\n\n";
 		}
 		else
 		{
 			clamp_func +=
+			"$float4 clamp16($float4 x)\n"
+			"{\n"
 			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
-			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
-			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
-			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
-			"	return x;\n";
+			"	if (!isnan(x.y) && !isinf(x.y)) x.y = clamp(x.y, -65504., +65504.);\n" // one guarded clamp per component; previously .x was clamped four times
+			"	if (!isnan(x.z) && !isinf(x.z)) x.z = clamp(x.z, -65504., +65504.);\n"
+			"	if (!isnan(x.w) && !isinf(x.w)) x.w = clamp(x.w, -65504., +65504.);\n"
+			"	return x;\n"
+			"}\n\n";
 		}
 
-		clamp_func +=
-		"}\n\n";
-
 		OS << Format(clamp_func);
 	}
 	else

diff --git a/rpcs3/Emu/RSX/Common/GLSLCommon.h b/rpcs3/Emu/RSX/Common/GLSLCommon.h
@@ -275,13 +275,13 @@ namespace glsl
 		"uint gen_bits(const in uint x, const in uint y, const in uint z, const in uint w, const in bool swap)\n"
 		"{\n"
 		"	return (swap) ?\n"
-		"		bitfieldInsert(bitfieldInsert(bitfieldInsert(w, z, 8, 8), y, 16, 8), x, 24, 8) :\n"
-		"		bitfieldInsert(bitfieldInsert(bitfieldInsert(x, y, 8, 8), z, 16, 8), w, 24, 8);\n"
+		"		_set_bits(_set_bits(_set_bits(w, z, 8, 8), y, 16, 8), x, 24, 8) :\n"
+		"		_set_bits(_set_bits(_set_bits(x, y, 8, 8), z, 16, 8), w, 24, 8);\n"
 		"}\n\n"
 
 		"uint gen_bits(const in uint x, const in uint y, const in bool swap)\n"
 		"{\n"
-		"	return (swap)? bitfieldInsert(y, x, 8, 8) : bitfieldInsert(x, y, 8, 8);\n"
+		"	return (swap)? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8);\n"
 		"}\n\n"
 
 		"vec4 sext(const in ivec4 bits)\n"
@@ -337,8 +337,8 @@ namespace glsl
 		"	}\n"
 		"	else if (desc.type == VTX_FMT_FLOAT16)\n"
 		"	{\n"
-		"		tmp.x = bitfieldInsert(result.x, result.y, 16, 16);\n"
-		"		tmp.y = bitfieldInsert(result.z, result.w, 16, 16);\n"
+		"		tmp.x = _set_bits(result.x, result.y, 16, 16);\n"
+		"		tmp.y = _set_bits(result.z, result.w, 16, 16);\n"
 		"		ret.xy = unpackHalf2x16(tmp.x);\n"
 		"		ret.zw = unpackHalf2x16(tmp.y);\n"
 		"	}\n"
@@ -348,9 +348,9 @@ namespace glsl
 		"	}\n"
 		"	else //if (desc.type == VTX_FMT_COMP32)\n"
 		"	{\n"
-		"		result = uvec4(bitfieldExtract(result.x, 0, 11),\n"
-		"			bitfieldExtract(result.x, 11, 11),\n"
-		"			bitfieldExtract(result.x, 22, 10),\n"
+		"		result = uvec4(_get_bits(result.x, 0, 11),\n"
+		"			_get_bits(result.x, 11, 11),\n"
+		"			_get_bits(result.x, 22, 10),\n"
 		"			uint(scale.x));\n"
 		"		ret = sext(ivec4(result) << ivec4(5, 5, 6, 0));\n"
 		"	}\n\n"
@@ -395,14 +395,14 @@ namespace glsl
 
 		OS <<
 		"	attribute_desc result;\n"
-		"	result.stride = bitfieldExtract(attrib.x, 0, 8);\n"
-		"	result.frequency = bitfieldExtract(attrib.x, 8, 16);\n"
-		"	result.type = bitfieldExtract(attrib.x, 24, 3);\n"
-		"	result.attribute_size = bitfieldExtract(attrib.x, 27, 3);\n"
-		"	result.starting_offset = bitfieldExtract(attrib.y, 0, 29);\n"
-		"	result.swap_bytes = bitfieldExtract(attrib.y, 29, 1) != 0;\n"
-		"	result.is_volatile = bitfieldExtract(attrib.y, 30, 1) != 0;\n"
-		"	result.modulo = bitfieldExtract(attrib.y, 31, 1) != 0;\n"
+		"	result.stride = _get_bits(attrib.x, 0, 8);\n"
+		"	result.frequency = _get_bits(attrib.x, 8, 16);\n"
+		"	result.type = _get_bits(attrib.x, 24, 3);\n"
+		"	result.attribute_size = _get_bits(attrib.x, 27, 3);\n"
+		"	result.starting_offset = _get_bits(attrib.y, 0, 29);\n"
+		"	result.swap_bytes = _test_bit(attrib.y, 29);\n"
+		"	result.is_volatile = _test_bit(attrib.y, 30);\n"
+		"	result.modulo = _test_bit(attrib.y, 31);\n"
 		"	return result;\n"
 		"}\n\n"
 
@@ -434,14 +434,16 @@ namespace glsl
 	static void insert_rop_init(std::ostream& OS)
 	{
 		OS <<
-		"	if ((rop_control & (1u << 9)) != 0)\n"
+		"	if (_test_bit(rop_control, 9))\n"
 		"	{\n"
 		"		// Convert x,y to linear address\n"
-		"		uvec2 stipple_coord = uvec2(gl_FragCoord.xy) % uvec2(32u, 32u);\n"
-		"		uint address = stipple_coord.y * 32u + stipple_coord.x;\n"
-		"		uint mask = (1u << (address & 31u));\n\n"
+		"		const uvec2 stipple_coord = uvec2(gl_FragCoord.xy) % uvec2(32u, 32u);\n"
+		"		const uint address = stipple_coord.y * 32u + stipple_coord.x;\n" // integer multiply-add: GLSL fma() is only defined for float/double types
+		"		const int bit_offset = int(address & 31u);\n" // signed: bitfieldExtract() takes an int offset and uint does not convert implicitly
+		"		const uint word_index = _get_bits(address, 7, 3);\n"
+		"		const uint sub_index = _get_bits(address, 5, 2);\n\n"
 
-		"		if ((stipple_pattern[address >> 7u][(address >> 5u) & 3u] & mask) == 0u)\n"
+		"		if (!_test_bit(stipple_pattern[word_index][sub_index], bit_offset))\n" // discard only where the pattern bit is clear, matching the removed '== 0u' test
 		"		{\n"
 		"			_kill();\n"
 		"		}\n"
@@ -463,26 +465,26 @@ namespace glsl
 			"	{\n"
 			"		discard;\n"
 			"	}\n"
-			"	else if ((rop_control & 0xFFu) != 0)\n";
+			"	else if (_get_bits(rop_control, 0, 8) != 0)\n";
 		}
 		else
 		{
-			OS << "	if ((rop_control & 0xFFu) != 0)\n";
+			OS << "	if (_get_bits(rop_control, 0, 8) != 0)\n";
 		}
 
 		OS <<
 		"	{\n"
-		"		bool alpha_test = (rop_control & 0x1u) > 0;\n"
-		"		uint alpha_func = ((rop_control >> 16) & 0x7u);\n";
+		"		const bool alpha_test = _test_bit(rop_control, 0);\n"
+		"		const uint alpha_func = _get_bits(rop_control, 16, 3);\n";
 
 		if (!props.fp32_outputs)
 		{
-			OS << "		bool srgb_convert = (rop_control & 0x2u) > 0;\n\n";
+			OS << "		const bool srgb_convert = _test_bit(rop_control, 1);\n\n";
 		}
 
 		if (props.emulate_coverage_tests)
 		{
-			OS << "		bool a2c_enabled = (rop_control & 0x10u) > 0;\n";
+			OS << "		const bool a2c_enabled = _test_bit(rop_control, 4);\n";
 		}
 
 		OS <<
@@ -540,6 +542,9 @@ namespace glsl
 	{
 		OS << "#define _select mix\n";
 		OS << "#define _saturate(x) clamp(x, 0., 1.)\n";
+		OS << "#define _get_bits(x, off, count) bitfieldExtract(x, off, count)\n";
+		OS << "#define _set_bits(x, y, off, count) bitfieldInsert(x, y, off, count)\n";
+		OS << "#define _test_bit(x, y) (_get_bits(x, y, 1) != 0)\n";
 		OS << "#define _rand(seed) fract(sin(dot(seed.xy, vec2(12.9898f, 78.233f))) * 43758.5453f)\n\n";
 
 		if (props.domain == glsl::program_domain::glsl_fragment_program)
@@ -641,7 +646,7 @@ namespace glsl
 			OS <<
 			"bool coverage_test_passes(const in vec4 _sample, const in uint control)\n"
 			"{\n"
-			"	if ((control & 0x1u) == 0) return false;\n"
+			"	if (!_test_bit(control, 0)) return false;\n"
 			"\n"
 			"	float random  = _rand(gl_FragCoord);\n"
 			"	return (_sample.a > random);\n"
@@ -671,18 +676,18 @@ namespace glsl
 			"	if (depth_float == 0)\n"
 			"		value = uint(depth_value * 16777215.);\n"
 			"	else\n"
-			"		value = (floatBitsToUint(depth_value) >> 7) & 0xffffff;\n"
+			"		value = _get_bits(floatBitsToUint(depth_value), 7, 24);\n"
 			"\n"
-			"	uint b = (value & 0xff);\n"
-			"	uint g = (value >> 8) & 0xff;\n"
-			"	uint r = (value >> 16) & 0xff;\n"
+			"	uint b = _get_bits(value, 0, 8);\n"
+			"	uint g = _get_bits(value, 8, 8);\n"
+			"	uint r = _get_bits(value, 16, 8);\n"
 			"	return vec4(float(g)/255., float(b)/255., 1., float(r)/255.);\n"
 			"}\n\n"
 
 			"vec4 remap_vector(const in vec4 color, const in uint remap)\n"
 			"{\n"
 			"	vec4 result;\n"
-			"	if ((remap & 0xFF) == 0xE4)\n"
+			"	if (_get_bits(remap, 0, 8) == 0xE4)\n"
 			"	{\n"
 			"		result = color;\n"
 			"	}\n"
@@ -699,7 +704,7 @@ namespace glsl
 			"		result.b = color[remap_channel.b];\n"
 			"	}\n\n"
 
-			"	if ((remap >> 8) == 0xAA)\n"
+			"	if (_get_bits(remap, 8, 8) == 0xAA)\n"
 			"		return result;\n\n"
 
 			"	uvec4 remap_select = uvec4(remap) >> uvec4(10, 12, 14, 8);\n"
@@ -755,7 +760,7 @@ namespace glsl
 			"		return rgba;\n"
 			"	}\n"
 			"\n"
-			"	if ((control_bits & 0x10u) != 0)\n"
+			"	if (_test_bit(control_bits, 4))\n"
 			"	{\n"
 			"		// Alphakill\n"
 			"		if (rgba.a < 0.000001)\n"
@@ -765,7 +770,7 @@ namespace glsl
 			"		}\n"
 			"	}\n"
 			"\n"
-			"	if ((control_bits & 0x20u) != 0)\n"
+			"	if (_test_bit(control_bits, 5))\n"
 			"	{\n"
 			"		// Renormalize to 8-bit (PS3) accuracy\n"
 			"		rgba = floor(rgba * 255.);\n"
@@ -882,9 +887,9 @@ namespace glsl
 		case FUNCTION::FUNCTION_DPH:
 			return "$Ty(dot(vec4($0.xyz, 1.0), $1))";
 		case FUNCTION::FUNCTION_SFL:
-			return "$Ty(0., 0., 0., 0.)";
+			return "$Ty(0.)";
 		case FUNCTION::FUNCTION_STR:
-			return "$Ty(1., 1., 1., 1.)";
+			return "$Ty(1.)";
 		case FUNCTION::FUNCTION_FRACT:
 			return "fract($0)";
 		case FUNCTION::FUNCTION_REFL: