From b968ea50f0d797eccc2eef8533dc86732d186eb1 Mon Sep 17 00:00:00 2001 From: Georges Hatem Date: Fri, 3 May 2024 23:19:42 +0300 Subject: [PATCH 1/3] replace FormatFloat with own implementation, speeds up the single-threaded part by ~2% --- .../ghatem-fpc/src/OneBRC-nosharedname.lpr | 33 +++++++++++++------ entries/ghatem-fpc/src/OneBRC-smallrec.lpr | 33 +++++++++++++------ entries/ghatem-fpc/src/OneBRC.lpr | 33 +++++++++++++------ 3 files changed, 69 insertions(+), 30 deletions(-) diff --git a/entries/ghatem-fpc/src/OneBRC-nosharedname.lpr b/entries/ghatem-fpc/src/OneBRC-nosharedname.lpr index ef9301d..3d3ce50 100644 --- a/entries/ghatem-fpc/src/OneBRC-nosharedname.lpr +++ b/entries/ghatem-fpc/src/OneBRC-nosharedname.lpr @@ -151,12 +151,12 @@ function Ceiling (const ANumber: Double): Integer; inline; Result := Trunc(ANumber) + Ord(Frac(ANumber) > 0); end; -function RoundExDouble (const aTemp: Double): Double; inline; +function RoundExInteger (const aTemp: Double): Integer; inline; var vTmp: Double; begin vTmp := aTemp * 10; - Result := Ceiling (vTmp) / 10; + Result := Ceiling (vTmp); end; //--------------------------------------------------- @@ -473,8 +473,24 @@ procedure TOneBRC.MergeAll; //--------------------------------------------------- +function MyFormatInt (const aIn: SmallInt): AnsiString; inline; +begin + Result := IntToStr(aIn); + Insert ('.', Result, Length(Result)); + + if Result[1] = '.' then begin + Insert ('0', Result, 1); + exit; + end; + + if (Result[1] = '-') and (Result[2] = '.') then + Insert('0', Result, 2); +end; + +//--------------------------------------------------- + procedure TOneBRC.GenerateOutput; -var vMin, vMean, vMax: Double; +var vMean: Integer; vStream: TStringStream; I, N: Int64; vData: PStationData; @@ -507,14 +523,12 @@ procedure TOneBRC.GenerateOutput; // debatable, and the whole output generation is < 0.3 seconds, so not exactly worth it vHash := crc32c(0, @vStations[i][1], Length (vStations[i])); FStationsDicts[0].TryGetValue(vHash, vData); - vMin := vData^.Min/10; - vMax := vData^.Max/10; - vMean := RoundExDouble(vData^.Sum/vData^.Count/10); + vMean := RoundExInteger(vData^.Sum/vData^.Count/10); vStream.WriteString( - vStations[i] + '=' + FormatFloat('0.0', vMin) - + '/' + FormatFloat('0.0', vMean) - + '/' + FormatFloat('0.0', vMax) + ', ' + vStations[i] + '=' + MyFormatInt(vData^.Min) + + '/' + MyFormatInt(vMean) + + '/' + MyFormatInt(vData^.Max) + ', ' ); Inc(I); end; @@ -645,4 +659,3 @@ procedure TOneBRCApp.WriteHelp; end. {$ENDREGION} - diff --git a/entries/ghatem-fpc/src/OneBRC-smallrec.lpr b/entries/ghatem-fpc/src/OneBRC-smallrec.lpr index f46b861..e4624e4 100644 --- a/entries/ghatem-fpc/src/OneBRC-smallrec.lpr +++ b/entries/ghatem-fpc/src/OneBRC-smallrec.lpr @@ -154,12 +154,12 @@ function Ceiling (const ANumber: Double): Integer; inline; Result := Trunc(ANumber) + Ord(Frac(ANumber) > 0); end; -function RoundExDouble (const aTemp: Double): Double; inline; +function RoundExInteger (const aTemp: Double): Integer; inline; var vTmp: Double; begin vTmp := aTemp * 10; - Result := Ceiling (vTmp) / 10; + Result := Ceiling (vTmp); end; //--------------------------------------------------- @@ -487,8 +487,24 @@ procedure TOneBRC.MergeAll; //--------------------------------------------------- +function MyFormatInt (const aIn: SmallInt): AnsiString; inline; +begin + Result := IntToStr(aIn); + Insert ('.', Result, Length(Result)); + + if Result[1] = '.' then begin + Insert ('0', Result, 1); + exit; + end; + + if (Result[1] = '-') and (Result[2] = '.') then + Insert('0', Result, 2); +end; + +//--------------------------------------------------- + procedure TOneBRC.GenerateOutput; -var vMin, vMean, vMax: Double; +var vMean: Integer; vStream: TStringStream; I, N: Int64; vData: PStationData; @@ -521,14 +537,12 @@ procedure TOneBRC.GenerateOutput; // debatable, and the whole output generation is < 0.3 seconds, so not exactly worth it vHash := crc32c(0, @vStations[i][1], Length (vStations[i])); FStationsDicts[0].TryGetValue(vHash, vData); - vMin := vData^.Min/10; - vMax := vData^.Max/10; - vMean := RoundExDouble(vData^.Sum/vData^.Count/10); + vMean := RoundExInteger(vData^.Sum/vData^.Count/10); vStream.WriteString( - vStations[i] + '=' + FormatFloat('0.0', vMin) - + '/' + FormatFloat('0.0', vMean) - + '/' + FormatFloat('0.0', vMax) + ', ' + vStations[i] + '=' + MyFormatInt(vData^.Min) + + '/' + MyFormatInt(vMean) + + '/' + MyFormatInt(vData^.Max) + ', ' ); Inc(I); end; @@ -659,4 +673,3 @@ procedure TOneBRCApp.WriteHelp; end. {$ENDREGION} - diff --git a/entries/ghatem-fpc/src/OneBRC.lpr b/entries/ghatem-fpc/src/OneBRC.lpr index 09849d1..fc4cdf0 100644 --- a/entries/ghatem-fpc/src/OneBRC.lpr +++ b/entries/ghatem-fpc/src/OneBRC.lpr @@ -154,12 +154,12 @@ function Ceiling (const ANumber: Double): Integer; inline; Result := Trunc(ANumber) + Ord(Frac(ANumber) > 0); end; -function RoundExDouble (const aTemp: Double): Double; inline; +function RoundExInteger (const aTemp: Double): Integer; inline; var vTmp: Double; begin vTmp := aTemp * 10; - Result := Ceiling (vTmp) / 10; + Result := Ceiling (vTmp); end; //--------------------------------------------------- @@ -487,8 +487,24 @@ procedure TOneBRC.MergeAll; //--------------------------------------------------- +function MyFormatInt (const aIn: SmallInt): AnsiString; inline; +begin + Result := IntToStr(aIn); + Insert ('.', Result, Length(Result)); + + if Result[1] = '.' then begin + Insert ('0', Result, 1); + exit; + end; + + if (Result[1] = '-') and (Result[2] = '.') then + Insert('0', Result, 2); +end; + +//--------------------------------------------------- + procedure TOneBRC.GenerateOutput; -var vMin, vMean, vMax: Double; +var vMean: Integer; vStream: TStringStream; I, N: Int64; vData: PStationData; @@ -521,14 +537,12 @@ procedure TOneBRC.GenerateOutput; // debatable, and the whole output generation is < 0.3 seconds, so not exactly worth it vHash := crc32c(0, @vStations[i][1], Length (vStations[i])); FStationsDicts[0].TryGetValue(vHash, vData); - vMin := vData^.Min/10; - vMax := vData^.Max/10; - vMean := RoundExDouble(vData^.Sum/vData^.Count/10); + vMean := RoundExInteger(vData^.Sum/vData^.Count/10); vStream.WriteString( - vStations[i] + '=' + FormatFloat('0.0', vMin) - + '/' + FormatFloat('0.0', vMean) - + '/' + FormatFloat('0.0', vMax) + ', ' + vStations[i] + '=' + MyFormatInt(vData^.Min) + + '/' + MyFormatInt(vMean) + + '/' + MyFormatInt(vData^.Max) + ', ' ); Inc(I); end; @@ -659,4 +673,3 @@ procedure TOneBRCApp.WriteHelp; end. {$ENDREGION} - From 57a626fde07a74fb6b948b1dec629c038ac38ca3 Mon Sep 17 00:00:00 2001 From: Georges Hatem Date: Sat, 4 May 2024 10:11:02 +0300 Subject: [PATCH 2/3] improve readme --- entries/ghatem-fpc/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md index c526bd5..09da894 100644 --- a/entries/ghatem-fpc/README.md +++ b/entries/ghatem-fpc/README.md @@ -247,3 +247,17 @@ Another trial with various hash functions, a simple modulus vs. a slightly more Can be tested with the HASHMULT build option Finally, it seems choosing a dictionary size that is a prime number is also recommended: shaves 1 second out of 20 on my PC. + +## v.6 (2024-05-04) + +As of the latest results executed by Paweld, there are two main bottlenecks throttling the entire implementation, according to CallGrind and KCacheGrind: + - function ExtractLineData, 23% of total cost, of which 9% is due to `fpc_stackcheck` + - the hash lookup function, at 40% of total cost + +Currently, the hash lookup is done on an array of records. Increasing the array size causes slowness, and reducing it causes further collisions. +Will try to see how to reduce collisions (increase array size), all while minimizing the cost of cache misses. + +For the ExtractLineData, three ideas to try implementing: + - avoid using a function, to get rid of the cost of stack checking + - reduce branching + - unroll the loop (although I had tried this in the past, did not show any improvements) From 47c9b939dcadba0d877eb71b7f9dec3128a9131a Mon Sep 17 00:00:00 2001 From: Georges Hatem Date: Sat, 4 May 2024 15:16:22 +0300 Subject: [PATCH 3/3] improve readme to document some ideas --- entries/ghatem-fpc/README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md index 09da894..cdce45f 100644 --- a/entries/ghatem-fpc/README.md +++ b/entries/ghatem-fpc/README.md @@ -257,7 +257,21 @@ As of the latest results executed by Paweld, there are two main bottlenecks thro Currently, the hash lookup is done on an array of records. Increasing the array size causes slowness, and reducing it causes further collisions. Will try to see how to reduce collisions (increase array size), all while minimizing the cost of cache misses. +Edit: +The goal is to both: + - minimize collisions on the hashes (keys) by having a good hash function, but also increase the size of the keys storage + - minimize the size of the array of packed records + +The idea: + - the dictionary will no longer point to a PStationData pointer, but rather to an index between 0 and StationCount, where the record is stored in the array. + - -> data about the same station will be stored at the same index for all threads' data-arrays + - -> names will also be stored at that same index upon first encounter, and is common to all threads + - no locking needs to occur when the key is already found, since there is no multiple-write occurring + - the data-arrays are pre-allocated, and a atomic-counter will be incremented to know where the next element will be stored. + +Thinking again, this is likely similar to the approach mentioned by @synopse in one of his comments. + For the ExtractLineData, three ideas to try implementing: - avoid using a function, to get rid of the cost of stack checking - - reduce branching + - reduce branching, I think it should be possible to go from 3 if-statements, to only 1 - unroll the loop (although I had tried this in the past, did not show any improvements)