From 760539e8cc570beaca3550a31baf4a0ac7feb8c4 Mon Sep 17 00:00:00 2001 From: Georges Hatem Date: Sun, 21 Apr 2024 21:08:46 +0300 Subject: [PATCH 1/4] own dictionary implementation, apparently has better collision resolution. 35s -> 30s --- entries/ghatem-fpc/src/onebrc.pas | 94 +++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/entries/ghatem-fpc/src/onebrc.pas b/entries/ghatem-fpc/src/onebrc.pas index 1aa61db..e8521fe 100644 --- a/entries/ghatem-fpc/src/onebrc.pas +++ b/entries/ghatem-fpc/src/onebrc.pas @@ -10,6 +10,9 @@ interface function RoundExDouble(const ATemp: Double): Double; inline; +const + cDictSize: Integer = 45000; + type // record is packed to minimize its size @@ -24,6 +27,24 @@ function RoundExDouble(const ATemp: Double): Double; inline; PStationData = ^TStationData; TStationsDict = specialize TDictionary; + TKeys = array of Cardinal; + TValues = array of PStationData; + + { TMyDictionary } + + TMyDictionary = class + private + FHashes: TKeys; + FData : TValues; + procedure InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); inline; + public + constructor Create; + property Keys: TKeys read FHashes; + property Values: TValues read FData; + function TryGetValue (const aKey: Cardinal; out aValue: PStationData): Boolean; inline; + procedure Add (const aKey: Cardinal; const aValue: PStationData); inline; + end; + { TOneBRC } TOneBRC = class @@ -36,7 +57,7 @@ TOneBRC = class FThreadCount: UInt16; FThreads: array of TThread; - FStationsDicts: array of TStationsDict; + FStationsDicts: array of TMyDictionary; procedure ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); inline; @@ -106,6 +127,69 @@ function Compare(AList: TStringList; AIndex1, AIndex2: Integer): Integer; Result := CompareStr(Str1, Str2); end; +{ TMyDictionary } + +procedure TMyDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); +var vIdx: Integer; +begin + vIdx := aKey mod cDictSize; + aFound := False; + + if FHashes[vIdx] = aKey then begin + aIndex := vIdx; + aFound := True; + end + else begin + while True do begin + Inc (vIdx); + if vIdx >= cDictSize then + Dec (vIdx, cDictSize); + if FHashes[vIdx] = aKey then begin + aIndex := vIdx; + aFound := True; + break; + end + else if FHashes[vIdx] = 0 then begin + aIndex := vIdx; + aFound := False; + break; + end; + end; + end; +end; + +constructor TMyDictionary.Create; +begin + SetLength (FHashes, cDictSize); + SetLength (FData, cDictSize); +end; + +function TMyDictionary.TryGetValue(const aKey: Cardinal; out aValue: PStationData): Boolean; +var + vIdx: Integer; +begin + InternalFind (aKey, Result, vIdx); + + if Result then + aValue := FData[vIdx] + else + aValue := nil; +end; + +procedure TMyDictionary.Add(const aKey: Cardinal; const aValue: PStationData); +var + vIdx: Integer; + vFound: Boolean; +begin + InternalFind (aKey, vFound, vIdx); + if not vFound then begin + FHashes[vIdx] := aKey; + FData[vIdx] := aValue; + end + else + raise Exception.Create ('TMyDict: cannot add, duplicate key'); +end; + procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); // given a line of data, extract the length of station name, and temperature as Integer. var @@ -155,8 +239,8 @@ constructor TOneBRC.Create (const aThreadCount: UInt16); SetLength (FThreads, aThreadCount); for I := 0 to aThreadCount - 1 do begin - FStationsDicts[I] := TStationsDict.Create; - FStationsDicts[I].Capacity := 45000; + FStationsDicts[I] := TMyDictionary.Create; + //FStationsDicts[I].Capacity := 45000; end; end; @@ -280,6 +364,7 @@ procedure TOneBRC.Merge(aLeft: UInt16; aRight: UInt16); vDataL: PStationData; begin for iHash in FStationsDicts[aRight].Keys do begin + if iHash = 0 then continue; FStationsDicts[aRight].TryGetValue(iHash, vDataR); if FStationsDicts[aLeft].TryGetValue(iHash, vDataL) then begin @@ -322,7 +407,8 @@ procedure TOneBRC.GenerateOutput; try vStations.BeginUpdate; for vData in FStationsDicts[0].Values do begin - vStations.Add(vData^.Name); + if vData <> nil then + vStations.Add(vData^.Name); end; vStations.EndUpdate; From 2c2c277003871d99353e507a9848a8c08350f512 Mon Sep 17 00:00:00 2001 From: Georges Hatem Date: Sun, 21 Apr 2024 21:24:54 +0300 Subject: [PATCH 2/4] added documentation --- entries/ghatem-fpc/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md index ff32bfe..d69f3c7 100644 --- a/entries/ghatem-fpc/README.md +++ b/entries/ghatem-fpc/README.md @@ -207,3 +207,14 @@ So the problem (on my computer at least) does not seem to be related to the load As a last attempt, I tried again accumulating data in a shared memory, protecting all data accumulation with `InterlockedInc`, `InterlockedExchangeAdd`, and `TCriticalSection`. In order to avoid too many contentions on the critical section, I also tried to maintain a large array of critical sections, acquiring only the index for which we are accumulating data. All of these attempts under-performed on 4 threads, and likely will perform even worse as thread-count increases. The only way this would work is by having finer-grained control over the locking, such that a thread would only be blocked if it tried to write into a record that is already locked. Lastly, the `TDictionary.TryGetValue` has shown to be quite costly, around `1/4th` of the total cost. And although it is currently so much better than when using the station name as key, evaluating the `mod` of all those hashes, there is a lot of collisions. So if the dictionary key-storage is implemented as an array, and `mod` is used to transform those `CRC32` into indexes ranging in `[0, 45k]`, those collisions will be the cause of slowness. If there is a way to reduce the number of collisions, then maybe a custom dictionary implementation might help. + + +## Multi-Threaded attempt v.3 (2024-04-21) + +Using performance profiler ValGrind, it identified that: + - 30% of the time was spent on `TryGetValue` of the generic `TDictionary`. + - 14% of the time is on computing the crc32 hash + - 15% of the time on extracting the line data + - surprisingly, 9% of the time is spent on looking for the #13 (new-line) character + +I implemented my own Dictionary class consisting of two arrays. We compute the modulus of the incoming key (Cardinal) to fit it in the correct bucket. A first attempt at collision resolution was to store as values a TList, but performance was worse than the generic TDictionary. Next attempt was a linear probing, with circular indexing in case the index goes out of bounds. Performance improved from 35s to 30s. Will later try quadratic probing, as it apparently reduces clustering. From 290fd9160f623187f8e7525ea750d48679ce4acb Mon Sep 17 00:00:00 2001 From: Georges Hatem Date: Sun, 21 Apr 2024 21:42:43 +0300 Subject: [PATCH 3/4] implement quadratic probing, add comments --- entries/ghatem-fpc/src/onebrc.pas | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/entries/ghatem-fpc/src/onebrc.pas b/entries/ghatem-fpc/src/onebrc.pas index e8521fe..77aca6f 100644 --- a/entries/ghatem-fpc/src/onebrc.pas +++ b/entries/ghatem-fpc/src/onebrc.pas @@ -131,6 +131,7 @@ function Compare(AList: TStringList; AIndex1, AIndex2: Integer): Integer; procedure TMyDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); var vIdx: Integer; + vOffset: Integer; begin vIdx := aKey mod cDictSize; aFound := False; @@ -140,16 +141,24 @@ procedure TMyDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean; aFound := True; end else begin + vOffset := 1; while True do begin - Inc (vIdx); + // quadratic probing, by incrementing vOffset + Inc (vIdx, vOffset); + Inc (vOffset); + + // exceeded boundary, loop back if vIdx >= cDictSize then Dec (vIdx, cDictSize); + if FHashes[vIdx] = aKey then begin + // found match aIndex := vIdx; aFound := True; break; end else if FHashes[vIdx] = 0 then begin + // found empty bucket to use aIndex := vIdx; aFound := False; break; @@ -364,7 +373,10 @@ procedure TOneBRC.Merge(aLeft: UInt16; aRight: UInt16); vDataL: PStationData; begin for iHash in FStationsDicts[aRight].Keys do begin - if iHash = 0 then continue; + // zero means empty slot: skip + if iHash = 0 then + continue; + FStationsDicts[aRight].TryGetValue(iHash, vDataR); if FStationsDicts[aLeft].TryGetValue(iHash, vDataL) then begin @@ -407,6 +419,7 @@ procedure TOneBRC.GenerateOutput; try vStations.BeginUpdate; for vData in FStationsDicts[0].Values do begin + // nil value means empty slot: skip if vData <> nil then vStations.Add(vData^.Name); end; From a8a3cab48275a54d0c46f5a8583c24ee251e9c75 Mon Sep 17 00:00:00 2001 From: Georges Hatem Date: Sun, 21 Apr 2024 21:44:49 +0300 Subject: [PATCH 4/4] readme details --- entries/ghatem-fpc/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md index d69f3c7..c170935 100644 --- a/entries/ghatem-fpc/README.md +++ b/entries/ghatem-fpc/README.md @@ -218,3 +218,6 @@ Using performance profiler ValGrind, it identified that: - surprisingly, 9% of the time is spent on looking for the #13 (new-line) character I implemented my own Dictionary class consisting of two arrays. We compute the modulus of the incoming key (Cardinal) to fit it in the correct bucket. A first attempt at collision resolution was to store as values a TList, but performance was worse than the generic TDictionary. Next attempt was a linear probing, with circular indexing in case the index goes out of bounds. Performance improved from 35s to 30s. Will later try quadratic probing, as it apparently reduces clustering. + +edit: +quadratic probing improved performance even further. we could probably do better with 2-level hashing, but finding such a hash function is going to take a lot of trials, this is probably acceptable results