diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md index ff32bfe..c170935 100644 --- a/entries/ghatem-fpc/README.md +++ b/entries/ghatem-fpc/README.md @@ -207,3 +207,17 @@ So the problem (on my computer at least) does not seem to be related to the load As a last attempt, I tried again accumulating data in a shared memory, protecting all data accumulation with `InterlockedInc`, `InterlockedExchangeAdd`, and `TCriticalSection`. In order to avoid too many contentions on the critical section, I also tried to maintain a large array of critical sections, acquiring only the index for which we are accumulating data. All of these attempts under-performed on 4 threads, and likely will perform even worse as thread-count increases. The only way this would work is by having finer-grained control over the locking, such that a thread would only be blocked if it tried to write into a record that is already locked. Lastly, the `TDictionary.TryGetValue` has shown to be quite costly, around `1/4th` of the total cost. And although it is currently so much better than when using the station name as key, evaluating the `mod` of all those hashes, there is a lot of collisions. So if the dictionary key-storage is implemented as an array, and `mod` is used to transform those `CRC32` into indexes ranging in `[0, 45k]`, those collisions will be the cause of slowness. If there is a way to reduce the number of collisions, then maybe a custom dictionary implementation might help. + + +## Multi-Threaded attempt v.3 (2024-04-21) + +Using performance profiler ValGrind, it identified that: + - 30% of the time was spent on `TryGetValue` of the generic `TDictionary`. + - 14% of the time is on computing the crc32 hash + - 15% of the time on extracting the line data + - surprisingly, 9% of the time is spent on looking for the #13 (new-line) character + +I implemented my own Dictionary class consisting of two arrays. We compute the modulus of the incoming key (Cardinal) to fit it in the correct bucket. A first attempt at collision resolution was to store as values a TList, but performance was worse than the generic TDictionary. Next attempt was a linear probing, with circular indexing in case the index goes out of bounds. Performance improved from 35s to 30s. Will later try quadratic probing, as it apparently reduces clustering. + +edit: +quadratic probing improved performance even further. we could probably do better with 2-level hashing, but finding such a hash function is going to take a lot of trials, this is probably acceptable results diff --git a/entries/ghatem-fpc/src/onebrc.pas b/entries/ghatem-fpc/src/onebrc.pas index 1aa61db..77aca6f 100644 --- a/entries/ghatem-fpc/src/onebrc.pas +++ b/entries/ghatem-fpc/src/onebrc.pas @@ -10,6 +10,9 @@ interface function RoundExDouble(const ATemp: Double): Double; inline; +const + cDictSize: Integer = 45000; + type // record is packed to minimize its size @@ -24,6 +27,24 @@ function RoundExDouble(const ATemp: Double): Double; inline; PStationData = ^TStationData; TStationsDict = specialize TDictionary; + TKeys = array of Cardinal; + TValues = array of PStationData; + + { TMyDictionary } + + TMyDictionary = class + private + FHashes: TKeys; + FData : TValues; + procedure InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); inline; + public + constructor Create; + property Keys: TKeys read FHashes; + property Values: TValues read FData; + function TryGetValue (const aKey: Cardinal; out aValue: PStationData): Boolean; inline; + procedure Add (const aKey: Cardinal; const aValue: PStationData); inline; + end; + { TOneBRC } TOneBRC = class @@ -36,7 +57,7 @@ TOneBRC = class FThreadCount: UInt16; FThreads: array of TThread; - FStationsDicts: array of TStationsDict; + FStationsDicts: array of TMyDictionary; procedure ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); inline; @@ -106,6 +127,78 @@ function Compare(AList: TStringList; AIndex1, AIndex2: Integer): Integer; Result := CompareStr(Str1, Str2); end; +{ TMyDictionary } + +procedure TMyDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); +var vIdx: Integer; + vOffset: Integer; +begin + vIdx := aKey mod cDictSize; + aFound := False; + + if FHashes[vIdx] = aKey then begin + aIndex := vIdx; + aFound := True; + end + else begin + vOffset := 1; + while True do begin + // quadratic probing, by incrementing vOffset + Inc (vIdx, vOffset); + Inc (vOffset); + + // exceeded boundary, loop back + if vIdx >= cDictSize then + Dec (vIdx, cDictSize); + + if FHashes[vIdx] = aKey then begin + // found match + aIndex := vIdx; + aFound := True; + break; + end + else if FHashes[vIdx] = 0 then begin + // found empty bucket to use + aIndex := vIdx; + aFound := False; + break; + end; + end; + end; +end; + +constructor TMyDictionary.Create; +begin + SetLength (FHashes, cDictSize); + SetLength (FData, cDictSize); +end; + +function TMyDictionary.TryGetValue(const aKey: Cardinal; out aValue: PStationData): Boolean; +var + vIdx: Integer; +begin + InternalFind (aKey, Result, vIdx); + + if Result then + aValue := FData[vIdx] + else + aValue := nil; +end; + +procedure TMyDictionary.Add(const aKey: Cardinal; const aValue: PStationData); +var + vIdx: Integer; + vFound: Boolean; +begin + InternalFind (aKey, vFound, vIdx); + if not vFound then begin + FHashes[vIdx] := aKey; + FData[vIdx] := aValue; + end + else + raise Exception.Create ('TMyDict: cannot add, duplicate key'); +end; + procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); // given a line of data, extract the length of station name, and temperature as Integer. var @@ -155,8 +248,8 @@ constructor TOneBRC.Create (const aThreadCount: UInt16); SetLength (FThreads, aThreadCount); for I := 0 to aThreadCount - 1 do begin - FStationsDicts[I] := TStationsDict.Create; - FStationsDicts[I].Capacity := 45000; + FStationsDicts[I] := TMyDictionary.Create; + //FStationsDicts[I].Capacity := 45000; end; end; @@ -280,6 +373,10 @@ procedure TOneBRC.Merge(aLeft: UInt16; aRight: UInt16); vDataL: PStationData; begin for iHash in FStationsDicts[aRight].Keys do begin + // zero means empty slot: skip + if iHash = 0 then + continue; + FStationsDicts[aRight].TryGetValue(iHash, vDataR); if FStationsDicts[aLeft].TryGetValue(iHash, vDataL) then begin @@ -322,7 +419,9 @@ procedure TOneBRC.GenerateOutput; try vStations.BeginUpdate; for vData in FStationsDicts[0].Values do begin - vStations.Add(vData^.Name); + // nil value means empty slot: skip + if vData <> nil then + vStations.Add(vData^.Name); end; vStations.EndUpdate;