Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions entries/ghatem-fpc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,17 @@ So the problem (on my computer at least) does not seem to be related to the load
As a last attempt, I tried again accumulating data in a shared memory, protecting all data accumulation with `InterlockedInc`, `InterlockedExchangeAdd`, and `TCriticalSection`. In order to avoid too many contentions on the critical section, I also tried to maintain a large array of critical sections, acquiring only the index for which we are accumulating data. All of these attempts under-performed on 4 threads, and likely will perform even worse as thread-count increases. The only way this would work is by having finer-grained control over the locking, such that a thread would only be blocked if it tried to write into a record that is already locked.

Lastly, `TDictionary.TryGetValue` has shown to be quite costly, around `1/4th` of the total cost. And although using the hash as key is currently so much better than using the station name, evaluating the `mod` of all those hashes produces a lot of collisions. So if the dictionary key-storage is implemented as an array, and `mod` is used to transform those `CRC32` values into indexes ranging in `[0, 45k]`, those collisions will be the cause of slowness. If there is a way to reduce the number of collisions, then maybe a custom dictionary implementation might help.


## Multi-Threaded attempt v.3 (2024-04-21)

Using the performance profiler Valgrind, I identified that:
- 30% of the time was spent on `TryGetValue` of the generic `TDictionary`.
- 14% of the time was spent computing the crc32 hash
- 15% of the time was spent extracting the line data
- surprisingly, 9% of the time was spent looking for the #13 (carriage-return) character that delimits lines

I implemented my own Dictionary class consisting of two arrays. We compute the modulus of the incoming key (Cardinal) to fit it in the correct bucket. A first attempt at collision resolution was to store as values a TList, but performance was worse than the generic TDictionary. Next attempt was a linear probing, with circular indexing in case the index goes out of bounds. Performance improved from 35s to 30s. Will later try quadratic probing, as it apparently reduces clustering.

edit:
Quadratic probing improved performance even further. We could probably do better with two-level hashing, but finding such a hash function is going to take a lot of trials; this is probably an acceptable result.
107 changes: 103 additions & 4 deletions entries/ghatem-fpc/src/onebrc.pas
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ interface

function RoundExDouble(const ATemp: Double): Double; inline;

const
cDictSize: Integer = 45000;

type

// record is packed to minimize its size
Expand All @@ -24,6 +27,24 @@ function RoundExDouble(const ATemp: Double): Double; inline;
PStationData = ^TStationData;
TStationsDict = specialize TDictionary<Cardinal, PStationData>;

// Backing storage types for the custom open-addressing dictionary below.
TKeys = array of Cardinal;
TValues = array of PStationData;

{ TMyDictionary }

// Minimal fixed-size hash map from a Cardinal key (a CRC32 of the station
// name, per the surrounding code) to a PStationData, using open addressing
// with quadratic probing over cDictSize buckets.
// NOTE(review): a key value of 0 marks an empty bucket, so this assumes no
// real key ever hashes to 0 -- confirm upstream.
TMyDictionary = class
private
// FHashes[i] = 0 means bucket i is empty; otherwise it holds the stored key.
FHashes: TKeys;
// FData[i] is only meaningful when FHashes[i] <> 0.
FData : TValues;
// Shared probe routine used by both TryGetValue and Add: locates either
// the bucket holding aKey (aFound = True) or the bucket where it should
// be inserted (aFound = False).
procedure InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer); inline;
public
constructor Create;
// Raw access to the bucket arrays; consumers must skip empty slots
// (key = 0 / value = nil), as done in Merge and GenerateOutput.
property Keys: TKeys read FHashes;
property Values: TValues read FData;
function TryGetValue (const aKey: Cardinal; out aValue: PStationData): Boolean; inline;
procedure Add (const aKey: Cardinal; const aValue: PStationData); inline;
end;

{ TOneBRC }

TOneBRC = class
Expand All @@ -36,7 +57,7 @@ TOneBRC = class

FThreadCount: UInt16;
FThreads: array of TThread;
FStationsDicts: array of TStationsDict;
FStationsDicts: array of TMyDictionary;

procedure ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt); inline;

Expand Down Expand Up @@ -106,6 +127,78 @@ function Compare(AList: TStringList; AIndex1, AIndex2: Integer): Integer;
Result := CompareStr(Str1, Str2);
end;

{ TMyDictionary }

procedure TMyDictionary.InternalFind(const aKey: Cardinal; out aFound: Boolean; out aIndex: Integer);
// Locate aKey in the table using quadratic probing (step grows by 1 each
// probe). On exit:
//   aFound = True  -> aIndex is the bucket holding aKey;
//   aFound = False -> aIndex is the first empty bucket on the probe path,
//                     i.e. where Add should insert.
// NOTE(review): key 0 is indistinguishable from an empty bucket; a key whose
// CRC32 is 0 would be reported "found" with undefined data -- confirm
// upstream that 0 never occurs as a key.
var vIdx: Integer;
    vOffset: Integer;
    vProbes: Integer;
begin
  vIdx := aKey mod cDictSize;
  vOffset := 1;
  vProbes := 0;

  while True do begin
    if FHashes[vIdx] = aKey then begin
      // found match
      aIndex := vIdx;
      aFound := True;
      exit;
    end
    else if FHashes[vIdx] = 0 then begin
      // found empty bucket to use.
      // (bugfix: this check now also applies to the home bucket; previously
      // an empty home slot was skipped and insertions always landed at
      // probe offset >= 1, wasting slots and lengthening probe chains)
      aIndex := vIdx;
      aFound := False;
      exit;
    end;

    // quadratic probing, by incrementing vOffset
    Inc (vIdx, vOffset);
    Inc (vOffset);

    // exceeded boundary, loop back.
    // (bugfix: use mod instead of a single Dec -- once vOffset exceeds
    // cDictSize a single subtraction no longer brings vIdx back in range)
    if vIdx >= cDictSize then
      vIdx := vIdx mod cDictSize;

    // guard against an endless probe loop when the table is full
    // and the key is absent (previously this spun forever)
    Inc (vProbes);
    if vProbes > cDictSize then
      raise Exception.Create ('TMyDict: table is full');
  end;
end;

constructor TMyDictionary.Create;
// Allocate the bucket arrays. SetLength zero-initializes dynamic arrays,
// so every FHashes slot starts at 0 ("empty") and every FData slot at nil --
// the sentinels the rest of the class relies on.
begin
  inherited Create;  // idiom: always chain to the parent constructor
  SetLength (FHashes, cDictSize);
  SetLength (FData, cDictSize);
end;

function TMyDictionary.TryGetValue(const aKey: Cardinal; out aValue: PStationData): Boolean;
// Look up aKey: returns True and the stored pointer on a hit,
// False and nil on a miss.
var
  vFound: Boolean;
  vSlot: Integer;
begin
  InternalFind (aKey, vFound, vSlot);
  Result := vFound;
  if vFound then
    aValue := FData[vSlot]
  else
    aValue := nil;
end;

procedure TMyDictionary.Add(const aKey: Cardinal; const aValue: PStationData);
// Insert a new key/value pair; inserting an existing key is a usage error.
var
  vSlot: Integer;
  vExists: Boolean;
begin
  InternalFind (aKey, vExists, vSlot);

  // guard clause: duplicates are rejected up front
  if vExists then
    raise Exception.Create ('TMyDict: cannot add, duplicate key');

  FHashes[vSlot] := aKey;
  FData[vSlot] := aValue;
end;

procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aLength: ShortInt; out aTemp: SmallInt);
// given a line of data, extract the length of station name, and temperature as Integer.
var
Expand Down Expand Up @@ -155,8 +248,8 @@ constructor TOneBRC.Create (const aThreadCount: UInt16);
SetLength (FThreads, aThreadCount);

for I := 0 to aThreadCount - 1 do begin
FStationsDicts[I] := TStationsDict.Create;
FStationsDicts[I].Capacity := 45000;
FStationsDicts[I] := TMyDictionary.Create;
//FStationsDicts[I].Capacity := 45000;
end;
end;

Expand Down Expand Up @@ -280,6 +373,10 @@ procedure TOneBRC.Merge(aLeft: UInt16; aRight: UInt16);
vDataL: PStationData;
begin
for iHash in FStationsDicts[aRight].Keys do begin
// zero means empty slot: skip
if iHash = 0 then
continue;

FStationsDicts[aRight].TryGetValue(iHash, vDataR);

if FStationsDicts[aLeft].TryGetValue(iHash, vDataL) then begin
Expand Down Expand Up @@ -322,7 +419,9 @@ procedure TOneBRC.GenerateOutput;
try
vStations.BeginUpdate;
for vData in FStationsDicts[0].Values do begin
vStations.Add(vData^.Name);
// nil value means empty slot: skip
if vData <> nil then
vStations.Add(vData^.Name);
end;
vStations.EndUpdate;

Expand Down