From 473139931a771a6b5d427d825eb2acd246606026 Mon Sep 17 00:00:00 2001
From: Georges Hatem <georges@hatem.ws>
Date: Wed, 17 Apr 2024 22:36:47 +0300
Subject: [PATCH] describe additional failed attempts

---
 entries/ghatem-fpc/README.md                | 27 ++++++++++++++++++---
 entries/ghatem-fpc/src/OneBRCproj.lpr       | 12 +++++++--
 entries/ghatem-fpc/src/baseline.console.pas |  3 +++
 entries/ghatem-fpc/src/onebrc.pas           |  6 ++---
 4 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/entries/ghatem-fpc/README.md b/entries/ghatem-fpc/README.md
index cc8340c..ff32bfe 100644
--- a/entries/ghatem-fpc/README.md
+++ b/entries/ghatem-fpc/README.md
@@ -132,8 +132,9 @@ Instead of extracting the station name as a string 1B times, and use it as a dic
 This requires us to migrate from TFPHashList to the generic TDictionary. Even though TDictionary is slower than TFPHashList, the overall improvements yielded a significant performance gain.
 Using TDictionary, the code is more similar to my Delphi version, in case we get to run those tests on a Windows PC.
 
-* expected timing: ~60 seconds, single-threaded*
-** ACTUAL TIMING: 58 seconds as per gcarreno **
+*expected timing: ~60 seconds, single-threaded*
+
+**ACTUAL TIMING: 58 seconds as per gcarreno**
 
 
 ## Multi-Threaded Attempt (2024-04-10)
@@ -184,7 +185,25 @@ Better wait and see the results on the real environment, before judging.
 
 ### results
 
-** ACTUAL TIMING: 6.042 seconds as per gcarreno **
+**ACTUAL TIMING: 6.042 seconds as per gcarreno**
 
 Due to the unexpectedly slow performance on Craig Chapman's powerful computer, and since the results above intrigued me, I have ported my FPC code onto Delphi to be able to compare the output of both compilers on Windows x64.
-Hopefully, it will help identify is the issue stems from Windows x64 or FPC, in multi-threaded implementations.
\ No newline at end of file
+Hopefully, it will help identify is the issue stems from Windows x64 or FPC, in multi-threaded implementations.
+
+## Multi-Threaded attempt v.2 (2024-04-16)
+
+On my Linux setup (FPC, no VM), I realize that as the number of cores increases, the performance improvement is far from linear:
+
+- 1 core: 77 seconds
+- 2 cores: 50 seconds
+- 4 cores: 35 seconds
+
+In order to identify the source of the problem, I ﬁrst forced all threads to write to un-protected shared-memory (the results are wrong, of course). The idea is to try to understand if the source of the problem stems from ~45k records being created for each thread, and if the retrieval of those records are causing too many cache misses.
+
+with 4 cores, we're now at ~21 seconds, which is much closer to linear performance. In order to make sure the threads are approximately getting a balanced load, each thread is initially assigned a range of data, and as soon as they are done with their given range, they request the next available range. I've tried with varying range sizes, but the result was always slower than just a plain `N / K` distribution.
+
+So the problem (on my computer at least) does not seem to be related to the load imbalance between threads, but rather having to swap all those records from the cache.
+
+As a last attempt, I tried again accumulating data in a shared memory, protecting all data accumulation with `InterlockedInc`, `InterlockedExchangeAdd`, and `TCriticalSection`. In order to avoid too many contentions on the critical section, I also tried to maintain a large array of critical sections, acquiring only the index for which we are accumulating data. All of these attempts under-performed on 4 threads, and likely will perform even worse as thread-count increases. The only way this would work is by having ﬁner-grained control over the locking, such that a thread would only be blocked if it tried to write into a record that is already locked.
+
+Lastly, the `TDictionary.TryGetValue` has shown to be quite costly, around `1/4th` of the total cost. And although it is currently so much better than when using the station name as key, evaluating the `mod` of all those hashes, there is a lot of collisions. So if the dictionary key-storage is implemented as an array, and `mod` is used to transform those `CRC32` into indexes ranging in `[0, 45k]`, those collisions will be the cause of slowness. If there is a way to reduce the number of collisions, then maybe a custom dictionary implementation might help.
diff --git a/entries/ghatem-fpc/src/OneBRCproj.lpr b/entries/ghatem-fpc/src/OneBRCproj.lpr
index 24616e1..6a2a0ff 100644
--- a/entries/ghatem-fpc/src/OneBRCproj.lpr
+++ b/entries/ghatem-fpc/src/OneBRCproj.lpr
@@ -17,6 +17,7 @@
   TOneBRCApp = class(TCustomApplication)
   private
     FFileName: string;
+    FThreadCount: Integer;
     procedure RunOneBRC;
   protected
     procedure DoRun; override;
@@ -34,7 +35,7 @@ procedure TOneBRCApp.RunOneBRC;
   vStart: Int64;
   vTime: Int64;
 begin
-  vOneBRC := TOneBRC.Create (32);
+  vOneBRC := TOneBRC.Create (FThreadCount);
   try
     try
       vOneBRC.mORMotMMF(FFileName);
@@ -89,13 +90,15 @@ procedure TOneBRCApp.DoRun;
   ErrorMsg: String;
 begin
   // quick check parameters
-  ErrorMsg:= CheckOptions(Format('%s%s%s:',[
+  ErrorMsg:= CheckOptions(Format('%s%s%s%s:',[
       cShortOptHelp,
+      cShortOptThread,
       cShortOptVersion,
       cShortOptInput
     ]),
     [
       cLongOptHelp,
+      cLongOptThread+':',
       cLongOptVersion,
       cLongOptInput+':'
     ]
@@ -120,6 +123,11 @@ procedure TOneBRCApp.DoRun;
     Exit;
   end;
 
+  FThreadCount := 32;
+  if HasOption(cShortOptThread, cLongOptThread) then begin
+    FThreadCount := StrToInt (GetOptionValue(cShortOptThread, cLongOptThread));
+  end;
+
   if HasOption(cShortOptInput, cLongOptInput) then begin
     FFileName := GetOptionValue(
       cShortOptInput,
diff --git a/entries/ghatem-fpc/src/baseline.console.pas b/entries/ghatem-fpc/src/baseline.console.pas
index 256a20a..bf961c0 100644
--- a/entries/ghatem-fpc/src/baseline.console.pas
+++ b/entries/ghatem-fpc/src/baseline.console.pas
@@ -19,6 +19,8 @@ interface
   cLongOptVersion        = 'version';
   cShortOptInput: Char   = 'i';
   cLongOptInput          = 'input-file';
+  cShortOptThread: Char  = 't';
+  cLongOptThread         = 'threads';
   {$ELSE}
   cOptionHelp:    array of string = ['-h', '--help'];
   cOptionVersion: array of string = ['-v', '--version'];
@@ -63,6 +65,7 @@ procedure WriteHelp;
   WriteLn('  -h|--help                      Writes this help message and exits');
   WriteLn('  -v|--version                   Writes the version and exits');
   WriteLn('  -i|--input-file <filename>     The file containing the Weather Stations');
+  WriteLn('  -t|--threads <threadcount>     The number of threads to be used, default 32');
 end;
 
 {$IFNDEF FPC}
diff --git a/entries/ghatem-fpc/src/onebrc.pas b/entries/ghatem-fpc/src/onebrc.pas
index 6e1b99f..3b439a5 100644
--- a/entries/ghatem-fpc/src/onebrc.pas
+++ b/entries/ghatem-fpc/src/onebrc.pas
@@ -135,14 +135,14 @@ procedure TOneBRC.ExtractLineData(const aStart: Int64; const aEnd: Int64; out aL
   aTemp :=     (Ord(FData[aEnd])   - c0ascii)
          + 10 *(Ord(FData[aEnd-2]) - c0ascii);
   vDigit := Ord(FData[aEnd-3]);
-  if (vDigit >= c0ascii) and (vDigit <= c9ascii) then begin
+  if vDigit >= c0ascii then begin
     aTemp := aTemp + 100*(Ord(FData[aEnd-3]) - c0ascii);
     vDigit := Ord(FData[aEnd-4]);
     if vDigit = cNegAscii then
-      aTemp := -1 * aTemp;
+      aTemp := -aTemp;
   end
   else if vDigit = cNegAscii then
-    aTemp := -1 * aTemp;
+    aTemp := -aTemp;
 end;
 
 //---------------------------------------------------