diff --git a/entries/hgrosser/README.md b/entries/hgrosser/README.md index 4fa6164..70e8d83 100644 --- a/entries/hgrosser/README.md +++ b/entries/hgrosser/README.md @@ -3,24 +3,26 @@ **1 billion row Challenge entry** ## Version -Version 1.51 +Version 1.60 ## How to compile The program was developed with FPC 3.2.2 and Lazarus 2.2.4 +There is a new Conditional "noCR" which is necessary if the input file has no CRs (only LFs) + ## How to start ``` -Usage: -Example: hgrosser measurements.txt 15 +Usage: +Example: hgrosser measurements.txt 16 - bit-width for hash-list: sets the size of the hash list, e.g. '16' => 65536 entries ``` -There are no switches like `-i` etc, only values. +There are no switches like `-i` etc, only 2 values. ### Optimizing the 2nd command line parameter -In theory the program should run faster with greater bit-widths for the hash-list (because of less collisions), but on my own computer (8 GB RAM) in praxis a small value of 15 is the fastest way, allthough this causes many collisions. +In theory the program should run faster with greater bit-widths for the hash-list (because of fewer collisions), but on my own computer (8 GB RAM) in practice a small value of 16 is the fastest, although this causes many collisions. -Please (if possible) try all values from 14 to 24 (maybe in a for-loop). Thanks a lot. +Please (if possible) try all values from 16 to 22 (maybe in a for-loop). Thanks a lot. ## How the program works The Program works with 1 thread. 
@@ -37,3 +39,4 @@ To speed things up: - Version 1.00: initial version - Version 1.50: hash-list optimized, small improvements in parsing the file - Version 1.51: small improvements in asm function +- Version 1.60: hash-list optimized, some minor improvements, Conditional "noCR" added diff --git a/entries/hgrosser/src/1brc.pas b/entries/hgrosser/src/1brc.pas index 7eb5c5c..80bc074 100644 --- a/entries/hgrosser/src/1brc.pas +++ b/entries/hgrosser/src/1brc.pas @@ -4,6 +4,7 @@ { $DEFINE XTEST} {please keep this UNDEFINED / compile a safe or a fast program} { $DEFINE XINLINE} {please keep this UNDEFINED / use INLINE for routines or not} +{ $DEFINE noCR} {neccessary for input-files without CR's (only LF's)} {$IFDEF XTEST} {$R+} {$Q+} {$S+} {slow but safe} @@ -20,14 +21,14 @@ sysutils, strutils, math; const - M_version = '1.51'; {version number} + M_version = '1.60'; {version number} {------------------------------ Common routines: ------------------------------} procedure debug(s: ansistring); - {outputs a debug message} +{outputs a debug message} begin - // writeln(s); +// writeln(s); end; type @@ -105,10 +106,10 @@ procedure error_halt(s: ansistring); {$ASMMODE INTEL} -function FPHashX_ASM(var buf; len: dword): dword; assembler; nostackframe; - {returns a hash code similar to function FPHash() from FPC-Unit 'Contnrs', - but with the difference, that all hash codes for all cities here are UNIQUE. - I: len: MUST BE > 0!} +function FPHashX_ASM(var buf; len: dword): dword; assembler; nostackframe; inline; + {returns a hash code similar to function FPHash() from FPC-Unit 'Contnrs', + but with the difference, that all hash codes for all cities here are UNIQUE. + I: len: MUST BE > 0!} asm MOV EAX,dword(-1) @@ -127,43 +128,42 @@ function FPHashX_ASM(var buf; len: dword): dword; assembler; nostackframe; const MaxCities = 42000; {max. 
size for array WA[]} + OverlapLen = 60; {Extension to hash list array to avoid wrap-arounds} BufLenDefault_KB = 128; {Default buffer size in kb} - MaxHX = 27000; {size of hash-extension array HX[]} type - cityStr = string[55]; {string for a cityname with temperature (maxlen=55)} - tempTyp = int16; {numeric temperature, multiplied by 10} + cityStr = string[49]; {string for a cityname without temperature} + cityTmpStr = string[55]; {string for a cityname with temperature} + tempTyp = int16; {numeric temperature, multiplied by 10} - weatherRec = record {collects weather data for 1 city: } + weatherRec = packed record {collects weather data for 1 city: } wcity: cityStr; {city name} - wmin: tempTyp; {min. temperature} - wmax: tempTyp; {max. temperature} - wsum: longint; {sums all temperatures} - wcount: word; {number of all temperatures} + wmin: tempTyp; {min. temperature} + wmax: tempTyp; {max. temperature} + wsum: longint; {sums all temperatures} + wcount: word; {number of all temperatures} end; {record} pWeatherRec = ^weatherRec; - - weatherArray = array[0..MaxCities - 1] of weatherRec; {weather data for all cities} + {weather data for all cities: } + weatherArray = packed array[0..MaxCities - 1] of weatherRec; hashTyp = dword; {my hash type} dataTyp = pointer; {data type, returned by myHL_find_and_add()} - nextTyp = ^hashRec; {pointer to next hash entry in array HX[]} hashRec = packed record {an entry in the hash list: } hhash: hashTyp; {the hash code} hdata: dataTyp; {pointer to corresponding array WA[]} - hnext: nextTyp; {pointer to next hash entry in array HX[]} end; var WA: weatherArray; {weather data for all cities} - HL: packed array of hashRec; {stores all primary entries of the hash list} - HX: packed array[1..MaxHX] of hashRec; {extended hash list for doublettes} + HL: packed array of hashRec; {stores all entries of the hash list} {--------------------------------- quicksort: ---------------------------------} function strcmp(const s1, s2: cityStr): 
integer; -{$IFDEF XINLINE} inline; {$ENDIF} +{$IFDEF XINLINE} inline; +{$ENDIF} {compares 2 strings alphabetically (case-sensitiv). O: -1: s1 < s2 / 0: s1 = s2 / +1: s1 > s2} var @@ -187,9 +187,9 @@ function strcmp(const s1, s2: cityStr): integer; end; {strcmp} type - QS_item = weatherRec; {quicksort item type} + QS_item = weatherRec; {quicksort item type} QS_array = weatherArray; {quicksort array type} - QS_idx = longint; {quicksort index type} + QS_idx = longint; {quicksort index type} function QS_less(var x, y: QS_item): boolean; {checks the sort order of 'x' and 'y'; @@ -199,7 +199,7 @@ function QS_less(var x, y: QS_item): boolean; end; procedure quicksort(var A: QS_array; l, r: QS_idx); - {sorts A[l..r]} +{sorts A[l..r]} procedure sort(l, r: QS_idx); var @@ -234,19 +234,19 @@ procedure quicksort(var A: QS_array; l, r: QS_idx); type weatherClass = class(TObject) fspecIn: ansistring; {file to read} - buflen: longint; {buffer size} + buflen: longint; {buffer size} cntWA: word; {number of valid entries in WA[]} cntMulti: longint; {counts cities when they occur a 2nd time} wsum_min, wsum_max: longint; {only for testing} NumHash: longint; {number of hashes in hash array HL[]} AndHash: longint; {AND-mask for all hash codes} - cntHX: word; {number of used records in HX[]} constructor Create(fspec: ansistring; bufsize: longint; hashBits: integer); destructor Destroy; override; procedure myHL_init(hashBits: integer); - function myHL_find_and_add(out new: boolean): dataTyp; {$IFDEF XINLINE} inline; {$ENDIF} + function myHL_find_and_add(out new: boolean): dataTyp; + {$IFDEF XINLINE} inline; {$ENDIF} procedure myHL_addCityTemp; {$IFDEF XINLINE} inline; {$ENDIF} procedure sort_WA; @@ -258,7 +258,7 @@ weatherClass = class(TObject) end; {Class} constructor weatherClass.Create(fspec: ansistring; bufsize: longint; - hashBits: integer); + hashBits: integer); {I: - fspec: filespec to read. - bufsize: buffer size for above file. 
- hashBits: bit-width for the hash-array HL[]} @@ -286,24 +286,25 @@ procedure weatherClass.myHL_init(hashBits: integer); {initializes my hash list. I: hashBits: bit-width for the hash-array HL[]} begin + if hashBits < 16 then error_halt('var "hashBits" is too small'); + NumHash := 1 shl hashBits; {max. number of hashes in array HL[]} - AndHash := NumHash - 1; {AND-mask for all hash codes} + AndHash := NumHash - 1; {AND-mask for all hash codes} + Inc(NumHash, OverlapLen); {add extension to array HL[] to avoid wrap-arounds} SetLength(HL, NumHash); - fillchar(HL[0], sizeof(HL[0]) * NumHash, 0); {fills fields 'hdata' and 'hnext'} - - cntHX := 0; {array HX[] is empty} + fillchar(HL[0], sizeof(HL[0]) * NumHash, 0); {fills field 'hdata'} debug('NumHash=' + IntToStr3(NumHash)); debug('sizeof(HL) = ' + IntToStr3(sizeof(HL[0]) * NumHash)); - debug('sizeof(HX) = ' + IntToStr3(sizeof(HX))); end; {myHL_init} var - city: cityStr; {global variables for use in myHL_find_and_add() and} - tmp: tempTyp; {in process_measurements() are faster than parameters} + city: cityTmpStr; {global variables for use in myHL_find_and_add() and} + tmp: tempTyp; {in process_measurements() are faster than parameters} function weatherClass.myHL_find_and_add(out new: boolean): dataTyp; -{$IFDEF XINLINE} inline; {$ENDIF} +{$IFDEF XINLINE} inline; +{$ENDIF} {searches for city 'city' in the hash list HL[] and returns access to it's data in array WA[]. If the city is not found, it is added to the hash list. O: new: was a new entry in array WA[] added? 
@@ -311,13 +312,14 @@ function weatherClass.myHL_find_and_add(out new: boolean): dataTyp; ATTENTION: this hash list requires, that all hash codes from function FPHashX_ASM() are UNIQUE for all city names!} var - p, p2: ^hashRec; + p: ^hashRec; h: hashTyp; + i: longint; begin h := FPHashX_ASM(city[1], length(city)); {compute unique hash code for 'city'} p := @HL[h and AndHash]; {get Index in HL[0..AndHash]} - if not Assigned(p^.hdata) then {if this entry in HL[] is still free: } + if not Assigned(p^.hdata) then {if this entry in HL[] is still free: } begin new := True; p^.hhash := h; @@ -325,38 +327,40 @@ function weatherClass.myHL_find_and_add(out new: boolean): dataTyp; Inc(cntWA); exit(p^.hdata); {return the new entry in WA[]} end; + {if this entry in HL[] is valid: } + if p^.hhash = h then {if hash matches: } + begin + new := False; + exit(p^.hdata); + end; + + {if hash not matches: } + for i := 1 to OverlapLen do + begin + Inc(p); - repeat {if this entry in HL[] is valid: } + if not Assigned(p^.hdata) then {if this entry in HL[] is still free: } + begin + new := True; + p^.hhash := h; + p^.hdata := @WA[cntWA]; {allocate a new entry in WA[]: } + Inc(cntWA); + exit(p^.hdata); {return the new entry in WA[]} + end; + {if this entry in HL[] is valid: } if p^.hhash = h then {if hash matches: } begin new := False; exit(p^.hdata); end; - {if hash not matches: } - if not Assigned(p^.hnext) then {if no matching entry in HX[] exists: } - begin - new := True; - Inc(cntHX); {allocate a new entry in HX[]: } -{$IFDEF XTEST} - if cntHX > MaxHX then error_halt('Internal Error: cntHX > MaxHX'); -{$ENDIF} - p2 := @HX[cntHX]; - p^.hnext := p2; {create chain from HL[] to HX[]} - - p2^.hhash := h; {fill new entry in HX[]: } - p2^.hnext := nil; - p2^.hdata := @WA[cntWA]; {allocate a new entry in WA[]: } - Inc(cntWA); - exit(p2^.hdata); - end; + end; - p := p^.hnext; {check next entry in HX[]} - until False; + error_halt('const "OverlapLen" too small in myHL_find_and_add()'); end; 
{myHL_find_and_add} procedure weatherClass.myHL_addCityTemp; - {$IFDEF XINLINE} inline; {$ENDIF} - {stores city 'city' with temperature 'tmp' to array WA[] using my hash list} +{$IFDEF XINLINE} inline; {$ENDIF} +{stores city 'city' with temperature 'tmp' to array WA[] using my hash list} var pWR: pWeatherRec; new: boolean; @@ -389,7 +393,7 @@ procedure weatherClass.myHL_addCityTemp; end; {myHL_addCityTemp} procedure weatherClass.sort_WA; - {sorts WA[0..cntWA-1] by city names} +{sorts WA[0..cntWA-1] by city names} var start: int64; begin @@ -399,7 +403,8 @@ procedure weatherClass.sort_WA; end; function weatherClass.RoundExString(x: double): NumStr; -{$IFDEF XINLINE} inline; {$ENDIF} +{$IFDEF XINLINE} inline; +{$ENDIF} {new official rounding function} var V, Q, R: integer; @@ -420,7 +425,8 @@ function weatherClass.RoundExString(x: double): NumStr; end; {RoundExString} function weatherClass.myRound(sum, Count: longint): NumStr; -{$IFDEF XINLINE} inline; {$ENDIF} +{$IFDEF XINLINE} inline; +{$ENDIF} {using new official rounding function from 26.3.24} var x: double; @@ -452,7 +458,7 @@ procedure weatherClass.save_WA; end; {save_WA} procedure weatherClass.write_WA_STDOUT; - {writes the results in array WA[] via STDOUT to the console} +{writes the results in array WA[] via STDOUT to the console} var fo: Text; s: string; @@ -478,7 +484,7 @@ procedure weatherClass.write_WA_STDOUT; end; {write_WA_STDOUT} procedure weatherClass.process_measurements; - {reads input file 'fspecIn' and processes all measurements} +{reads input file 'fspecIn' and processes all measurements} const LF = #10; {line separator} var @@ -499,7 +505,7 @@ procedure weatherClass.process_measurements; take := buflen; {number of bytes to read} while size > 0 do - begin {if last turn: } + begin {if last turn: } if size < buflen then begin take := size; @@ -513,11 +519,14 @@ procedure weatherClass.process_measurements; if p2 > 0 then begin Inc(p2, pred(p1)); {p2:=position of 'LF'} - - len := p2 - p1 - 1; {get 
cityname and temperature, without CR: } + {get cityname and temperature, without CR: } + len := p2 - p1 +{$IFNDEF noCR} + - 1 +{$ENDIF}; {'-1' skips the CR; the ';' stays outside the conditional so that builds with noCR defined still compile} city[0] := chr(len); move(s[p1], city[1], len); - {extract temperature: } + {extract temperature: } p := 1 + IndexByte(city[1], length(city), Ord(';')); {seeks for ';'} {$IFDEF XTEST} if p = 0 then error_halt('Internal Error: p=0'); @@ -559,29 +568,29 @@ procedure weatherClass.process_measurements; debug('cntMulti=' + IntToStr3(cntMulti)); {=> 999,958,657} {$IFDEF XTEST} debug('wsum_min=' + IntToStr3(wsum_min) {=> -20,591,902} + - ' wsum_max=' + IntToStr3(wsum_max)); {=> +20,415,562} + ' wsum_max=' + IntToStr3(wsum_max)); {=> +20,415,562} {$ENDIF} sort_WA; {sorts array WA[0..cntWA-1] by city names} - // save_WA; {for Tests: saves the results in array WA[] into a textfile} +//save_WA; {for Tests: saves the results in array WA[] into a textfile} write_WA_STDOUT; {writes the results in array WA[] to STDOUT} end; {process_measurements} {------------------------- End of Class 'weatherClass' ------------------------} const - HashBitsMin = 14; {Min. bit-width for the hash-array HL[]} + HashBitsMin = 16; {Min. bit-width for the hash-array HL[]} HashBitsMax = 28; {Max. bit-width for the hash-array HL[]} procedure syntax_halt; - {shows allowed syntax and halt's the program} +{shows allowed syntax and halts the program} begin writeln('Purpose: 1 billion row Challenge program by Hartmut Grosser, version ', M_version); writeln('Usage: '); writeln('Example: ', SysUtils.ExtractFileName(ParamStr(0)), - ' measurements.txt 15'); + ' measurements.txt 16'); writeln(' - bit-width for hash-list: sets the size of the hash list, e.g. ''16'' => 65536 entries'); halt(1); end; @@ -592,7 +601,7 @@ procedure syntax_halt; hashBits: integer; {bit-width for the hash-array HL[]} procedure take_command_line_parameters; - {checks command line parameters. Errors => show message and halt} +{checks command line parameters. 
Errors => show message and halt} var p, code: integer; begin @@ -632,7 +641,7 @@ procedure take_command_line_parameters; // fspecInp:='/media/H/tmp/1brc/measurements.txt'; // buflen:=BufLenDefault_KB * 1024; - // hashBits:=15; + // hashBits:=16; start := GetTickCount64;