From f211b467ff0714562516c2e165bca7d6d7d04886 Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 29 Jan 2025 14:54:52 -0800 Subject: [PATCH 1/8] test --- scripts/clusterfuzz/extract_wasms.py | 32 ++++++++++++++++++++-------- test/lit/scripts/extract_wasms.js | 10 +++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 test/lit/scripts/extract_wasms.js diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py index 9f364d7cc34..3ec7935537f 100644 --- a/scripts/clusterfuzz/extract_wasms.py +++ b/scripts/clusterfuzz/extract_wasms.py @@ -20,7 +20,7 @@ That will find embedded wasm files in INFILE.js, of the form - var .. = new Uint8Array([..wasm_contents..]); + new Uint8Array([..wasm_contents..]); and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits OUTFILE.js which will no longer contain the embedded contents, after which the @@ -53,21 +53,35 @@ def get_wasm_filename(): def repl(text): # We found something of the form # - # var binary = new Uint8Array([..binary data as numbers..]); + # new Uint8Array([..binary data as numbers..]); # - # Parse out the numbers into a binary wasm file. + # See if the numbers are the beginnings of a wasm file, "\0asm". If so, we + # assume it is wasm. numbers = text.groups()[0] numbers = numbers.split(',') - numbers = [int(n) for n in numbers] + + # Handle both base 10 and 16. + try: + numbers = [int(n) for n in numbers] + binary = bytes(numbers) + except ValueError: + # Not wasm; return the existing text. + return text + + if binary[:4] != b'\0asm': + return text + + # It is wasm. Parse out the numbers into a binary wasm file. with open(get_wasm_filename(), 'wb') as f: - f.write(bytes(numbers)) + f.write(binary) - # Replace it with nothing. - return '' + # Replace the Uint8Array with undefined + a comment. + return 'undefined /* extracted wasm */' -# Replace the wasm files and write them out. -js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\)', repl, js) +# Replace the wasm files and write them out. We investigate any new Uint8Array +# on an array of values like [100, 200] or [0x61, 0x6D, 0x6a] etc. +js = re.sub(r'new Uint8Array\(\[([\d,x a-fA-F]+)\]\)', repl, js) # Write out the new JS. with open(f'{out}.js', 'w') as f: diff --git a/test/lit/scripts/extract_wasms.js b/test/lit/scripts/extract_wasms.js new file mode 100644 index 00000000000..358f8f4c7bf --- /dev/null +++ b/test/lit/scripts/extract_wasms.js @@ -0,0 +1,10 @@ +;; Test extracting wasm files from JS. + +const v101 = new WebAssembly.Instance(new WebAssembly.Module(new Uint8Array([ + 0x00, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00, 0x01, 0x25, + +;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %s %t +;; RUN: cat %t.js | filecheck %s +;; +;; CHECK: AAAAAAAAAAAAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + From 273ed35a12f09ab777437800f48601367d6bc211 Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 29 Jan 2025 15:04:07 -0800 Subject: [PATCH 2/8] work --- scripts/clusterfuzz/extract_wasms.py | 15 +++++++++++---- test/lit/scripts/extract_wasms.js | 3 +-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py index 3ec7935537f..201694abc36 100644 --- a/scripts/clusterfuzz/extract_wasms.py +++ b/scripts/clusterfuzz/extract_wasms.py @@ -50,25 +50,32 @@ def get_wasm_filename(): js = f.read() -def repl(text): +def repl(match): + text = match.group(0) + print('text', text) + # We found something of the form # # new Uint8Array([..binary data as numbers..]); # # See if the numbers are the beginnings of a wasm file, "\0asm". If so, we # assume it is wasm. - numbers = text.groups()[0] + numbers = match.groups()[0] numbers = numbers.split(',') + print('numbers', numbers) # Handle both base 10 and 16. try: - numbers = [int(n) for n in numbers] - binary = bytes(numbers) + parsed = [int(n, 0) for n in numbers] + print('parsed', parsed) + binary = bytes(parsed) + print('binary', binary) except ValueError: # Not wasm; return the existing text. return text if binary[:4] != b'\0asm': + print('sad') return text # It is wasm. Parse out the numbers into a binary wasm file. diff --git a/test/lit/scripts/extract_wasms.js b/test/lit/scripts/extract_wasms.js index 358f8f4c7bf..9c63686cbe3 100644 --- a/test/lit/scripts/extract_wasms.js +++ b/test/lit/scripts/extract_wasms.js @@ -1,7 +1,6 @@ ;; Test extracting wasm files from JS. -const v101 = new WebAssembly.Instance(new WebAssembly.Module(new Uint8Array([ - 0x00, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00, 0x01, 0x25, +foo(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 0x01])); ;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %s %t ;; RUN: cat %t.js | filecheck %s From e0590ba3b7b480e6bb79c6ff8dcc75f11e05797a Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 29 Jan 2025 16:22:57 -0800 Subject: [PATCH 3/8] sad --- scripts/clusterfuzz/extract_wasms.py | 2 ++ test/lit/scripts/extract_wasms.js | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py index 201694abc36..ee8d58b268d 100644 --- a/scripts/clusterfuzz/extract_wasms.py +++ b/scripts/clusterfuzz/extract_wasms.py @@ -93,3 +93,5 @@ def repl(match): # Write out the new JS. with open(f'{out}.js', 'w') as f: f.write(js) + +print('NONONONONONONO') diff --git a/test/lit/scripts/extract_wasms.js b/test/lit/scripts/extract_wasms.js index 9c63686cbe3..bd5ac36a322 100644 --- a/test/lit/scripts/extract_wasms.js +++ b/test/lit/scripts/extract_wasms.js @@ -5,5 +5,5 @@ foo(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 0x01])); ;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %s %t ;; RUN: cat %t.js | filecheck %s ;; -;; CHECK: AAAAAAAAAAAAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +;; CHECK: foo(undefined /* extracted wasm */) From 7502e7afdbda715d618f725856a8c1e0a491d557 Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 29 Jan 2025 16:27:35 -0800 Subject: [PATCH 4/8] test --- test/lit/scripts/extract_wasms.js | 9 --------- test/lit/scripts/extract_wasms.lit | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) delete mode 100644 test/lit/scripts/extract_wasms.js create mode 100644 test/lit/scripts/extract_wasms.lit diff --git a/test/lit/scripts/extract_wasms.js b/test/lit/scripts/extract_wasms.js deleted file mode 100644 index bd5ac36a322..00000000000 --- a/test/lit/scripts/extract_wasms.js +++ /dev/null @@ -1,9 +0,0 @@ -;; Test extracting wasm files from JS. - -foo(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 0x01])); - -;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %s %t -;; RUN: cat %t.js | filecheck %s -;; -;; CHECK: foo(undefined /* extracted wasm */) - diff --git a/test/lit/scripts/extract_wasms.lit b/test/lit/scripts/extract_wasms.lit new file mode 100644 index 00000000000..b4b3e7d712f --- /dev/null +++ b/test/lit/scripts/extract_wasms.lit @@ -0,0 +1,23 @@ +;; Test extracting wasm files from JS. + +;; A proper wasm start sequence (\0asm), so we will extract it. +;; RUN: echo "good1(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 0x01]));" > %t.js + +;; A difference in the second byte, so we won't. +;; RUN: echo "bad1(new Uint8Array([0x00, 0xff, 0x73, 0x6D, 0x01]));" >> %t.js + +;; The last byte is unparseable as an integer, so we won't. +;; RUN: echo "bad2(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 6Dx0]));" >> %t.js + +;; Another proper one. +;; RUN: echo "good2(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 0x01]));" >> %t.js + +;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %t.js %t.out +;; RUN: cat %t.out.js | filecheck %s +;; +;; We extracted the good but not the bad. +;; CHECK: good1(undefined /* extracted wasm */) +;; CHECK: bad1(new Uint8Array +;; CHECK: bad2(new Uint8Array +;; CHECK: good2(undefined /* extracted wasm */) + From 8accc8fc55d338cfe4abe1d018cf0519edb0595e Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 29 Jan 2025 16:30:38 -0800 Subject: [PATCH 5/8] work --- scripts/clusterfuzz/extract_wasms.py | 2 +- test/lit/scripts/extract_wasms.lit | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py index ee8d58b268d..6e886422780 100644 --- a/scripts/clusterfuzz/extract_wasms.py +++ b/scripts/clusterfuzz/extract_wasms.py @@ -64,8 +64,8 @@ def repl(match): numbers = numbers.split(',') print('numbers', numbers) - # Handle both base 10 and 16. try: + # Handle both base 10 and 16 by passing in base 0. parsed = [int(n, 0) for n in numbers] print('parsed', parsed) binary = bytes(parsed) diff --git a/test/lit/scripts/extract_wasms.lit b/test/lit/scripts/extract_wasms.lit index b4b3e7d712f..5284fec7cf6 100644 --- a/test/lit/scripts/extract_wasms.lit +++ b/test/lit/scripts/extract_wasms.lit @@ -9,8 +9,9 @@ ;; The last byte is unparseable as an integer, so we won't. ;; RUN: echo "bad2(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 6Dx0]));" >> %t.js -;; Another proper one. -;; RUN: echo "good2(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 0x01]));" >> %t.js +;; Another proper one. Note the second number is in base 10, which works too, +;; there is various odd whitespace which we ignore. +;; RUN: echo "good2(new Uint8Array([0x00,97, 0x73, 0x6D,0x01]));" >> %t.js ;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %t.js %t.out ;; RUN: cat %t.out.js | filecheck %s From 763b6ec635e9707211f61af8b815a48af0912a98 Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 29 Jan 2025 16:32:02 -0800 Subject: [PATCH 6/8] work --- scripts/clusterfuzz/extract_wasms.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py index 6e886422780..c1cc429eeb6 100644 --- a/scripts/clusterfuzz/extract_wasms.py +++ b/scripts/clusterfuzz/extract_wasms.py @@ -14,7 +14,10 @@ # limitations under the License. ''' -Wasm extractor for testcases generated by the ClusterFuzz run.py script. Usage: +Wasm extractor for testcases generated by the ClusterFuzz run.py script. This is +general enough to also handle Fuzzilli output. + +Usage: extract_wasms.py INFILE.js OUTFILE @@ -52,30 +55,26 @@ def get_wasm_filename(): def repl(match): text = match.group(0) - print('text', text) # We found something of the form # # new Uint8Array([..binary data as numbers..]); # # See if the numbers are the beginnings of a wasm file, "\0asm". If so, we - # assume it is wasm. + # assume it is wasm. (We are careful here because Fuzzilli output can + # contain normal JavaScript Typed Arrays, which we do not want to touch.) numbers = match.groups()[0] numbers = numbers.split(',') - print('numbers', numbers) try: # Handle both base 10 and 16 by passing in base 0. parsed = [int(n, 0) for n in numbers] - print('parsed', parsed) binary = bytes(parsed) - print('binary', binary) except ValueError: # Not wasm; return the existing text. return text if binary[:4] != b'\0asm': - print('sad') return text # It is wasm. Parse out the numbers into a binary wasm file. @@ -93,5 +92,3 @@ def repl(match): # Write out the new JS. with open(f'{out}.js', 'w') as f: f.write(js) - -print('NONONONONONONO') From f225de3294cc03a23c7240a28a321d2f162be1fc Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Wed, 29 Jan 2025 16:40:49 -0800 Subject: [PATCH 7/8] test --- test/lit/scripts/extract_wasms.lit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lit/scripts/extract_wasms.lit b/test/lit/scripts/extract_wasms.lit index 5284fec7cf6..a1e48263684 100644 --- a/test/lit/scripts/extract_wasms.lit +++ b/test/lit/scripts/extract_wasms.lit @@ -10,7 +10,7 @@ ;; RUN: echo "bad2(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 6Dx0]));" >> %t.js ;; Another proper one. Note the second number is in base 10, which works too, -;; there is various odd whitespace which we ignore. +;; & there is various odd whitespace which we also ignore. ;; RUN: echo "good2(new Uint8Array([0x00,97, 0x73, 0x6D,0x01]));" >> %t.js ;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %t.js %t.out From 873bae17598dd7e4d2187078dfcec17a970fd12d Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Thu, 30 Jan 2025 11:19:16 -0800 Subject: [PATCH 8/8] mtest --- test/lit/scripts/extract_wasms.lit | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/lit/scripts/extract_wasms.lit b/test/lit/scripts/extract_wasms.lit index a1e48263684..fee45071bbd 100644 --- a/test/lit/scripts/extract_wasms.lit +++ b/test/lit/scripts/extract_wasms.lit @@ -9,6 +9,9 @@ ;; The last byte is unparseable as an integer, so we won't. ;; RUN: echo "bad2(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 6Dx0]));" >> %t.js +;; This is not a Uint8Array, so we do nothing. +;; RUN: echo "bad3(new Uint16Array([0x00, 0x61, 0x73, 0x6D, 0x01]));" >> %t.js + ;; Another proper one. Note the second number is in base 10, which works too, ;; & there is various odd whitespace which we also ignore. ;; RUN: echo "good2(new Uint8Array([0x00,97, 0x73, 0x6D,0x01]));" >> %t.js @@ -20,5 +23,6 @@ ;; CHECK: good1(undefined /* extracted wasm */) ;; CHECK: bad1(new Uint8Array ;; CHECK: bad2(new Uint8Array +;; CHECK: bad3(new Uint16Array ;; CHECK: good2(undefined /* extracted wasm */)