diff --git a/scripts/clusterfuzz/extract_wasms.py b/scripts/clusterfuzz/extract_wasms.py
index 9f364d7cc34..c1cc429eeb6 100644
--- a/scripts/clusterfuzz/extract_wasms.py
+++ b/scripts/clusterfuzz/extract_wasms.py
@@ -14,13 +14,16 @@
 # limitations under the License.
 
 '''
-Wasm extractor for testcases generated by the ClusterFuzz run.py script. Usage:
+Wasm extractor for testcases generated by the ClusterFuzz run.py script. This is
+general enough to also handle Fuzzilli output.
+
+Usage:
 
 extract_wasms.py INFILE.js OUTFILE
 
 That will find embedded wasm files in INFILE.js, of the form
 
-  var .. = new Uint8Array([..wasm_contents..]);
+  new Uint8Array([..wasm_contents..]);
 
 and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits
 OUTFILE.js which will no longer contain the embedded contents, after which the
@@ -50,24 +53,41 @@ def get_wasm_filename():
     js = f.read()
 
 
-def repl(text):
+def repl(match):
+    text = match.group(0)
+
     # We found something of the form
     #
-    #   var binary = new Uint8Array([..binary data as numbers..]);
+    #   new Uint8Array([..binary data as numbers..]);
     #
-    # Parse out the numbers into a binary wasm file.
-    numbers = text.groups()[0]
+    # See if the numbers begin with the wasm magic bytes, "\0asm". If so, we
+    # assume it is wasm. (We are careful here because Fuzzilli output can
+    # contain normal JavaScript Typed Arrays, which we do not want to touch.)
+    numbers = match.groups()[0]
     numbers = numbers.split(',')
-    numbers = [int(n) for n in numbers]
+
+    try:
+        # Base 0 infers the base from any prefix, so both 97 and 0x61 parse.
+        parsed = [int(n, 0) for n in numbers]
+        binary = bytes(parsed)
+    except ValueError:
+        # Unparseable, or a value outside 0..255: not wasm. Keep the text.
+        return text
+
+    if binary[:4] != b'\0asm':
+        return text
+
+    # It is wasm. Write the parsed bytes out as a binary wasm file.
     with open(get_wasm_filename(), 'wb') as f:
-        f.write(bytes(numbers))
+        f.write(binary)
 
-    # Replace it with nothing.
-    return ''
+    # Replace the Uint8Array expression with undefined + a comment.
+    return 'undefined /* extracted wasm */'
 
 
-# Replace the wasm files and write them out.
-js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\)', repl, js)
+# Replace the wasm files and write them out. We investigate any new Uint8Array
+# of a literal array of values like [100, 200] or [0x61, 0x6D, 0x6a] etc.
+js = re.sub(r'new Uint8Array\(\[([\d,x a-fA-F]+)\]\)', repl, js)
 
 # Write out the new JS.
 with open(f'{out}.js', 'w') as f:
diff --git a/test/lit/scripts/extract_wasms.lit b/test/lit/scripts/extract_wasms.lit
new file mode 100644
index 00000000000..fee45071bbd
--- /dev/null
+++ b/test/lit/scripts/extract_wasms.lit
@@ -0,0 +1,28 @@
+;; Test extracting wasm files from JS.
+
+;; A proper wasm start sequence (\0asm), so we will extract it.
+;; RUN: echo "good1(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 0x01]));" > %t.js
+
+;; A difference in the second byte, so we won't.
+;; RUN: echo "bad1(new Uint8Array([0x00, 0xff, 0x73, 0x6D, 0x01]));" >> %t.js
+
+;; The last byte is unparseable as an integer, so we won't.
+;; RUN: echo "bad2(new Uint8Array([0x00, 0x61, 0x73, 0x6D, 6Dx0]));" >> %t.js
+
+;; This is not a Uint8Array, so we do nothing.
+;; RUN: echo "bad3(new Uint16Array([0x00, 0x61, 0x73, 0x6D, 0x01]));" >> %t.js
+
+;; Another proper one. Note the second number is in base 10, which works too,
+;; and there is some odd whitespace, which we also ignore.
+;; RUN: echo "good2(new Uint8Array([0x00,97, 0x73, 0x6D,0x01]));" >> %t.js
+
+;; RUN: python %S/../../../scripts/clusterfuzz/extract_wasms.py %t.js %t.out
+;; RUN: cat %t.out.js | filecheck %s
+;;
+;; We extracted the good but not the bad.
+;; CHECK: good1(undefined /* extracted wasm */)
+;; CHECK: bad1(new Uint8Array
+;; CHECK: bad2(new Uint8Array
+;; CHECK: bad3(new Uint16Array
+;; CHECK: good2(undefined /* extracted wasm */)
+