Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
1e2baa2
start
kripken Dec 18, 2024
07606f6
work
kripken Dec 18, 2024
7d3980c
fix
kripken Dec 18, 2024
6a85f2e
fix
kripken Dec 18, 2024
89bdcac
fix
kripken Dec 18, 2024
a69d7f8
work
kripken Dec 18, 2024
374f5b1
work
kripken Dec 18, 2024
88e12a4
work
kripken Dec 18, 2024
170f7dd
work
kripken Dec 18, 2024
5669479
work
kripken Dec 18, 2024
0611b9c
work
kripken Dec 18, 2024
bd0f0e9
work
kripken Dec 18, 2024
3d9af45
work
kripken Dec 18, 2024
28cb857
fix?
kripken Dec 18, 2024
cd9f117
work
kripken Dec 18, 2024
71d8a73
work
kripken Dec 18, 2024
7b6171c
work
kripken Dec 18, 2024
c21354c
work
kripken Dec 18, 2024
c65cc27
undo
kripken Dec 18, 2024
32c30ab
test
kripken Dec 19, 2024
e8e1e24
test
kripken Dec 19, 2024
62f71c9
work
kripken Dec 19, 2024
029fae6
work
kripken Dec 19, 2024
3d03fa8
oops
kripken Dec 19, 2024
c8cc9b6
test
kripken Dec 19, 2024
da419e8
basename
kripken Dec 19, 2024
f85cb36
work
kripken Dec 19, 2024
1dc68d8
Merge remote-tracking branch 'origin/main' into initial.cluster
kripken Dec 19, 2024
07ca22c
show
kripken Dec 19, 2024
b9e0002
Merge branch 'nfc.fuzz' into initial.cluster
kripken Dec 19, 2024
168cb5f
Merge branch 'instantiate.error' into initial.cluster
kripken Dec 19, 2024
64be86c
better
kripken Dec 19, 2024
9fbfba1
Merge branch 'instantiate.error' into initial.cluster
kripken Dec 19, 2024
896f3dc
lint
kripken Dec 19, 2024
5cdf404
Merge remote-tracking branch 'origin/main' into initial.cluster
kripken Dec 20, 2024
bded873
better
kripken Dec 20, 2024
f91d80c
better
kripken Dec 20, 2024
13572f9
Merge remote-tracking branch 'origin/main' into initial.cluster
kripken Dec 20, 2024
0fe503b
fix
kripken Dec 20, 2024
e5cbaf5
Merge branch 'moar.move' into initial.cluster
kripken Dec 20, 2024
77cf4fc
Merge remote-tracking branch 'origin/main' into moar.move
kripken Dec 20, 2024
dfd9a96
Merge branch 'moar.move' into initial.cluster
kripken Dec 20, 2024
a9c6f69
remove simd, which works now, and add link for names
kripken Dec 20, 2024
9b5ab79
Merge branch 'moar.move' into initial.cluster
kripken Dec 20, 2024
23c84ec
Merge remote-tracking branch 'origin/main' into initial.cluster
kripken Dec 20, 2024
01403c6
Merge remote-tracking branch 'origin/main' into initial.cluster
kripken Jan 6, 2025
714572e
Merge remote-tracking branch 'origin/main' into initial.cluster
kripken Jan 7, 2025
c41d01d
Update test/unit/test_cluster_fuzz.py
kripken Jan 7, 2025
22628d7
lint
kripken Jan 7, 2025
fdfa966
skip if no v8
kripken Jan 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions scripts/bundle_clusterfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
'''

import os
import subprocess
import sys
import tarfile

Expand All @@ -87,7 +88,9 @@
# Delete the argument, as importing |shared| scans it.
sys.argv.pop()

from test import fuzzing # noqa
from test import shared # noqa
from test import support # noqa
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What lints are we skipping here?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The one that imports must be on top (which we can't do because of lines 83-89, which I can't think of a nice alternative to).


# Pick where to get the builds
if build_dir:
Expand All @@ -97,6 +100,14 @@
binaryen_bin = shared.options.binaryen_bin
binaryen_lib = shared.options.binaryen_lib

# ClusterFuzz's run.py uses these features. Keep this in sync with that, so that
# we only bundle initial content that makes sense for it.
features = [
'-all',
'--disable-shared-everything',
'--disable-fp16',
]
Comment on lines +105 to +109
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be feasible to deduplicate these into a shared location?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not trivially, since run.py is not part of the normal "tree". We never run it as part of our own code, only after being packaged for ClusterFuzz. So we don't have the normal shared locations. I suppose we could have a file that we import locally and copy over for ClusterFuzz, but given this is just a few lines, that seems a bit excessive to me.


with tarfile.open(output_file, "w:gz") as tar:
# run.py
run = os.path.join(shared.options.binaryen_root, 'scripts', 'clusterfuzz', 'run.py')
Expand Down Expand Up @@ -128,6 +139,40 @@
print(f' ......... : {path}')
tar.add(path, arcname=f'lib/{name}')

# Add tests we will use as initial content under initial/. We put all the
# tests from the test suite there.
print(' .. initial content: ')
temp_wasm = 'temp.wasm'
index = 0
all_tests = shared.get_all_tests()
for i, test in enumerate(all_tests):
if not fuzzing.is_fuzzable(test):
continue
for wast, asserts in support.split_wast(test):
if not wast:
continue
support.write_wast(temp_wasm, wast)
# If the file is not valid for our features, skip it. In the same
# operation, also convert to binary if this was text (binary is more
# compact).
cmd = shared.WASM_OPT + ['-q', temp_wasm, '-o', temp_wasm] + features
if subprocess.run(cmd, stderr=subprocess.PIPE).returncode:
continue

# Looks good.
tar.add(temp_wasm, arcname=f'initial/{index}.wasm')
index += 1
print(f'\r {100 * i / len(all_tests):.2f}%', end='', flush=True)
print(f' (num: {index})')

# Write initial/num.txt which contains the number of testcases in that
# directory (saves run.py from needing to listdir each time).
Comment on lines +168 to +169
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this make a measurable performance difference?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't measure this on ClusterFuzz itself, which is where it really matters. But imagine that ClusterFuzz might start calling run.py more times with fewer testcases each (rather than once for 1,000 testcases, it could run 1,000 times with a single testcase each); then this constant overhead could matter.

num_txt = 'num.txt'
with open(num_txt, 'w') as f:
f.write(f'{index}')
tar.add(num_txt, arcname='initial/num.txt')


print('Done.')
print('To run the tests on this bundle, do:')
print()
Expand Down
2 changes: 1 addition & 1 deletion scripts/clusterfuzz/extract_wasms.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def repl(text):


# Replace the wasm files and write them out.
js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)
js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\)', repl, js)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why no more semicolon?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because now we can have one of the new comments between this and the semicolon. Detecting that would make this complicated for no useful reason (and adding the comment after the semicolon would be a little annoying in run.py).

(Sorry if this wasn't clear enough in the description.)


# Write out the new JS.
with open(f'{out}.js', 'w') as f:
Expand Down
37 changes: 34 additions & 3 deletions scripts/clusterfuzz/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@
# testcase.
JS_SHELL_PATH = os.path.join(ROOT_DIR, 'scripts', 'fuzz_shell.js')

# The path to the directory with initial contents.
INITIAL_CONTENT_PATH = os.path.join(ROOT_DIR, 'initial')

# The file that contains the number of initial contents
INITIAL_CONTENT_NUM_PATH = os.path.join(ROOT_DIR, 'initial', 'num.txt')

# The arguments we provide to wasm-opt to generate wasm files.
FUZZER_ARGS = [
# Generate a wasm from random data.
Expand All @@ -76,7 +82,8 @@
'--fuzz-passes',
# Enable all features but disable ones not yet ready for fuzzing. This may
# be a smaller set than fuzz_opt.py, as that enables a few experimental
# flags, while here we just fuzz with d8's --wasm-staging.
# flags, while here we just fuzz with d8's --wasm-staging. This should be
# synchronized with bundle_clusterfuzz.
'-all',
'--disable-shared-everything',
'--disable-fp16',
Expand All @@ -92,6 +99,17 @@ def get_file_name(prefix, index):
# (We also use urandom below, which uses this under the hood.)
system_random = random.SystemRandom()

# The number of initial content testcases that were bundled for us, in the
# "initial/" subdir.
with open(INITIAL_CONTENT_NUM_PATH) as f:
num_initial_contents = int(f.read())


def get_random_initial_content():
    """Return the path to a randomly-chosen bundled initial-content wasm file."""
    # The bundle ships testcases named 0.wasm .. (num_initial_contents-1).wasm
    # under INITIAL_CONTENT_PATH; pick one uniformly at random.
    chosen = system_random.randrange(num_initial_contents)
    return os.path.join(INITIAL_CONTENT_PATH, '%d.wasm' % chosen)


# In production ClusterFuzz we retry whenever we see a wasm-opt error. We are
# not looking for wasm-opt issues there, and just use it to generate testcases
# for VMs. For local testing, however, we may want to disable retrying, which
Expand All @@ -117,9 +135,19 @@ def get_wasm_contents(i, output_dir):
with open(input_data_file_path, 'wb') as file:
file.write(os.urandom(random_size))

# Generate wasm from the random data.
# Generate a command to use wasm-opt with the proper args to generate
# wasm content from the input data.
cmd = [FUZZER_BINARY_PATH] + FUZZER_ARGS
cmd += ['-o', wasm_file_path, input_data_file_path]

# Sometimes use a file from the initial content testcases.
if system_random.random() < 0.5:
initial_content = get_random_initial_content()
cmd += ['--initial-fuzz=' + initial_content]
else:
initial_content = None

# Generate wasm from the random data.
try:
subprocess.check_call(cmd)
except subprocess.CalledProcessError:
Expand Down Expand Up @@ -148,7 +176,10 @@ def get_wasm_contents(i, output_dir):

# Convert to a string, and wrap into a typed array.
wasm_contents = ','.join([str(c) for c in wasm_contents])
return f'new Uint8Array([{wasm_contents}])'
js = f'new Uint8Array([{wasm_contents}])'
if initial_content:
js = f'{js} /* using initial content {os.path.basename(initial_content)} */'
return js


# Returns the contents of a .js fuzz file, given the index of the testcase and
Expand Down
60 changes: 60 additions & 0 deletions test/unit/test_cluster_fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,15 @@ def test_file_contents(self):
seen_calls = []
seen_second_builds = []
seen_JSPIs = []
seen_initial_contents = []

# Initial contents are noted in comments like this:
#
# /* using initial content 42.wasm */
#
# Note that we may see more than one in a file, as we may have more than
# one wasm in each testcase: each wasm has a chance.
initial_content_regex = re.compile(r'[/][*] using initial content ([^ ]+) [*][/]')

for i in range(1, N + 1):
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
Expand All @@ -302,6 +311,8 @@ def test_file_contents(self):
assert '/* async */' in js
assert '/* await */' in js

seen_initial_contents.append(re.findall(initial_content_regex, js))

# There is always one build and one call (those are in the default
# fuzz_shell.js), and we add a couple of operations, each with equal
# probability to be a build or a call, so over the 100 testcases here we
Expand Down Expand Up @@ -346,6 +357,55 @@ def test_file_contents(self):

print()

# Flatten the data to help some of the below, from
# [['a.wasm', 'b.wasm'], ['c.wasm']]
# into
# ['a.wasm', 'b.wasm', 'c.wasm']
flat_initial_contents = [item for items in seen_initial_contents for item in items]

# Initial content appear 50% of the time for each wasm file. Each
# testcase has 1.333 wasm files on average.
print('Initial contents are distributed as ~ mean 0.68')
print(f'mean initial contents: {len(flat_initial_contents) / N}')
# Initial contents should be mostly unique (we have many, many testcases
# and we pick just 100 or so). And we must see more than one unique one.
unique_initial_contents = set(flat_initial_contents)
print(f'unique initial contents: {len(unique_initial_contents)} should be almost equal to {len(flat_initial_contents)}')
self.assertGreater(len(unique_initial_contents), 1)
# Not all testcases have initial contents.
num_initial_contents = [len(items) for items in seen_initial_contents]
self.assertEqual(min(num_initial_contents), 0)
# Some do (this is redundant given that the set of unique initial
# contents was asserted on before, so this just confirms/checks that).
self.assertGreaterEqual(max(num_initial_contents), 1)

print()

# Execute the files in V8. Almost all should execute properly (some
# small number may trap during startup, say on a segment out of bounds).
if shared.V8:
valid_executions = 0
for i in range(1, N + 1):
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')

cmd = [shared.V8, '--wasm-staging', fuzz_file]
proc = subprocess.run(cmd, stdout=subprocess.PIPE)

# An execution is valid if we exited without error, and if we
# managed to run some code before exiting (modules with no
# exports will be considered "invalid" here, but that is very
# rare, and in a sense such modules are not useful anyway).
if proc.returncode == 0 and b'[fuzz-exec] calling ' in proc.stdout:
valid_executions += 1

print('Valid executions are distributed as ~ mean 0.99')
print(f'mean valid executions: {valid_executions / N}')
# Assert on having at least half execute properly. Given the true mean
# is 0.9, for half of 100 to fail is incredibly unlikely.
self.assertGreater(valid_executions, N / 2)

print()

# "zzz" in test name so that this runs last. If it runs first, it can be
# confusing as it appears next to the logging of which bundle we use (see
# setUpClass).
Expand Down