
Commit

Merge remote-tracking branch 'origin/main' into no_api_refresh_for_unpromotables
Tim-Brooks committed Jun 13, 2023
2 parents 7d6ebe0 + cba3e27 commit 1c713c7
Showing 190 changed files with 10,972 additions and 1,238 deletions.
org/elasticsearch/benchmark/tdigest/SortBench.java (new file)
@@ -0,0 +1,79 @@
/*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* This project is based on a modification of https://github.com/tdunning/t-digest which is licensed under the Apache 2.0 License.
*/

package org.elasticsearch.benchmark.tdigest;

import org.elasticsearch.tdigest.Sort;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Threads;
import org.openjdk.jmh.annotations.Warmup;

import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;

/** Explores the performance of Sort on pathological input data. */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@Warmup(iterations = 10, time = 3, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 20, time = 2, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@Threads(1)
@State(Scope.Thread)
public class SortBench {
    private final int size = 100000;
    private final double[] values = new double[size];

    @Param({ "0", "1", "-1" })
    public int sortDirection;

    @Setup
    public void setup() {
        Random prng = new Random(999983);
        for (int i = 0; i < size; i++) {
            values[i] = prng.nextDouble();
        }
        if (sortDirection > 0) {
            Arrays.sort(values);
        } else if (sortDirection < 0) {
            Arrays.sort(values);
            Sort.reverse(values, 0, values.length);
        }
    }

    @Benchmark
    public void quicksort() {
        int[] order = new int[size];
        for (int i = 0; i < size; i++) {
            order[i] = i;
        }
        Sort.sort(order, values, null, values.length);
    }
}
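The Sort.sort(order, values, null, values.length) call above performs an indirect sort: it orders an index array by the values those indices point to, leaving the values themselves in place. A minimal JDK-only sketch of the same pattern (illustrative only; the tdigest Sort class works on primitive arrays with its own quicksort):

import java.util.Arrays;
import java.util.Comparator;

public class IndirectSortExample {
    public static void main(String[] args) {
        double[] values = { 3.0, 1.0, 2.0, 1.5 };
        // Sort the indices by the values they reference; values is untouched.
        Integer[] order = { 0, 1, 2, 3 };
        Arrays.sort(order, Comparator.comparingDouble(i -> values[i]));
        System.out.println(Arrays.toString(order)); // prints [1, 3, 2, 0]
    }
}

Sorting indices rather than values is useful when several parallel arrays must be permuted consistently, which is presumably why the tdigest Sort signature accepts an extra array (passed as null in this benchmark).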
org/elasticsearch/benchmark/tdigest/TDigestBench.java (new file)
@@ -0,0 +1,131 @@
/*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* This project is based on a modification of https://github.com/tdunning/t-digest which is licensed under the Apache 2.0 License.
*/

package org.elasticsearch.benchmark.tdigest;

import org.elasticsearch.tdigest.AVLTreeDigest;
import org.elasticsearch.tdigest.MergingDigest;
import org.elasticsearch.tdigest.TDigest;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Threads;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.profile.GCProfiler;
import org.openjdk.jmh.profile.StackProfiler;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@Threads(1)
@State(Scope.Thread)
public class TDigestBench {

    public enum TDigestFactory {
        MERGE {
            @Override
            TDigest create(double compression) {
                return new MergingDigest(compression, (int) (10 * compression));
            }
        },
        AVL_TREE {
            @Override
            TDigest create(double compression) {
                return new AVLTreeDigest(compression);
            }
        };

        abstract TDigest create(double compression);
    }

    @Param({ "100", "300" })
    double compression;

    @Param({ "MERGE", "AVL_TREE" })
    TDigestFactory tdigestFactory;

    @Param({ "NORMAL", "GAUSSIAN" })
    String distribution;

    Random random;
    TDigest tdigest;

    double[] data = new double[1000000];

    @Setup
    public void setUp() {
        random = ThreadLocalRandom.current();
        tdigest = tdigestFactory.create(compression);

        Supplier<Double> nextRandom = () -> distribution.equals("GAUSSIAN") ? random.nextGaussian() : random.nextDouble();
        for (int i = 0; i < 10000; ++i) {
            tdigest.add(nextRandom.get());
        }

        for (int i = 0; i < data.length; ++i) {
            data[i] = nextRandom.get();
        }
    }

    @State(Scope.Thread)
    public static class ThreadState {
        int index = 0;
    }

    @Benchmark
    @BenchmarkMode(Mode.AverageTime)
    @OutputTimeUnit(TimeUnit.MICROSECONDS)
    public void add(ThreadState state) {
        if (state.index >= data.length) {
            state.index = 0;
        }
        tdigest.add(data[state.index++]);
    }

    public static void main(String[] args) throws RunnerException {
        Options opt = new OptionsBuilder().include(".*" + TDigestBench.class.getSimpleName() + ".*")
            .warmupIterations(5)
            .measurementIterations(5)
            .addProfiler(GCProfiler.class)
            .addProfiler(StackProfiler.class)
            .build();

        new Runner(opt).run();
    }
}
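The benchmark only measures add(), but the point of a t-digest is answering quantile and rank queries once populated. A hedged usage sketch, not part of this commit, assuming this fork keeps the upstream t-digest accessors quantile(q) and cdf(x):

import org.elasticsearch.tdigest.MergingDigest;
import org.elasticsearch.tdigest.TDigest;

import java.util.Random;

public class TDigestQueryExample {
    public static void main(String[] args) {
        // Same constructor shape as the benchmark: compression 100, buffer 10x.
        TDigest digest = new MergingDigest(100, 1000);
        Random rng = new Random(42);
        for (int i = 0; i < 100_000; i++) {
            digest.add(rng.nextGaussian());
        }
        // quantile(q) and cdf(x) are the upstream t-digest API, assumed unchanged here.
        System.out.println("median ~ " + digest.quantile(0.5)); // near 0.0
        System.out.println("P(x <= 1.0) ~ " + digest.cdf(1.0)); // near 0.84
    }
}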
@@ -61,6 +61,7 @@ public class InternalDistributionModuleCheckTaskProvider {
"org.elasticsearch.preallocate",
"org.elasticsearch.securesm",
"org.elasticsearch.server",
"org.elasticsearch.tdigest",
"org.elasticsearch.xcontent"
);

@@ -75,7 +76,7 @@ public class InternalDistributionModuleCheckTaskProvider {

private static final Function<ModuleReference, String> toName = mref -> mref.descriptor().name();

-private InternalDistributionModuleCheckTaskProvider() {};
+private InternalDistributionModuleCheckTaskProvider() {}

/** Registers the checkModules task, which contains all checks relevant to ES Java Modules. */
static TaskProvider<Task> registerCheckModulesTask(Project project, TaskProvider<Copy> checkExtraction) {
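For context on the fragment above: registerCheckModulesTask wires a verification task through Gradle's lazy TaskProvider API. A hypothetical minimal sketch of that registration pattern (names and wiring are illustrative; the real task also consumes the checkExtraction Copy output and asserts the expected module names):

import org.gradle.api.Project;
import org.gradle.api.Task;
import org.gradle.api.tasks.TaskProvider;

final class ModuleCheckSketch {
    private ModuleCheckSketch() {} // non-instantiable utility class, as above

    static TaskProvider<Task> registerCheckModulesTask(Project project) {
        // register(..) defers task creation until the task is actually needed.
        return project.getTasks().register("checkModules", task -> {
            task.setGroup("verification");
            task.doLast(t -> {
                // module assertions would run here, e.g. checking that
                // "org.elasticsearch.tdigest" is among the expected modules
            });
        });
    }
}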
5 changes: 5 additions & 0 deletions docs/changelog/94089.yaml
@@ -0,0 +1,5 @@
pr: 94089
summary: Add support for `xlm_roberta` tokenized models
area: Machine Learning
type: feature
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/96716.yaml
@@ -0,0 +1,5 @@
pr: 96716
summary: Feature/speed up binary vector decoding
area: Search
type: enhancement
issues: []
7 changes: 7 additions & 0 deletions docs/changelog/96777.yaml
@@ -0,0 +1,7 @@
pr: 96777
summary: Fixing `GeoIpDownloaderStatsAction$NodeResponse` serialization by defensively copying inputs
area: Ingest Node
type: bug
issues:
- 96438
5 changes: 5 additions & 0 deletions docs/changelog/96790.yaml
@@ -0,0 +1,5 @@
pr: 96790
summary: "[Profiling] Require POST to retrieve stacktraces"
area: Application
type: enhancement
issues: []
@@ -53,16 +53,16 @@ The response will look like this:
"aggregations": {
"load_time_ranks": {
"values": {
"500.0": 90.01,
"600.0": 100.0
"500.0": 55.0,
"600.0": 64.0
}
}
}
}
--------------------------------------------------
// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
// TESTRESPONSE[s/"500.0": 90.01/"500.0": 55.00000000000001/]
// TESTRESPONSE[s/"600.0": 100.0/"600.0": 64.0/]
// TESTRESPONSE[s/"500.0": 55.0/"500.0": 55.00000000000001/]
// TESTRESPONSE[s/"600.0": 64.0/"600.0": 64.0/]

From this information you can determine you are hitting the 99% load time target but not quite
hitting the 95% load time target.
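To make the updated numbers concrete: a value of 55.0 under the 500.0 key means 55% of page loads completed within 500ms. A JDK-only sketch of the exact computation (illustrative; the aggregation itself uses an approximate digest, hence the 55.00000000000001 in the test response above):

import java.util.Arrays;

public class PercentileRankExample {
    // Percentage of observations less than or equal to the threshold.
    static double percentileRank(double[] loadTimesMillis, double threshold) {
        double[] sorted = loadTimesMillis.clone();
        Arrays.sort(sorted);
        int count = 0;
        while (count < sorted.length && sorted[count] <= threshold) {
            count++;
        }
        return 100.0 * count / sorted.length;
    }

    public static void main(String[] args) {
        double[] loads = { 120, 430, 480, 510, 550, 610, 640, 700, 950, 990 };
        System.out.println(percentileRank(loads, 500)); // 30.0
        System.out.println(percentileRank(loads, 600)); // 50.0
    }
}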
@@ -101,20 +101,20 @@
"values": [
{
"key": 500.0,
"value": 90.01
"value": 55.0
},
{
"key": 600.0,
"value": 100.0
"value": 64.0
}
]
}
}
}
--------------------------------------------------
// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
// TESTRESPONSE[s/"value": 90.01/"value": 55.00000000000001/]
// TESTRESPONSE[s/"value": 100.0/"value": 64.0/]
// TESTRESPONSE[s/"value": 55.0/"value": 55.00000000000001/]
// TESTRESPONSE[s/"value": 64.0/"value": 64.0/]


==== Script
5 changes: 5 additions & 0 deletions docs/reference/ml/ml-shared.asciidoc
@@ -944,6 +944,7 @@ values are
* `bert`: Use for BERT-style models
* `mpnet`: Use for MPNet-style models
* `roberta`: Use for RoBERTa-style and BART-style models
+* `xlm_roberta`: Use for XLMRoBERTa-style models
--
end::inference-config-nlp-tokenization[]

@@ -1026,6 +1027,10 @@ Tokenize with special tokens. The tokens typically included in MPNet-style token
--
end::inference-config-nlp-tokenization-mpnet-with-special-tokens[]

+tag::inference-config-nlp-tokenization-xlm-roberta[]
+experimental:[] XLMRoBERTa-style tokenization is to be performed with the enclosed settings.
+end::inference-config-nlp-tokenization-xlm-roberta[]

tag::inference-config-nlp-vocabulary[]
The configuration for retrieving the vocabulary of the model. The vocabulary is
then used at inference time. This information is usually provided automatically
Expand Down