spark-bench = {
  spark-submit-config = [{
    spark-args = {
      master = "yarn" // FILL IN YOUR MASTER HERE
      executor-memory = "2G" // FILL IN YOUR EXECUTOR MEMORY
      deploy-mode = "cluster"
      principal = "XXXXXXX@XXX.COM"
      keytab = "/etc/hdp/keytabs/spark.headless.keytab"
    }
    conf = {
      // Any configuration you need for your setup goes here, like:
      // "spark.dynamicAllocation.enabled" = "false"
    }
    suites-parallel = false
    workload-suites = [
      {
        descr = "Generate a dataset, then take that same dataset and write it out to Parquet format"
        benchmark-output = "hdfs:///tmp/csv-vs-parquet-hdfs/results-data-gen.csv"
        // We need to generate the dataset first through the data generator,
        // then we take that dataset and convert it to Parquet.
        parallel = false
        workloads = [
          {
            name = "data-generation-kmeans"
            rows = 10000000
            cols = 24
            output = "hdfs:///tmp/csv-vs-parquet-hdfs/kmeans-data.csv"
          },
          {
            name = "sql"
            query = "select * from input"
            input = "hdfs:///tmp/csv-vs-parquet-hdfs/kmeans-data.csv"
            output = "hdfs:///tmp/csv-vs-parquet-hdfs/kmeans-data.parquet"
          }
        ]
      },
      {
        descr = "Run SQL queries over the dataset"
        benchmark-output = "hdfs:///tmp/csv-vs-parquet-hdfs/results-sql.csv"
        parallel = false
        repeat = 1
        workloads = [
          {
            name = "sql"
            input = ["hdfs:///tmp/csv-vs-parquet-hdfs/kmeans-data.csv", "hdfs:///tmp/csv-vs-parquet-hdfs/kmeans-data.parquet"]
            query = ["select * from input", "select `0`, `22` from input where `0` < -0.9"]
            cache = false
          }
        ]
      }
    ]
  }]
}
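For reference, the first suite's sql workload is what converts the generated CSV into Parquet: spark-bench loads the input, registers it as a temporary view named input (which is why the queries say "from input"), runs the configured query, and writes the result to the output path. The Scala sketch below is a minimal stand-alone approximation of that step, not spark-bench's actual implementation; the header/inferSchema read options and the app name are assumptions for illustration.

// Minimal sketch of the CSV-to-Parquet sql workload, under the assumptions
// above. spark-bench's real code differs; this only shows the shape of the step.
import org.apache.spark.sql.SparkSession

object CsvToParquetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("csv-to-parquet-sketch").getOrCreate()

    // Load the dataset produced by the data-generation-kmeans workload.
    val df = spark.read
      .option("header", "true")      // assumption: the generator wrote a header row
      .option("inferSchema", "true") // assumption: infer numeric column types
      .csv("hdfs:///tmp/csv-vs-parquet-hdfs/kmeans-data.csv")

    // Expose it under the name the configured query expects.
    df.createOrReplaceTempView("input")
    val result = spark.sql("select * from input")

    // Write the same data back out in Parquet format.
    result.write.mode("overwrite")
      .parquet("hdfs:///tmp/csv-vs-parquet-hdfs/kmeans-data.parquet")

    spark.stop()
  }
}

With the config saved to a file (the name is arbitrary, e.g. csv-vs-parquet.conf), the whole run is launched through spark-bench's wrapper script: ./bin/spark-bench.sh csv-vs-parquet.conf. The principal and keytab spark-args are only needed on a Kerberos-secured YARN cluster and can be dropped otherwise. In the second suite, spark-bench expands the two-element input and query lists into every input/query combination, which is what produces the CSV-versus-Parquet timing comparison recorded in results-sql.csv.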