diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-updated2-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-updated2-column-stats-index-table.json new file mode 100644 index 0000000000000..b5882b53fcff2 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/cow-updated2-column-stats-index-table.json @@ -0,0 +1,13 @@ +{"c1_maxValue":101,"c1_minValue":101,"c1_nullCount":0,"c2_maxValue":" 999sdc","c2_minValue":" 999sdc","c2_nullCount":0,"c3_maxValue":10.329,"c3_minValue":10.329,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.179-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":99,"c5_minValue":99,"c5_nullCount":0,"c6_maxValue":"2020-03-28","c6_minValue":"2020-03-28","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"SA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":1} +{"c1_maxValue":568,"c1_minValue":8,"c1_nullCount":0,"c2_maxValue":" 8sdc","c2_minValue":" 111sdc","c2_nullCount":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_nullCount":0,"c5_maxValue":58,"c5_minValue":2,"c5_nullCount":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":15} +{"c1_maxValue":715,"c1_minValue":76,"c1_nullCount":0,"c2_maxValue":" 76sdc","c2_minValue":" 
224sdc","c2_nullCount":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_nullCount":0,"c5_maxValue":73,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_nullCount":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":12} +{"c1_maxValue":768,"c1_minValue":59,"c1_nullCount":0,"c2_maxValue":" 768sdc","c2_minValue":" 118sdc","c2_nullCount":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":7,"c5_nullCount":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_nullCount":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":7} +{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9} +{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 985sdc","c2_minValue":" 
309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9} +{"c1_maxValue":770,"c1_minValue":129,"c1_nullCount":0,"c2_maxValue":" 770sdc","c2_minValue":" 129sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":14,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":6} +{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8} +{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 987sdc","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8} +{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 984sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-18T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 
181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13} +{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 989sdc","c2_minValue":" 181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-02-25","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-updated2-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-updated2-column-stats-index-table.json new file mode 100644 index 0000000000000..0c048b5c5fd16 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/mor-updated2-column-stats-index-table.json @@ -0,0 +1,13 @@ +{"c1_maxValue":101,"c1_minValue":101,"c1_nullCount":0,"c2_maxValue":" 999sdc","c2_minValue":" 999sdc","c2_nullCount":0,"c3_maxValue":10.329,"c3_minValue":10.329,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.179-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":99,"c5_minValue":99,"c5_nullCount":0,"c6_maxValue":"2020-03-28","c6_minValue":"2020-03-28","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"SA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":1} 
+{"c1_maxValue":562,"c1_minValue":323,"c1_nullCount":0,"c2_maxValue":" 984sdc","c2_minValue":" 980sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.181-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":34,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"SA==","c7_minValue":"qw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":4} +{"c1_maxValue":568,"c1_minValue":8,"c1_nullCount":0,"c2_maxValue":" 8sdc","c2_minValue":" 111sdc","c2_nullCount":0,"c3_maxValue":979.272,"c3_minValue":82.111,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.193-08:00","c4_minValue":"2021-11-18T23:34:44.159-08:00","c4_nullCount":0,"c5_maxValue":58,"c5_minValue":2,"c5_nullCount":0,"c6_maxValue":"2020-11-08","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"9g==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":15} +{"c1_maxValue":619,"c1_minValue":619,"c1_nullCount":0,"c2_maxValue":" 985sdc","c2_minValue":" 985sdc","c2_nullCount":0,"c3_maxValue":230.320,"c3_minValue":230.320,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.180-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":33,"c5_minValue":33,"c5_nullCount":0,"c6_maxValue":"2020-02-13","c6_minValue":"2020-02-13","c6_nullCount":0,"c7_maxValue":"QA==","c7_minValue":"QA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":1} +{"c1_maxValue":633,"c1_minValue":624,"c1_nullCount":0,"c2_maxValue":" 987sdc","c2_minValue":" 
986sdc","c2_nullCount":0,"c3_maxValue":580.317,"c3_minValue":375.308,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.180-08:00","c4_minValue":"2021-11-18T23:34:44.180-08:00","c4_nullCount":0,"c5_maxValue":33,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"PQ==","c7_minValue":"NA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":2} +{"c1_maxValue":639,"c1_minValue":555,"c1_nullCount":0,"c2_maxValue":" 989sdc","c2_minValue":" 982sdc","c2_nullCount":0,"c3_maxValue":904.304,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.186-08:00","c4_minValue":"2021-11-18T23:34:44.179-08:00","c4_nullCount":0,"c5_maxValue":44,"c5_minValue":31,"c5_nullCount":0,"c6_maxValue":"2020-08-25","c6_minValue":"2020-03-12","c6_nullCount":0,"c7_maxValue":"MA==","c7_minValue":"rw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":3} +{"c1_maxValue":715,"c1_minValue":76,"c1_nullCount":0,"c2_maxValue":" 76sdc","c2_minValue":" 224sdc","c2_nullCount":0,"c3_maxValue":958.579,"c3_minValue":246.427,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.199-08:00","c4_minValue":"2021-11-18T23:34:44.166-08:00","c4_nullCount":0,"c5_maxValue":73,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-16","c6_nullCount":0,"c7_maxValue":"+g==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":12} +{"c1_maxValue":768,"c1_minValue":59,"c1_nullCount":0,"c2_maxValue":" 768sdc","c2_minValue":" 
118sdc","c2_nullCount":0,"c3_maxValue":959.131,"c3_minValue":64.768,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.164-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":7,"c5_nullCount":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-05-04","c6_nullCount":0,"c7_maxValue":"zw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":7} +{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.543-08:00","c4_minValue":"2021-11-19T20:40:55.521-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":32,"c5_nullCount":0,"c6_maxValue":"2020-11-14","c6_minValue":"2020-01-08","c6_nullCount":0,"c7_maxValue":"uQ==","c7_minValue":"AQ==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":9} +{"c1_maxValue":770,"c1_minValue":129,"c1_nullCount":0,"c2_maxValue":" 770sdc","c2_minValue":" 129sdc","c2_nullCount":0,"c3_maxValue":977.328,"c3_minValue":153.431,"c3_nullCount":0,"c4_maxValue":"2021-11-18T23:34:44.201-08:00","c4_minValue":"2021-11-18T23:34:44.169-08:00","c4_nullCount":0,"c5_maxValue":78,"c5_minValue":14,"c5_nullCount":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-15","c6_nullCount":0,"c7_maxValue":"rw==","c7_minValue":"Ag==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":6} +{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 
0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.339-08:00","c4_nullCount":0,"c5_maxValue":94,"c5_minValue":1,"c5_nullCount":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-01","c6_nullCount":0,"c7_maxValue":"xw==","c7_minValue":"AA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":8} +{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.549-08:00","c4_minValue":"2021-11-19T20:40:55.508-08:00","c4_nullCount":0,"c5_maxValue":95,"c5_minValue":10,"c5_nullCount":0,"c6_maxValue":"2020-10-10","c6_minValue":"2020-01-10","c6_nullCount":0,"c7_maxValue":"yA==","c7_minValue":"LA==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"c4_maxValue":"2021-11-19T20:40:55.550-08:00","c4_minValue":"2021-11-19T20:40:55.507-08:00","c4_nullCount":0,"c5_maxValue":97,"c5_minValue":9,"c5_nullCount":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-23","c6_nullCount":0,"c7_maxValue":"1Q==","c7_minValue":"Kw==","c7_nullCount":0,"c8_maxValue":9,"c8_minValue":9,"c8_nullCount":0,"valueCount":13} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..27b320cc21e43 --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/update-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,11 @@ +{"c1":323,"c2":" 980sdc","c3":335.770,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-01-15","c7":"Ag==","c8":9} +{"c1":326,"c2":" 981sdc","c3":64.768,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-10-13","c7":"AA==","c8":9} +{"c1":555,"c2":" 982sdc","c3":153.431,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-03-12","c7":"rw==","c8":9} +{"c1":556,"c2":" 983sdc","c3":246.427,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-10-08","c7":"qw==","c8":9} +{"c1":562,"c2":" 984sdc","c3":977.328,"c4":"2021-11-18T23:34:44.181-08:00","c5":34,"c6":"2020-10-21","c7":"SA==","c8":9} +{"c1":619,"c2":" 985sdc","c3":230.320,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-02-13","c7":"QA==","c8":9} +{"c1":624,"c2":" 986sdc","c3":580.317,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-10-10","c7":"PQ==","c8":9} +{"c1":633,"c2":" 987sdc","c3":375.308,"c4":"2021-11-18T23:34:44.180-08:00","c5":32,"c6":"2020-01-01","c7":"NA==","c8":9} +{"c1":638,"c2":" 988sdc","c3":904.304,"c4":"2021-11-18T23:34:44.179-08:00","c5":32,"c6":"2020-08-25","c7":"MA==","c8":9} +{"c1":639,"c2":" 989sdc","c3":398.300,"c4":"2021-11-18T23:34:44.179-08:00","c5":31,"c6":"2020-04-21","c7":"LA==","c8":9} +{"c1":101,"c2":" 999sdc","c3":10.329,"c4":"2021-11-18T23:34:44.179-08:00","c5":99,"c6":"2020-03-28","c7":"SF==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index 822d2051cb024..2c17bb8cddefd 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -24,6 +24,7 
@@ import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig} @@ -31,12 +32,12 @@ import org.apache.hudi.functional.TestColumnStatsIndex.ColumnStatsTestCase import org.apache.hudi.testutils.HoodieClientTestBase import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} import org.apache.spark.sql._ -import org.apache.spark.sql.functions.{col, lit, typedLit} +import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.types._ import org.junit.jupiter.api.Assertions.{assertEquals, assertNotNull, assertTrue} import org.junit.jupiter.api._ import org.junit.jupiter.params.ParameterizedTest -import org.junit.jupiter.params.provider.{Arguments, ArgumentsSource, MethodSource, ValueSource} +import org.junit.jupiter.params.provider.{Arguments, MethodSource, ValueSource} import java.math.BigInteger import java.sql.{Date, Timestamp} @@ -84,10 +85,11 @@ class TestColumnStatsIndex extends HoodieClientTestBase { HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" ) - val opts = Map( + val commonOpts = Map( "hoodie.insert.shuffle.parallelism" -> "4", "hoodie.upsert.shuffle.parallelism" -> "4", HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + DataSourceWriteOptions.TABLE_TYPE.key -> testCase.tableType.toString, RECORDKEY_FIELD.key -> "c1", PRECOMBINE_FIELD.key -> "c1", // NOTE: Currently only this setting is used like following by different MT partitions: @@ -97,88 +99,28 @@ class TestColumnStatsIndex extends HoodieClientTestBase { HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" ) 
++ metadataOpts - val sourceJSONTablePath = getClass.getClassLoader.getResource("index/colstats/input-table-json").toString - - // NOTE: Schema here is provided for validation that the input date is in the appropriate format - val inputDF = spark.read.schema(sourceTableSchema).json(sourceJSONTablePath) - - inputDF - .sort("c1") - .repartition(4, new Column("c1")) - .write - .format("hudi") - .options(opts) - .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024) - .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) - .mode(SaveMode.Overwrite) - .save(basePath) - - metaClient = HoodieTableMetaClient.reload(metaClient) - - val metadataConfig = HoodieMetadataConfig.newBuilder() - .fromProperties(toProperties(metadataOpts)) - .build() - - val requestedColumns: Seq[String] = sourceTableSchema.fieldNames - - val columnStatsIndex = new ColumnStatsIndexSupport(spark, sourceTableSchema, metadataConfig, metaClient) - - val expectedColStatsSchema = composeIndexSchema(sourceTableSchema.fieldNames, sourceTableSchema) - - columnStatsIndex.loadTransposed(requestedColumns, testCase.shouldReadInMemory) { transposedColStatsDF => - // Match against expected column stats table - val expectedColStatsIndexTableDf = - spark.read - .schema(expectedColStatsSchema) - .json(getClass.getClassLoader.getResource("index/colstats/column-stats-index-table.json").toString) - - assertEquals(expectedColStatsIndexTableDf.schema, transposedColStatsDF.schema) - // NOTE: We have to drop the `fileName` column as it contains semi-random components - // that we can't control in this test. 
Nevertheless, since we manually verify composition of the - // ColStats Index by reading Parquet footers from individual Parquet files, this is not an issue - assertEquals(asJson(sort(expectedColStatsIndexTableDf)), asJson(sort(transposedColStatsDF.drop("fileName")))) - - // Collect Column Stats manually (reading individual Parquet files) - val manualColStatsTableDF = - buildColumnStatsTableManually(basePath, sourceTableSchema.fieldNames, sourceTableSchema.fieldNames, expectedColStatsSchema) - - assertEquals(asJson(sort(manualColStatsTableDF)), asJson(sort(transposedColStatsDF))) - } - - // do an upsert and validate - val updateJSONTablePath = getClass.getClassLoader.getResource("index/colstats/another-input-table-json").toString - val updateDF = spark.read - .schema(sourceTableSchema) - .json(updateJSONTablePath) - - updateDF.repartition(4) - .write - .format("hudi") - .options(opts) - .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024) - .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) - .mode(SaveMode.Append) - .save(basePath) - - metaClient = HoodieTableMetaClient.reload(metaClient) - - val updatedColumnStatsIndex = new ColumnStatsIndexSupport(spark, sourceTableSchema, metadataConfig, metaClient) - - updatedColumnStatsIndex.loadTransposed(requestedColumns, testCase.shouldReadInMemory) { transposedUpdatedColStatsDF => - val expectedColStatsIndexUpdatedDF = - spark.read - .schema(expectedColStatsSchema) - .json(getClass.getClassLoader.getResource("index/colstats/updated-column-stats-index-table.json").toString) - - assertEquals(expectedColStatsIndexUpdatedDF.schema, transposedUpdatedColStatsDF.schema) - assertEquals(asJson(sort(expectedColStatsIndexUpdatedDF)), asJson(sort(transposedUpdatedColStatsDF.drop("fileName")))) - - // Collect Column Stats manually (reading individual Parquet files) - val manualUpdatedColStatsTableDF = - buildColumnStatsTableManually(basePath, sourceTableSchema.fieldNames, 
sourceTableSchema.fieldNames, expectedColStatsSchema) - - assertEquals(asJson(sort(manualUpdatedColStatsTableDF)), asJson(sort(transposedUpdatedColStatsDF))) + doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/input-table-json", + expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Overwrite) + + doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/another-input-table-json", + expectedColStatsSourcePath = "index/colstats/updated-column-stats-index-table.json", + operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append) + + val expectedColStatsSourcePath = if (testCase.tableType == HoodieTableType.COPY_ON_WRITE) { + "index/colstats/cow-updated2-column-stats-index-table.json" + } else { + "index/colstats/mor-updated2-column-stats-index-table.json" } + doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, + dataSourcePath = "index/colstats/update-input-table-json", + expectedColStatsSourcePath = expectedColStatsSourcePath, + operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, + saveMode = SaveMode.Append) } @ParameterizedTest @@ -365,6 +307,42 @@ class TestColumnStatsIndex extends HoodieClientTestBase { }) } + private def doWriteAndValidateColumnStats(testCase: ColumnStatsTestCase, + metadataOpts: Map[String, String], + hudiOpts: Map[String, String], + dataSourcePath: String, + expectedColStatsSourcePath: String, + operation: String, + saveMode: SaveMode): Unit = { + val sourceJSONTablePath = getClass.getClassLoader.getResource(dataSourcePath).toString + + // NOTE: Schema here is provided for validation that the input date is in the appropriate format + val inputDF = spark.read.schema(sourceTableSchema).json(sourceJSONTablePath) + + inputDF + .sort("c1") + .repartition(4, new Column("c1")) + .write + 
.format("hudi") + .options(hudiOpts) + .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024) + .option(DataSourceWriteOptions.OPERATION.key, operation) + .mode(saveMode) + .save(basePath) + + metaClient = HoodieTableMetaClient.reload(metaClient) + + // Only parquet files are supported for the validation against the generated column stats, + // constructing the column stats from parquet data files using Spark SQL and comparing that + // with column stats index. This means that the following operations are supported for such + // validation: (1) COW: all operations; (2) MOR: insert only. + val validateColumnStatsAgainstDataFiles = + (testCase.tableType == HoodieTableType.COPY_ON_WRITE + || operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)) + validateColumnStatsIndex( + testCase, metadataOpts, expectedColStatsSourcePath, validateColumnStatsAgainstDataFiles) + } + private def buildColumnStatsTableManually(tablePath: String, includedCols: Seq[String], indexedCols: Seq[String], @@ -411,6 +389,45 @@ class TestColumnStatsIndex extends HoodieClientTestBase { ) } + private def validateColumnStatsIndex(testCase: ColumnStatsTestCase, + metadataOpts: Map[String, String], + expectedColStatsSourcePath: String, + validateColumnStatsAgainstDataFiles: Boolean): Unit = { + val metadataConfig = HoodieMetadataConfig.newBuilder() + .fromProperties(toProperties(metadataOpts)) + .build() + + val columnStatsIndex = new ColumnStatsIndexSupport(spark, sourceTableSchema, metadataConfig, metaClient) + + val expectedColStatsSchema = composeIndexSchema(sourceTableSchema.fieldNames, sourceTableSchema) + val validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue", "c2_minValue") + + columnStatsIndex.loadTransposed(sourceTableSchema.fieldNames, testCase.shouldReadInMemory) { transposedColStatsDF => + // Match against expected column stats table + val expectedColStatsIndexTableDf = + spark.read + .schema(expectedColStatsSchema) +
.json(getClass.getClassLoader.getResource(expectedColStatsSourcePath).toString) + + assertEquals(expectedColStatsIndexTableDf.schema, transposedColStatsDF.schema) + // NOTE: We have to drop the `fileName` column as it contains semi-random components + // that we can't control in this test. Nevertheless, since we manually verify composition of the + // ColStats Index by reading Parquet footers from individual Parquet files, this is not an issue + assertEquals(asJson(sort(expectedColStatsIndexTableDf, validationSortColumns)), + asJson(sort(transposedColStatsDF.drop("fileName"), validationSortColumns))) + + if (validateColumnStatsAgainstDataFiles) { + // TODO(HUDI-4557): support validation of column stats of avro log files + // Collect Column Stats manually (reading individual Parquet files) + val manualColStatsTableDF = + buildColumnStatsTableManually(basePath, sourceTableSchema.fieldNames, sourceTableSchema.fieldNames, expectedColStatsSchema) + + assertEquals(asJson(sort(manualColStatsTableDF, validationSortColumns)), + asJson(sort(transposedColStatsDF, validationSortColumns))) + } + } + } + private def generateRandomDataFrame(spark: SparkSession): DataFrame = { val sourceTableSchema = new StructType() @@ -419,9 +436,9 @@ class TestColumnStatsIndex extends HoodieClientTestBase { // NOTE: We're testing different values for precision of the decimal to make sure // we execute paths bearing different underlying representations in Parquet // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL - .add("c3a", DecimalType(9,3)) - .add("c3b", DecimalType(10,3)) - .add("c3c", DecimalType(20,3)) + .add("c3a", DecimalType(9, 3)) + .add("c3b", DecimalType(10, 3)) + .add("c3c", DecimalType(20, 3)) .add("c4", TimestampType) .add("c5", ShortType) .add("c6", DateType) @@ -456,24 +473,28 @@ class TestColumnStatsIndex extends HoodieClientTestBase { .mkString("\n") private def sort(df: DataFrame): DataFrame = { + sort(df, Seq("c1_maxValue", "c1_minValue")) 
+ } + + private def sort(df: DataFrame, sortColumns: Seq[String]): DataFrame = { val sortedCols = df.columns.sorted - // Sort dataset by the first 2 columns (to minimize non-determinism in case multiple files have the same + // Sort dataset by specified columns (to minimize non-determinism in case multiple files have the same // value of the first column) df.select(sortedCols.head, sortedCols.tail: _*) - .sort("c1_maxValue", "c1_minValue") + .sort(sortColumns.head, sortColumns.tail: _*) } - } object TestColumnStatsIndex { - case class ColumnStatsTestCase(forceFullLogScan: Boolean, shouldReadInMemory: Boolean) + case class ColumnStatsTestCase(tableType: HoodieTableType, forceFullLogScan: Boolean, shouldReadInMemory: Boolean) - def testMetadataColumnStatsIndexParams: java.util.stream.Stream[Arguments] = - java.util.stream.Stream.of( - Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = false, shouldReadInMemory = true)), - Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = false, shouldReadInMemory = false)), - Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = true, shouldReadInMemory = false)), - Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = true, shouldReadInMemory = true)) - ) + def testMetadataColumnStatsIndexParams: java.util.stream.Stream[Arguments] = { + java.util.stream.Stream.of(HoodieTableType.values().toStream.flatMap(tableType => + Seq(Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan = false, shouldReadInMemory = true)), + Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan = false, shouldReadInMemory = false)), + Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan = true, shouldReadInMemory = false)), + Arguments.arguments(ColumnStatsTestCase(tableType, forceFullLogScan = true, shouldReadInMemory = true))) + ): _*) + } }