In [1]:
#open spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("A2-Part2-Pipeline").getOrCreate()

#load reviews and stopwords
DEBUG = True
if DEBUG:
    RAW_PATH = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
else:
    RAW_PATH = "hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json"
stopwordsPath = "Exercise_1/assets/stopwords.txt"
#define structure of json for faster reading
from pyspark.sql import types as T
review_schema = T.StructType([
     T.StructField("reviewerID",      T.StringType(),  True),
     T.StructField("asin",            T.StringType(),  True),
     T.StructField("reviewerName",    T.StringType(),  True),
     T.StructField("helpful",         T.ArrayType(T.IntegerType()), True),
     T.StructField("reviewText",      T.StringType(),  True),
     T.StructField("overall",         T.FloatType(),   True),
     T.StructField("summary",         T.StringType(),  True),
     T.StructField("unixReviewTime",  T.LongType(),    True),
     T.StructField("reviewTime",      T.StringType(),  True),
     T.StructField("category",        T.StringType(),  True),
 ])
#read and select category and review
df = (
    spark.read
         .schema(review_schema)
         .json(RAW_PATH)
         .selectExpr("reviewText AS text",
                     "category AS label")
    .na.drop(subset=["text", "label"])
)

# reading the stopwords
stopwords = spark.sparkContext.textFile(stopwordsPath).collect()


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.
25/05/10 02:13:57 WARN Utils: Service 'SparkUI' could not bind on port 4047. Attempting port 4048.
25/05/10 02:13:59 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


                                                                                

In [5]:
# ─────────────────────────────────────────────────────────────────────────────
#  Part 3 – Multiclass Text Classification with Linear-SVM  (PySpark)
# ─────────────────────────────────────────────────────────────────────────────
#
#  Prerequisite: the DataFrame `df` from Part 2
#                columns:  text(string) , label(string|int)
#  Spark ≥ 3.2.0 recommended
# ─────────────────────────────────────────────────────────────────────────────

from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, CountVectorizer, IDF,
    Normalizer, ChiSqSelector, StringIndexer
)
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.sql import functions as F

# ─────────────────────────────────────────────────────────────────────────────
#  1.  Reproducible split: 80 % trainVal / 20 % test
#      … and inside the CV step below: another 80 % / 20 % split to get
#      train vs. *validation* (→ effective 64 % train · 16 % val · 20 % test)
# ─────────────────────────────────────────────────────────────────────────────
SEED = 567
train_val_df, test_df = df.randomSplit([0.8, 0.2], seed=SEED)
print(f"Train+Val: {train_val_df.count():,}  ·  Test: {test_df.count():,}")

# ─────────────────────────────────────────────────────────────────────────────
#  2.  String-labels → numeric (required for SVM)
# ─────────────────────────────────────────────────────────────────────────────
label_indexer = StringIndexer(inputCol="label", outputCol="label_idx").fit(df)

# ─────────────────────────────────────────────────────────────────────────────
#  3.  Shared preprocessing stages     (exactly as in Part 2)
# ─────────────────────────────────────────────────────────────────────────────
tokenizer = RegexTokenizer(
    inputCol="text", outputCol="tokens",
    pattern=r"""[ \t0-9()\[\]{}.!?,;:+=\-_"'`~#@&*%€$§\\/]+""",
    gaps=True, toLowercase=True)

stopper = StopWordsRemover(inputCol="tokens", outputCol="clean_tokens")

cv = CountVectorizer(
    inputCol="clean_tokens", outputCol="tf",
    minDF=2, vocabSize=50_000)        # keep generous ceiling

idf = IDF(inputCol="tf", outputCol="tfidf")

#  Normalise TF-IDF vectors
normaliser = Normalizer(inputCol="tfidf", outputCol="features", p=2.0)

#  Two χ² selectors (2 000 vs. 500 features)
selector2k = ChiSqSelector(
    numTopFeatures=2_000,
    featuresCol="features", outputCol="chi2_features",
    labelCol="label_idx")

selector500 = selector2k.copy({selector2k.numTopFeatures: 500})

#  SVM wrapped for multi-class
svm = LinearSVC(featuresCol="chi2_features",
                labelCol="label_idx",
                predictionCol="prediction")

ovr = OneVsRest(classifier=svm,
                labelCol="label_idx",
                featuresCol="chi2_features",
                predictionCol="prediction")

# ─────────────────────────────────────────────────────────────────────────────
#  4.  Pipeline (place-holders; grid will swap in selector2k vs. selector500)
# ─────────────────────────────────────────────────────────────────────────────
pipe = Pipeline(stages=[
    label_indexer,         # [0]
    tokenizer, stopper, cv, idf, normaliser,
    selector2k,            # [6] will be overridden by grid
    ovr                    # [7]
])

# ─────────────────────────────────────────────────────────────────────────────
#  5.  Param grid
#      • χ²: 2 000  vs  500
#      • regParam:  0.01, 0.1, 1.0
#      • standardization:  True / False
#      • maxIter:  50 / 200
# ─────────────────────────────────────────────────────────────────────────────
param_grid = (
    ParamGridBuilder()
    # swap χ²-selector stage
    #.addGrid(pipe.stages[6], selector2k)
    # SVM hyper-params (access the underlying LinearSVC inside the OvR)
    .addGrid(svm.regParam,        [0.01, 0.1, 1.0])
    .addGrid(svm.standardization, [True, False])
    .addGrid(svm.maxIter,         [50, 200])
    .build()
)

print(f"Grid size: {len(param_grid):,} parameter combos")

# ─────────────────────────────────────────────────────────────────────────────
#  6.  Train-validation split inside **TrainValidationSplit**
#      (20 % of trainVal_df is held out for validation)
# ─────────────────────────────────────────────────────────────────────────────
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_idx",
    predictionCol="prediction",
    metricName="f1")

tvs = TrainValidationSplit(
    estimator=pipe,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=0.8,           # 80 % train, 20 % val (of trainVal_df)
    seed=SEED,
    parallelism=4)            # adjust to cluster resources

# ─────────────────────────────────────────────────────────────────────────────
#  7.  Fit grid & choose best model
# ─────────────────────────────────────────────────────────────────────────────
tvs_model = tvs.fit(train_val_df)
best_model = tvs_model.bestModel
best_params = {k.name: v for k, v in best_model.stages[-1]  # OvR → LinearSVCModel
                                       .getClassifier()
                                       .extractParamMap().items()}
print("Best params:", best_params)

# ─────────────────────────────────────────────────────────────────────────────
#  8.  Final evaluation on the untouched test set
# ─────────────────────────────────────────────────────────────────────────────
pred_test = best_model.transform(test_df)
f1_test = evaluator.evaluate(pred_test)
print(f"Test F1 (macro): {f1_test:.4f}")


Train+Val: 63,040  ·  Test: 15,789
Grid size: 12 parameter combos


                                                                                

25/05/10 02:25:09 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:25:14 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:25:20 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:25:20 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB


                                                                                

25/05/10 02:25:25 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB
25/05/10 02:25:25 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB
25/05/10 02:25:25 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:25:34 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB
25/05/10 02:25:34 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


                                                                                

25/05/10 02:25:39 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


                                                                                

25/05/10 02:25:48 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


                                                                                

25/05/10 02:25:58 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


[Stage 61:>   (0 + 1) / 2][Stage 63:>   (0 + 0) / 2][Stage 64:==> (1 + 1) / 2]

25/05/10 02:26:09 WARN TaskSetManager: Lost task 0.0 in stage 61.0 (TID 86) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasN

[Stage 61:>   (0 + 1) / 2][Stage 63:>   (0 + 1) / 2][Stage 65:>   (0 + 0) / 2]

25/05/10 02:26:21 WARN TaskSetManager: Lost task 0.0 in stage 63.0 (TID 93) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasN

[Stage 61:>   (0 + 2) / 2][Stage 63:>   (0 + 0) / 2][Stage 65:>   (0 + 0) / 2]

25/05/10 02:26:21 WARN TaskSetManager: Lost task 0.3 in stage 61.0 (TID 94) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasN

[Stage 63:>   (0 + 1) / 2][Stage 65:>   (0 + 1) / 2][Stage 66:>   (0 + 0) / 2]

Py4JJavaError: An error occurred while calling o1024.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 61.0 failed 4 times, most recent failure: Lost task 0.3 in stage 61.0 (TID 94) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.convert.Wrappers$IteratorWrapper.hasNext(Wrappers.scala:32)
	at org.sparkproject.guava.collect.TopKSelector.offerAll(TopKSelector.java:248)
	at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:785)
	at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
	at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$2(RDD.scala:1539)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2668)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2604)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2603)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2603)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1178)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1178)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1178)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2798)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2787)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2333)
	at org.apache.spark.rdd.RDD.$anonfun$reduce$1(RDD.scala:1111)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1093)
	at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$1(RDD.scala:1545)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1533)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:204)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3908)
	at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:3160)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3898)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3896)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3896)
	at org.apache.spark.sql.Dataset.collect(Dataset.scala:3160)
	at org.apache.spark.ml.feature.Selector.getTopIndices$1(Selector.scala:216)
	at org.apache.spark.ml.feature.Selector.fit(Selector.scala:222)
	at org.apache.spark.ml.feature.ChiSqSelector.fit(ChiSqSelector.scala:111)
	at org.apache.spark.ml.feature.ChiSqSelector.fit(ChiSqSelector.scala:49)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.convert.Wrappers$IteratorWrapper.hasNext(Wrappers.scala:32)
	at org.sparkproject.guava.collect.TopKSelector.offerAll(TopKSelector.java:248)
	at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:785)
	at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
	at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$2(RDD.scala:1539)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


25/05/10 02:26:24 WARN TaskSetManager: Lost task 0.0 in stage 65.0 (TID 97) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasN

[Stage 63:>   (0 + 2) / 2][Stage 65:>   (0 + 0) / 2][Stage 66:>   (0 + 0) / 2]

25/05/10 02:26:25 WARN TaskSetManager: Lost task 1.0 in stage 63.0 (TID 98) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasN

[Stage 63:>   (0 + 1) / 2][Stage 65:>   (0 + 1) / 2][Stage 66:>   (0 + 0) / 2]

25/05/10 02:26:31 ERROR TaskSetManager: Task 0 in stage 63.0 failed 4 times; aborting job
25/05/10 02:26:31 WARN TaskSetManager: Lost task 1.2 in stage 63.0 (TID 104) (lbdwo17.datalab.novalocal executor 1): TaskKilled (Stage cancelled)


[Stage 65:>   (0 + 2) / 2][Stage 66:>   (0 + 0) / 2][Stage 68:>   (0 + 0) / 2]

25/05/10 02:26:32 WARN TaskSetManager: Lost task 1.0 in stage 65.0 (TID 105) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.has

[Stage 65:>   (0 + 1) / 2][Stage 66:>   (0 + 1) / 2][Stage 68:>   (0 + 0) / 2]

25/05/10 02:26:35 WARN TaskSetManager: Lost task 0.3 in stage 65.0 (TID 107) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.has

[Stage 67:>   (0 + 2) / 2][Stage 70:==> (1 + 0) / 2][Stage 72:>   (0 + 0) / 2]

25/05/10 02:26:51 WARN TaskSetManager: Lost task 0.0 in stage 67.0 (TID 113) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.has

[Stage 67:>   (0 + 2) / 2][Stage 69:==> (1 + 0) / 2][Stage 70:==> (1 + 0) / 2]

25/05/10 02:26:52 WARN TaskSetManager: Lost task 1.0 in stage 67.0 (TID 114) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.has

[Stage 67:>   (0 + 2) / 2][Stage 71:==> (1 + 0) / 2][Stage 72:>   (0 + 0) / 2]  

25/05/10 02:27:00 ERROR TaskSetManager: Task 1 in stage 67.0 failed 4 times; aborting job
25/05/10 02:27:00 WARN TaskSetManager: Lost task 0.3 in stage 67.0 (TID 124) (lbdwo15.datalab.novalocal executor 2): TaskKilled (Stage cancelled)


                                                                                

25/05/10 02:27:18 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:27:24 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:27:27 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB


                                                                                

25/05/10 02:27:29 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:27:32 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB


                                                                                

25/05/10 02:27:34 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:27:37 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB


                                                                                

25/05/10 02:27:42 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB
25/05/10 02:27:42 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB


                                                                                

25/05/10 02:27:50 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


                                                                                

25/05/10 02:27:56 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


                                                                                

25/05/10 02:28:06 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


[Stage 99:>   (0 + 1) / 2][Stage 102:=> (1 + 1) / 2][Stage 104:>  (0 + 0) / 2]

25/05/10 02:28:08 WARN TaskSetManager: Lost task 0.0 in stage 99.0 (TID 171) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.has

[Stage 99:>   (0 + 1) / 2][Stage 103:>  (0 + 0) / 2][Stage 104:>  (0 + 1) / 2]

25/05/10 02:28:22 WARN TaskSetManager: Lost task 0.3 in stage 99.0 (TID 181) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.has

[Stage 103:>  (0 + 2) / 2][Stage 104:=> (1 + 0) / 2][Stage 106:>  (0 + 0) / 2]

25/05/10 02:28:24 WARN TaskSetManager: Lost task 1.0 in stage 103.0 (TID 184) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 103:>  (0 + 1) / 2][Stage 104:=> (1 + 1) / 2][Stage 106:>  (0 + 0) / 2]

25/05/10 02:28:25 WARN TaskSetManager: Lost task 0.0 in stage 103.0 (TID 183) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 106:>                (0 + 1) / 2][Stage 108:>                (0 + 0) / 2]

25/05/10 02:28:35 WARN TaskSetManager: Lost task 1.3 in stage 103.0 (TID 190) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 105:>  (0 + 1) / 2][Stage 106:>  (0 + 1) / 2][Stage 108:>  (0 + 0) / 2]

25/05/10 02:28:37 WARN TaskSetManager: Lost task 0.0 in stage 105.0 (TID 193) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 105:>  (0 + 1) / 2][Stage 106:=> (1 + 1) / 2][Stage 108:>  (0 + 0) / 2]

25/05/10 02:28:49 WARN TaskSetManager: Lost task 0.3 in stage 105.0 (TID 200) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 107:>  (0 + 1) / 2][Stage 108:=> (1 + 1) / 2][Stage 110:>  (0 + 0) / 2]

25/05/10 02:28:58 WARN TaskSetManager: Lost task 0.0 in stage 107.0 (TID 204) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 107:>  (0 + 2) / 2][Stage 109:>  (0 + 0) / 2][Stage 110:>  (0 + 0) / 2]

25/05/10 02:28:59 WARN TaskSetManager: Lost task 1.0 in stage 107.0 (TID 205) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 107:>  (0 + 2) / 2][Stage 110:=> (1 + 0) / 2][Stage 112:>  (0 + 0) / 2]  

25/05/10 02:29:08 WARN TaskSetManager: Lost task 0.3 in stage 107.0 (TID 213) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

                                                                                

25/05/10 02:29:26 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:29:30 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:29:35 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB


                                                                                

25/05/10 02:29:36 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB
25/05/10 02:29:36 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:29:41 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB


                                                                                

25/05/10 02:29:48 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB
25/05/10 02:29:48 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB
25/05/10 02:29:48 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB


                                                                                

25/05/10 02:29:56 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


[Stage 137:>  (0 + 2) / 2][Stage 140:=> (1 + 0) / 2][Stage 142:>  (0 + 0) / 2]

25/05/10 02:30:07 WARN TaskSetManager: Lost task 0.0 in stage 137.0 (TID 259) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 137:>  (0 + 2) / 2][Stage 139:=> (1 + 0) / 2][Stage 140:=> (1 + 0) / 2]

25/05/10 02:30:08 WARN TaskSetManager: Lost task 1.0 in stage 137.0 (TID 260) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 137:>  (0 + 2) / 2][Stage 141:>  (0 + 0) / 2][Stage 142:>  (0 + 0) / 2]  

25/05/10 02:30:16 ERROR TaskSetManager: Task 1 in stage 137.0 failed 4 times; aborting job
25/05/10 02:30:16 WARN TaskSetManager: Lost task 0.3 in stage 137.0 (TID 269) (lbdwo17.datalab.novalocal executor 1): TaskKilled (Stage cancelled)
25/05/10 02:30:16 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB


[Stage 143:>  (0 + 1) / 2][Stage 144:=> (1 + 1) / 2][Stage 146:>  (0 + 0) / 2]2]

25/05/10 02:30:35 WARN TaskSetManager: Lost task 0.0 in stage 143.0 (TID 276) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 143:>  (0 + 2) / 2][Stage 145:>  (0 + 0) / 2][Stage 146:>  (0 + 0) / 2]2]

25/05/10 02:30:36 WARN TaskSetManager: Lost task 1.0 in stage 143.0 (TID 277) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 143:>  (0 + 1) / 2][Stage 145:>  (0 + 1) / 2][Stage 146:>  (0 + 0) / 2]

25/05/10 02:30:39 WARN TaskSetManager: Lost task 0.0 in stage 145.0 (TID 279) (lbdwo15.datalab.novalocal executor 2): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 143:>  (0 + 1) / 2][Stage 145:>  (0 + 1) / 2][Stage 146:>  (0 + 0) / 2]

25/05/10 02:30:46 WARN TaskSetManager: Lost task 0.3 in stage 143.0 (TID 284) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 2.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 145:>                (0 + 1) / 2][Stage 146:>                (0 + 1) / 2]

25/05/10 02:30:48 WARN TaskSetManager: Lost task 1.0 in stage 145.0 (TID 287) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha

[Stage 147:>                                                        (0 + 2) / 2]

25/05/10 02:31:03 WARN TaskSetManager: Lost task 1.0 in stage 147.0 (TID 293) (lbdwo17.datalab.novalocal executor 1): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 3.
	at org.apache.spark.mllib.stat.test.ChiSqTest$.computeChiSq(ChiSqTest.scala:195)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.$anonfun$chiSquaredSparseFeatures$15(ChiSqTest.scala:175)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.ha