In [6]:
from pyspark.sql import SparkSession,Row,functions
from pyspark import SparkContext
from ast import literal_eval
from pyspark.ml.linalg import Vector,Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer,HashingTF, Tokenizer,IDF
from pyspark.ml.classification import LogisticRegression,LogisticRegressionModel,BinaryLogisticRegressionSummary, LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.getOrCreate()
cleanedData=spark.sparkContext.textFile("tokenizedData").map(lambda r:literal_eval(str(r)))
df=cleanedData.toDF(["label","words"])
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featuredData=hashingTF.transform(df)
idf=IDF(inputCol="rawFeatures",outputCol="features")
idfModel=idf.fit(featuredData)
rescaledData=idfModel.transform(featuredData)
rescaledData.select("label","features").show()
inputData=rescaledData

#index label and feature
labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(inputData)
featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(inputData)
#split the inputData into trainingData and testData
training, test = inputData.randomSplit([0.7,0.3])
#lr will train the model
lr = LogisticRegression(maxIter=10)
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
#lr pipeline set
pipeline =  Pipeline().setStages([labelIndexer, featureIndexer])
training=pipeline.fit(training).transform(training)

#model trained
paramGrid = ParamGridBuilder()\
            .addGrid(lr.regParam, [0.1, 0.01])\
            .addGrid(lr.elasticNetParam,[0.1,0.5,0.9])\
            .build()
crossval=CrossValidator(estimator=lr,\
                          estimatorParamMaps=paramGrid,\
                          evaluator=MulticlassClassificationEvaluator(),\
                          numFolds=3)
cvModel=crossval.fit(training)
cvModel.bestModel.summary

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    2|(20,[0,1,2,4,5,7,...|
|    2|(20,[0,1,2,4,5,6,...|
|    2|(20,[0,1,2,3,4,5,...|
|    2|(20,[0,1,2,3,4,5,...|
|    2|(20,[0,1,2,5,8,9,...|
|    1|(20,[0,1,2,4,5,7,...|
|    1|(20,[0,1,2,3,4,5,...|
|    1|(20,[0,2,3,4,5,6,...|
|    1|(20,[0,2,3,4,5,7,...|
|    1|(20,[0,2,3,4,5,6,...|
|    1|(20,[0,2,3,4,5,8,...|
|    1|(20,[0,2,3,4,5,6,...|
|    1|(20,[2,3,4,5,7,8,...|
|    1|(20,[0,2,4,5,6,7,...|
|    1|(20,[0,1,2,3,4,5,...|
|    1|(20,[0,1,2,3,4,5,...|
|    1|(20,[0,1,2,3,4,5,...|
|    1|(20,[0,1,2,3,4,5,...|
|    1|(20,[0,1,2,3,4,5,...|
|    1|(20,[0,1,2,3,4,5,...|
+-----+--------------------+
only showing top 20 rows



<pyspark.ml.classification.LogisticRegressionTrainingSummary at 0x1133a9e80>

In [7]:
cvModel.save('cvModel')

In [9]:
from pyspark.ml.tuning import CrossValidatorModel
best=CrossValidatorModel.load('cvModel').bestModel
best.summary

RuntimeError: No training summary available for this LogisticRegressionModel