In [2]:
#注意事项:
#当运行本Notebook的程序后，如果要关闭Notebook，请选择菜单: File > Close and Halt 才能确实停止当前正在运行的程序，并且释放资源
#如果没有使用以上方法，只关闭此分页，程序仍在运行，未释放资源，当您打开并运行其他的Notebook，可能会发生错误

# 20	Spark ML Pipeline 二元分类机器学习

In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Assemble the Spark configuration one setting at a time: the application
# name, then a local master with four worker threads.
conf = SparkConf()
conf.setAppName("ML classifier")
conf.setMaster('local[4]')

# getOrCreate() hands back the existing session when one is already running.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Display the master URL this SparkContext is running against.
sc.master

'local[4]'

# 20.1	数据准备

In [3]:
# Base location of the project data: the local filesystem when the master is
# any "local..." URL, otherwise HDFS.
# (The former module-level `global Path` statement was a no-op and is gone.)
# To run in cluster mode (Hadoop YARN or Spark standalone), first upload the
# data files to the HDFS directory as described in the book.
if sc.master.startswith("local"):
    Path = "file:/mnt/data1/workspace/data_analysis_mining/Python+Spark2.0+Hadoop机器学习与大数据实战/pythonsparkexample/PythonProject/"
else:
    Path = "hdfs://master:9000/user/hduser/"

In [5]:
# Load the tab-separated training file; its first line holds the column names.
row_df = spark.read.csv(
    Path + "data/train.tsv",
    sep='\t',
    header=True,
)
# Row count, shown as the cell result.
row_df.count()

7395

In [5]:
# Same load expressed through the generic DataFrameReader API.
# It goes through the `spark` session built earlier in this notebook (no
# `sqlContext` is ever defined here) and uses the print() function, which
# works on both Python 2 and Python 3.
row_df = spark.read.format("csv") \
     .option("header", "true") \
     .option("delimiter", "\t") \
     .load(Path+"data/train.tsv")
print(row_df.count())

7395


In [6]:
# Print the column names and types of the raw frame as loaded from the file.
row_df.printSchema()

root
 |-- url: string (nullable = true)
 |-- urlid: string (nullable = true)
 |-- boilerplate: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: string (nullable = true)
 |-- avglinksize: string (nullable = true)
 |-- commonlinkratio_1: string (nullable = true)
 |-- commonlinkratio_2: string (nullable = true)
 |-- commonlinkratio_3: string (nullable = true)
 |-- commonlinkratio_4: string (nullable = true)
 |-- compression_ratio: string (nullable = true)
 |-- embed_ratio: string (nullable = true)
 |-- framebased: string (nullable = true)
 |-- frameTagRatio: string (nullable = true)
 |-- hasDomainLink: string (nullable = true)
 |-- html_ratio: string (nullable = true)
 |-- image_ratio: string (nullable = true)
 |-- is_news: string (nullable = true)
 |-- lengthyLinkDomain: string (nullable = true)
 |-- linkwordscore: string (nullable = true)
 |-- news_front_page: string (nullable = true)
 |-- non_markup_alphanum_characters: string (nulla

In [7]:
# Preview a handful of raw columns for the first 10 rows.
row_df.select('url','alchemy_category','alchemy_category_score','is_news','label').show(10)

+--------------------+------------------+----------------------+-------+-----+
|                 url|  alchemy_category|alchemy_category_score|is_news|label|
+--------------------+------------------+----------------------+-------+-----+
|http://www.bloomb...|          business|              0.789131|      1|    0|
|http://www.popsci...|        recreation|              0.574147|      1|    1|
|http://www.menshe...|            health|              0.996526|      1|    1|
|http://www.dumbli...|            health|              0.801248|      1|    1|
|http://bleacherre...|            sports|              0.719157|      1|    0|
|http://www.conven...|                 ?|                     ?|      ?|    0|
|http://gofashionl...|arts_entertainment|               0.22111|      1|    1|
|http://www.inside...|                 ?|                     ?|      ?|    0|
|http://www.valetm...|                 ?|                     ?|      1|    1|
|http://www.howswe...|                 ?|           

In [9]:
# 将? 全部转为0
from pyspark.sql.functions import udf
def replace_question(x):
    """Return "0" for the placeholder "?"; return any other value unchanged."""
    if x != "?":
        return x
    return "0"
# Wrap the Python function as a Spark UDF. The name is rebound, so from here
# on `replace_question` refers to the UDF, not the plain function.
replace_question = udf(replace_question)

In [10]:
from pyspark.sql.functions import col  
import pyspark.sql.types as typ

In [11]:
# Keep the two text columns as they are. Every column from index 4 through the
# last one first has "?" replaced by "0" and is then cast to double.
df = row_df.select(
    'url',
    'alchemy_category',
    *[
        replace_question(col(column)).cast(typ.DoubleType()).alias(column)
        for column in row_df.columns[4:]
    ]
)

In [13]:
# Print the schema of the cleaned frame to check the casts to double.
df.printSchema()

root
 |-- url: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: double (nullable = true)
 |-- avglinksize: double (nullable = true)
 |-- commonlinkratio_1: double (nullable = true)
 |-- commonlinkratio_2: double (nullable = true)
 |-- commonlinkratio_3: double (nullable = true)
 |-- commonlinkratio_4: double (nullable = true)
 |-- compression_ratio: double (nullable = true)
 |-- embed_ratio: double (nullable = true)
 |-- framebased: double (nullable = true)
 |-- frameTagRatio: double (nullable = true)
 |-- hasDomainLink: double (nullable = true)
 |-- html_ratio: double (nullable = true)
 |-- image_ratio: double (nullable = true)
 |-- is_news: double (nullable = true)
 |-- lengthyLinkDomain: double (nullable = true)
 |-- linkwordscore: double (nullable = true)
 |-- news_front_page: double (nullable = true)
 |-- non_markup_alphanum_characters: double (nullable = true)
 |-- numberOfLinks: double (nullable = true)
 |-- numwords_in_url: d

In [14]:
# Preview the same columns after cleaning, for comparison with the raw preview.
df.select('url','alchemy_category','alchemy_category_score','is_news','label').show(10)

+--------------------+------------------+----------------------+-------+-----+
|                 url|  alchemy_category|alchemy_category_score|is_news|label|
+--------------------+------------------+----------------------+-------+-----+
|http://www.bloomb...|          business|              0.789131|    1.0|  0.0|
|http://www.popsci...|        recreation|              0.574147|    1.0|  1.0|
|http://www.menshe...|            health|              0.996526|    1.0|  1.0|
|http://www.dumbli...|            health|              0.801248|    1.0|  1.0|
|http://bleacherre...|            sports|              0.719157|    1.0|  0.0|
|http://www.conven...|                 ?|                   0.0|    0.0|  0.0|
|http://gofashionl...|arts_entertainment|               0.22111|    1.0|  1.0|
|http://www.inside...|                 ?|                   0.0|    0.0|  0.0|
|http://www.valetm...|                 ?|                   0.0|    1.0|  1.0|
|http://www.howswe...|                 ?|           

In [15]:
# Split roughly 70% / 30% into training and test sets. The fixed seed keeps
# the split, and every metric computed from it, repeatable between runs
# instead of changing on each kernel restart.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)
# Cache both splits; they are reused by the pipeline fits and evaluations
# later in the notebook.
train_df.cache()
test_df.cache()

DataFrame[url: string, alchemy_category: string, alchemy_category_score: double, avglinksize: double, commonlinkratio_1: double, commonlinkratio_2: double, commonlinkratio_3: double, commonlinkratio_4: double, compression_ratio: double, embed_ratio: double, framebased: double, frameTagRatio: double, hasDomainLink: double, html_ratio: double, image_ratio: double, is_news: double, lengthyLinkDomain: double, linkwordscore: double, news_front_page: double, non_markup_alphanum_characters: double, numberOfLinks: double, numwords_in_url: double, parametrizedLinkRatio: double, spelling_errors_ratio: double, label: double]

# 20.2	介绍数据处理的pipeline管线的组件

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

In [17]:
# Convert the text category feature into a numeric index column.
categoryIndexer = StringIndexer(
    outputCol='alchemy_category_Index',
    inputCol='alchemy_category',
)

In [18]:
# Fit the indexer on the full cleaned frame to learn the category labels.
categoryTransformer=categoryIndexer.fit(df)

In [20]:
# Inspect the learned labels; a label's position in this list is the index
# value it is mapped to.
categoryTransformer.labels

['?',
 'recreation',
 'arts_entertainment',
 'business',
 'health',
 'sports',
 'culture_politics',
 'computer_internet',
 'science_technology',
 'gaming',
 'religion',
 'law_crime',
 'unknown',
 'weather']

In [21]:
# Apply the fitted indexer, which adds the alchemy_category_Index column.
df1 = categoryTransformer.transform(df)

In [22]:
# List the columns after indexing.
df1.columns

['url',
 'alchemy_category',
 'alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'framebased',
 'frameTagRatio',
 'hasDomainLink',
 'html_ratio',
 'image_ratio',
 'is_news',
 'lengthyLinkDomain',
 'linkwordscore',
 'news_front_page',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'numwords_in_url',
 'parametrizedLinkRatio',
 'spelling_errors_ratio',
 'label',
 'alchemy_category_Index']

In [23]:
# Compare the original category text with its numeric index.
df1.select("alchemy_category","alchemy_category_Index").show(10)

+------------------+----------------------+
|  alchemy_category|alchemy_category_Index|
+------------------+----------------------+
|          business|                   3.0|
|        recreation|                   1.0|
|            health|                   4.0|
|            health|                   4.0|
|            sports|                   5.0|
|                 ?|                   0.0|
|arts_entertainment|                   2.0|
|                 ?|                   0.0|
|                 ?|                   0.0|
|                 ?|                   0.0|
+------------------+----------------------+
only showing top 10 rows



In [22]:
from pyspark.ml.feature import  OneHotEncoder

In [24]:
# One-hot encode alchemy_category_Index into a vector column.
# dropLast=False keeps a slot for the last category as well.
encoder = OneHotEncoder(
    inputCol='alchemy_category_Index',
    outputCol="alchemy_category_IndexVec",
    dropLast=False,
)

In [25]:
# Apply the one-hot encoder to the indexed frame, then list the columns.
# NOTE(review): transform() is called on the encoder without a prior fit();
# this looks like Spark 2.x behaviour, and newer Spark versions may require
# encoder.fit(df1) first - confirm against the target Spark version.
df2=encoder.transform(df1)
df2.columns

['url',
 'alchemy_category',
 'alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'framebased',
 'frameTagRatio',
 'hasDomainLink',
 'html_ratio',
 'image_ratio',
 'is_news',
 'lengthyLinkDomain',
 'linkwordscore',
 'news_front_page',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'numwords_in_url',
 'parametrizedLinkRatio',
 'spelling_errors_ratio',
 'label',
 'alchemy_category_Index',
 'alchemy_category_IndexVec']

In [26]:
# Show category text, numeric index and one-hot vector side by side.
df2.select("alchemy_category","alchemy_category_Index",
                    "alchemy_category_IndexVec").show(10)

+------------------+----------------------+-------------------------+
|  alchemy_category|alchemy_category_Index|alchemy_category_IndexVec|
+------------------+----------------------+-------------------------+
|          business|                   3.0|           (14,[3],[1.0])|
|        recreation|                   1.0|           (14,[1],[1.0])|
|            health|                   4.0|           (14,[4],[1.0])|
|            health|                   4.0|           (14,[4],[1.0])|
|            sports|                   5.0|           (14,[5],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|arts_entertainment|                   2.0|           (14,[2],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
+------------------+----------------------+-------------------------+
only showing top 10 

In [26]:
from pyspark.ml.feature import  VectorAssembler

In [27]:
# Columns to merge into one feature vector: the one-hot category vector
# followed by raw columns 4 .. second-to-last (the final column is excluded).
assemblerInputs = ['alchemy_category_IndexVec']
assemblerInputs.extend(row_df.columns[4:-1])
assemblerInputs

['alchemy_category_IndexVec',
 'alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'framebased',
 'frameTagRatio',
 'hasDomainLink',
 'html_ratio',
 'image_ratio',
 'is_news',
 'lengthyLinkDomain',
 'linkwordscore',
 'news_front_page',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'numwords_in_url',
 'parametrizedLinkRatio',
 'spelling_errors_ratio']

In [28]:
# Assembler that packs all input columns into a single "features" column.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=assemblerInputs,
)

In [29]:
# Run the assembler, which adds the combined `features` vector column.
df3=assembler.transform(df2)

In [30]:
# List the columns after assembling the feature vector.
df3.columns

['url',
 'alchemy_category',
 'alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'framebased',
 'frameTagRatio',
 'hasDomainLink',
 'html_ratio',
 'image_ratio',
 'is_news',
 'lengthyLinkDomain',
 'linkwordscore',
 'news_front_page',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'numwords_in_url',
 'parametrizedLinkRatio',
 'spelling_errors_ratio',
 'label',
 'alchemy_category_Index',
 'alchemy_category_IndexVec',
 'features']

In [31]:
# Preview the assembled feature vectors for the first 5 rows.
df3.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|(36,[3,14,15,16,1...|
|(36,[1,14,15,16,1...|
|(36,[4,14,15,16,1...|
|(36,[4,14,15,16,1...|
|(36,[5,14,15,16,1...|
+--------------------+
only showing top 5 rows



In [32]:
# Fetch the first row to see one feature vector in full (show() truncates it).
df3.select('features').take(1)

[Row(features=SparseVector(36, {3: 1.0, 14: 0.7891, 15: 2.0556, 16: 0.6765, 17: 0.2059, 18: 0.0471, 19: 0.0235, 20: 0.4438, 23: 0.0908, 25: 0.2458, 26: 0.0039, 27: 1.0, 28: 1.0, 29: 24.0, 31: 5424.0, 32: 170.0, 33: 8.0, 34: 0.1529, 35: 0.0791}))]

In [33]:
from pyspark.ml.classification import DecisionTreeClassifier

In [33]:
# Decision tree reading the assembled `features` column and predicting `label`.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=10,
    maxBins=14,
    impurity="gini",
)

In [34]:
# Train the tree on the fully assembled frame (df3 is built from all of df,
# not from the train split) and display the model summary.
dt_model=dt.fit(df3)
dt_model

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_b8a5cc995160) of depth 10 with 653 nodes

In [35]:
# Score the same frame with the trained tree; the result is kept in df4.
df4=dt_model.transform(df3)

# 20.3	建立数据处理的pipeline管线

In [36]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

In [37]:
# Stage 1: text category -> numeric index.
stringIndexer = StringIndexer(
    outputCol="alchemy_category_Index",
    inputCol='alchemy_category',
)

# Stage 2: numeric index -> one-hot vector (no category dropped).
encoder = OneHotEncoder(
    inputCol='alchemy_category_Index',
    outputCol="alchemy_category_IndexVec",
    dropLast=False,
)

# Stage 3: one-hot vector plus raw columns 4 .. second-to-last -> `features`.
assemblerInputs = ['alchemy_category_IndexVec']
assemblerInputs.extend(row_df.columns[4:-1])
assembler = VectorAssembler(
    outputCol="features",
    inputCols=assemblerInputs,
)

# Stage 4: the decision tree classifier.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=10,
    maxBins=14,
    impurity="gini",
)

# Chain the four stages in execution order.
pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, dt])

In [38]:
# List the pipeline's stages in the order they were supplied.
pipeline.getStages()

[StringIndexer_16a542320c9e,
 OneHotEncoder_c695810de035,
 VectorAssembler_c87a24eefcd0,
 DecisionTreeClassifier_bf207176fa65]

# 20.4	使用pipeline进行训练

In [48]:
# Fit the whole pipeline on the training split.
pipelineModel = pipeline.fit(train_df)

In [49]:
# Index 3 is the last of the four stages: the fitted decision tree model.
pipelineModel.stages[3]

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_bf207176fa65) of depth 10 with 501 nodes

In [43]:
# Print the first 1000 characters of the tree's textual rule dump.
print(pipelineModel.stages[3].toDebugString[:1000])

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_bf207176fa65) of depth 10 with 501 nodes
  If (feature 31 <= 1832.0)
   If (feature 23 <= 0.035596906)
    If (feature 16 <= 0.7569093425)
     If (feature 26 <= 0.1085317665)
      If (feature 26 <= 0.0089726355)
       If (feature 33 <= 5.5)
        If (feature 35 <= 0.167132216)
         If (feature 1 in {0.0})
          Predict: 0.0
         Else (feature 1 not in {0.0})
          Predict: 1.0
        Else (feature 35 > 0.167132216)
         If (feature 34 <= 0.0885214845)
          If (feature 25 <= 0.20540665800000002)
           Predict: 1.0
          Else (feature 25 > 0.20540665800000002)
           If (feature 25 <= 0.30533927299999997)
            Predict: 0.0
           Else (feature 25 > 0.30533927299999997)
            Predict: 1.0
         Else (feature 34 > 0.0885214845)
          If (feature 23 <= 0.0218875075)
           Predict: 0.0
          Else (feature 23 > 0.0218875075)
           Predict: 1.0
       El

# 20.5	使用pipeline 进行预测

In [44]:
# Run the fitted pipeline over the held-out test split.
predicted=pipelineModel.transform(test_df)

In [45]:
# List the output columns; the three named in the trailing comment are the
# ones added by the classifier stage.
predicted.columns  # rawPrediction probability prediction

['url',
 'alchemy_category',
 'alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'framebased',
 'frameTagRatio',
 'hasDomainLink',
 'html_ratio',
 'image_ratio',
 'is_news',
 'lengthyLinkDomain',
 'linkwordscore',
 'news_front_page',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'numwords_in_url',
 'parametrizedLinkRatio',
 'spelling_errors_ratio',
 'label',
 'alchemy_category_Index',
 'alchemy_category_IndexVec',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [46]:
# Show inputs next to the model outputs for the first 10 test rows.
# The column is spelled 'rawPrediction' exactly as the model emits it, so the
# lookup does not rely on Spark's case-insensitive column resolution.
predicted.select('url','features','rawPrediction','probability','label','prediction').show(10)

+--------------------+--------------------+-------------+--------------------+-----+----------+
|                 url|            features|rawprediction|         probability|label|prediction|
+--------------------+--------------------+-------------+--------------------+-----+----------+
|http://1000awesom...|(36,[2,14,15,16,1...|  [15.0,63.0]|[0.19230769230769...|  1.0|       1.0|
|http://13gb.com/p...|(36,[0,15,16,20,2...|  [97.0,46.0]|[0.67832167832167...|  0.0|       0.0|
|http://3kidsandus...|(36,[0,15,16,17,1...|  [9.0,103.0]|[0.08035714285714...|  1.0|       1.0|
|http://3ppp.info/...|(36,[1,14,15,16,1...|   [16.0,0.0]|           [1.0,0.0]|  1.0|       0.0|
|http://9gag.com/g...|(36,[0,15,16,17,1...|  [66.0,46.0]|[0.58928571428571...|  1.0|       0.0|
|http://9gag.com/g...|(36,[0,15,16,17,1...|  [66.0,46.0]|[0.58928571428571...|  0.0|       0.0|
|http://9gg.us/hah...|(36,[12,14,15,20,...| [119.0,41.0]|   [0.74375,0.25625]|  0.0|       0.0|
|http://aftenposte...|(36,[0,15,16,17,1.

In [47]:
# Fetch 10 rows to view the class probabilities next to the chosen prediction.
predicted.select('probability','prediction') .take(10)

[Row(probability=DenseVector([0.1923, 0.8077]), prediction=1.0),
 Row(probability=DenseVector([0.6783, 0.3217]), prediction=0.0),
 Row(probability=DenseVector([0.0804, 0.9196]), prediction=1.0),
 Row(probability=DenseVector([1.0, 0.0]), prediction=0.0),
 Row(probability=DenseVector([0.5893, 0.4107]), prediction=0.0),
 Row(probability=DenseVector([0.5893, 0.4107]), prediction=0.0),
 Row(probability=DenseVector([0.7438, 0.2562]), prediction=0.0),
 Row(probability=DenseVector([0.6154, 0.3846]), prediction=0.0),
 Row(probability=DenseVector([0.595, 0.405]), prediction=0.0),
 Row(probability=DenseVector([0.0741, 0.9259]), prediction=1.0)]

# 20.6	评估模型的准确率

In [51]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [52]:
# Evaluator that scores predictions by area under the ROC curve, comparing
# the `rawPrediction` column against the `label` column.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

In [53]:
# Score the test split with the fitted pipeline and compute its AUC.
predictions =pipelineModel.transform(test_df)
auc= evaluator.evaluate(predictions)
auc

0.6404064868787183

# 20.7	使用TrainValidation进行训练评估找出最佳模型

In [54]:
from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

In [55]:
# Fresh classifier with only the column names set; impurity, maxDepth and
# maxBins are supplied by the parameter grid. This rebinds `dt`.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [56]:
# Hyperparameter grid for the decision tree: 2 x 3 x 3 = 18 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.impurity, ["gini", "entropy"])
    .addGrid(dt.maxDepth, [5, 10, 15])
    .addGrid(dt.maxBins, [10, 15, 20])
    .build()
)

In [57]:
# Train/validation search over the grid: 80% of the input data trains each
# candidate, the rest validates it with the AUC evaluator.
tvs = TrainValidationSplit(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio=0.8,
)

In [58]:
# Same three preprocessing stages, with the train/validation search as the
# final stage in place of a single classifier.
tvs_pipeline = Pipeline(stages=[stringIndexer,encoder ,assembler, tvs])

In [59]:
# Fit the pipeline (including the grid search) on the training split.
tvs_pipelineModel =tvs_pipeline.fit(train_df)

In [60]:
# Stage 3 is the fitted search; pull out the best model it found and show it.
bestModel=tvs_pipelineModel.stages[3].bestModel
bestModel

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_ede028bdc05a) of depth 15 with 1601 nodes

In [61]:
# Print the first 500 characters of the best tree's rule dump.
print(bestModel.toDebugString[:500])

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_ede028bdc05a) of depth 15 with 1601 nodes
  If (feature 31 <= 1955.0)
   If (feature 31 <= 1246.0)
    If (feature 4 in {1.0})
     If (feature 20 <= 0.6227490865)
      If (feature 20 <= 0.43302699450000004)
       If (feature 15 <= 1.907002801)
        Predict: 1.0
       Else (feature 15 > 1.907002801)
        Predict: 0.0
      Else (feature 20 > 0.43302699450000004)
       Predict: 1.0
     Else (feature 20 > 0.6227490865)
      If


In [62]:
# Score the test split with the tuned pipeline and compute its AUC.
predictions = tvs_pipelineModel.transform(test_df)
auc= evaluator.evaluate(predictions)
auc

0.655512752470735

# 20.8	使用crossValidation进行训练评估找出最佳模型

In [63]:
from pyspark.ml.tuning import CrossValidator

In [64]:
# 3-fold cross-validated search over the same decision tree grid.
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

In [65]:
# Preprocessing stages followed by the cross-validated search as last stage.
cv_pipeline = Pipeline(stages=[stringIndexer,encoder ,assembler, cv])

In [66]:
# Fit the pipeline (including cross-validation) on the training split.
cv_pipelineModel = cv_pipeline.fit(train_df)

In [67]:
# Pull the best model out of the fitted cross-validator (stage 3) and show
# it. This rebinds `bestModel`, replacing the train/validation result.
bestModel=cv_pipelineModel.stages[3].bestModel
bestModel

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_ede028bdc05a) of depth 15 with 1449 nodes

In [68]:
# Print the first 500 characters of the cross-validated best tree's rules.
print(bestModel.toDebugString[:500])

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_ede028bdc05a) of depth 15 with 1449 nodes
  If (feature 31 <= 1611.5)
   If (feature 4 in {1.0})
    If (feature 31 <= 239.5)
     If (feature 23 <= 0.012195692000000001)
      Predict: 1.0
     Else (feature 23 > 0.012195692000000001)
      If (feature 35 <= 0.032965129999999995)
       Predict: 1.0
      Else (feature 35 > 0.032965129999999995)
       Predict: 0.0
    Else (feature 31 > 239.5)
     If (feature 35 <= 0.0475698544999999


In [69]:
# Score the test split with the cross-validated pipeline and compute its AUC.
predictions = cv_pipelineModel.transform(test_df)
auc= evaluator.evaluate(predictions)
auc

0.659141269669929

# 20.9	使用RandomForestClassifier分类器

In [70]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest of 10 trees on the same feature and label columns.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=10,
)

# Reuse the preprocessing stages, with the forest as the final stage.
rfpipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf])

In [71]:
# Fit the random forest pipeline on the training split, score the test split
# and display the resulting AUC.
rfpipelineModel = rfpipeline.fit(train_df)
rfpredicted=rfpipelineModel.transform(test_df)
evaluator.evaluate(rfpredicted)

0.7338259061360592

In [72]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# Hyperparameter grid for the random forest: 2 x 3 x 3 x 3 = 54 combinations.
# This rebinds `paramGrid`, replacing the decision tree grid.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.impurity, ["gini", "entropy"])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .addGrid(rf.maxBins, [10, 15, 20])
    .addGrid(rf.numTrees, [10, 20, 30])
    .build()
)

# Train/validation search (80% of the input data trains each candidate)
# scored with the AUC evaluator.
rftvs = TrainValidationSplit(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio=0.8,
)

# Preprocess, search, then measure AUC on the held-out test split.
rftvs_pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rftvs])
rftvs_pipelineModel = rftvs_pipeline.fit(train_df)
rftvspredictions = rftvs_pipelineModel.transform(test_df)
auc = evaluator.evaluate(rftvspredictions)
auc

0.7641141485559393

In [74]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

# 3-fold cross-validated search over the current `paramGrid` for the forest.
rfcv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

# Preprocessing stages followed by the cross-validated search, fitted on the
# training split.
rfcv_pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rfcv])
rfcv_pipelineModel = rfcv_pipeline.fit(train_df)

In [75]:
# Score the test split with the cross-validated random forest pipeline.
rfcvpredictions = rfcv_pipelineModel.transform(test_df)

In [76]:
# Human-readable description for each predicted class.
DescDict = {0: "暂时性网页(ephemeral)", 1: "长青网页(evergreen)"}

# Report the first five predictions. The prediction value is a float
# (0.0 / 1.0), which looks up the integer keys 0 / 1 because equal numbers
# hash alike.
for data in rfcvpredictions.select('url', 'prediction').take(5):
    print("网址: {}\n 预测: {} 说明为: {}".format(
        data['url'], data['prediction'], DescDict[data['prediction']]))

网址: http://1000awesomethings.com/2008/12/29/864-mastering-the-art-of-the-all-you-can-eat-buffet/
 预测: 1.0 说明为: 长青网页(evergreen)
网址: http://13gb.com/pictures/4004/
 预测: 0.0 说明为: 暂时性网页(ephemeral)
网址: http://3kidsandus.com/red-velvet-rice-krispies-treats-hearts-for-valentines-day/
 预测: 1.0 说明为: 长青网页(evergreen)
网址: http://3ppp.info/homemade-gravlax-for-breakfast-it-turned-out-great/
 预测: 0.0 说明为: 暂时性网页(ephemeral)
网址: http://9gag.com/gag/107718
 预测: 0.0 说明为: 暂时性网页(ephemeral)


In [77]:
# Compute the AUC of the cross-validated random forest on the test split.
auc= evaluator.evaluate(rfcvpredictions)
auc

0.7643281639800424