In [1]:
#注意事项:
#当运行本Notebook的程序后，如果要关闭Notebook，请选择菜单: File > Close and Halt 才能确实停止当前正在运行的程序，并且释放资源
#如果没有使用以上方法，只关闭此分页，程序仍在运行，未释放资源，当您打开并运行其他的Notebook，可能会发生错误

# 22	Spark ML Pipeline 回归分析

# 22.1	数据准备

In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().setAppName("ML multi").setMaster("local[4]")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
sc.master

'local[4]'

In [10]:
global Path    
if sc.master[0:5]=="local" :
   Path="/mnt/data1/workspace/data_analysis_mining/Python+Spark2.0+Hadoop机器学习与大数据实战/pythonsparkexample/PythonProject/"
else:   
   Path="hdfs://master:9000/user/hduser/"
#如果要在cluster模式运行(hadoop yarn 或Spark Stand alone)，请按照书上的说明，先把文件上传到HDFS目录

In [11]:
hour_df= spark.read.format('csv') \
                  .option("header", 'true').load(Path+"data/hour.csv")
hour_df.count()

17379

In [12]:
print(hour_df.columns)

['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']


In [13]:
hour_df = hour_df.drop("instant").drop('dteday').drop('yr').drop('casual').drop('registered')

In [14]:
print(hour_df.printSchema())

root
 |-- season: string (nullable = true)
 |-- mnth: string (nullable = true)
 |-- hr: string (nullable = true)
 |-- holiday: string (nullable = true)
 |-- weekday: string (nullable = true)
 |-- workingday: string (nullable = true)
 |-- weathersit: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- atemp: string (nullable = true)
 |-- hum: string (nullable = true)
 |-- windspeed: string (nullable = true)
 |-- cnt: string (nullable = true)

None


In [15]:
from pyspark.sql.functions import col  

In [16]:
hour_df= hour_df.select([col(column).cast("double").alias(column) 
                        for column in hour_df.columns])

In [17]:
hour_df.printSchema()

root
 |-- season: double (nullable = true)
 |-- mnth: double (nullable = true)
 |-- hr: double (nullable = true)
 |-- holiday: double (nullable = true)
 |-- weekday: double (nullable = true)
 |-- workingday: double (nullable = true)
 |-- weathersit: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- cnt: double (nullable = true)



In [18]:
hour_df.show(5)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|   1.0| 1.0|0.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.81|      0.0|16.0|
|   1.0| 1.0|1.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|40.0|
|   1.0| 1.0|2.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|32.0|
|   1.0| 1.0|3.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0|13.0|
|   1.0| 1.0|4.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0| 1.0|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
only showing top 5 rows



In [19]:
train_df, test_df = hour_df.randomSplit([0.7, 0.3])
train_df.cache()
test_df.cache()

DataFrame[season: double, mnth: double, hr: double, holiday: double, weekday: double, workingday: double, weathersit: double, temp: double, atemp: double, hum: double, windspeed: double, cnt: double]

# 22.2	建立机器学习pipeline管线

In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer, VectorIndexer,VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

In [21]:
featuresCols = hour_df.columns[:-1]
print(featuresCols)

['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']


In [71]:
from pyspark.ml.linalg import Vectors
df = spark.createDataFrame([(Vectors.dense([-1.0, 0.5]),),
                            (Vectors.dense([0.0, 1.0]),), 
                            (Vectors.dense([0.0, 2.0]),),
                            (Vectors.dense([1.0, 4.0]),),
                            ],
                           ["a"])

In [72]:
df.show()

+----------+
|         a|
+----------+
|[-1.0,0.5]|
| [0.0,1.0]|
| [0.0,2.0]|
| [1.0,4.0]|
+----------+



In [73]:
indexer = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
model = indexer.fit(df)

In [74]:
model.transform(df).show()

+----------+---------+
|         a|  indexed|
+----------+---------+
|[-1.0,0.5]|[1.0,0.5]|
| [0.0,1.0]|[0.0,1.0]|
| [0.0,2.0]|[0.0,2.0]|
| [1.0,4.0]|[2.0,4.0]|
+----------+---------+



In [22]:
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="aFeatures")
# 将星期 月份 小时 等会被视为分类字段
vectorIndexer = VectorIndexer(inputCol="aFeatures", outputCol="features", maxCategories=24)
dt = DecisionTreeRegressor(labelCol="cnt",featuresCol= 'features')
dt_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer ,dt])

In [23]:
dt_pipeline.getStages()

[VectorAssembler_6413f133859f,
 VectorIndexer_a98677683853,
 DecisionTreeRegressor_109fbc85c5f7]

# 22.3	使用pipeline进行数据处理与训练

In [24]:
dt_pipelineModel = dt_pipeline.fit(train_df)

In [25]:
dt_pipelineModel.stages[2]

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_109fbc85c5f7) of depth 5 with 63 nodes

In [26]:
print(dt_pipelineModel.stages[2].toDebugString[:500])

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_109fbc85c5f7) of depth 5 with 63 nodes
  If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,22.0,23.0})
   If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0})
    If (feature 2 in {2.0,3.0,4.0,5.0})
     If (feature 4 in {1.0,2.0,3.0,4.0,5.0})
      If (feature 2 in {2.0,3.0,4.0})
       Predict: 6.790719696969697
      Else (feature 2 not in {2.0,3.0,4.0})
       Predict: 24.738292011019283
     Else (feature 4 not in {1.0,2.0,3.0,4.0,5.0})
      If (fe


# 22.4	使用pipelineModel 进行预测

In [27]:
predicted_df=dt_pipelineModel.transform(test_df)

In [28]:
print(predicted_df.columns)

['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt', 'aFeatures', 'features', 'prediction']


In [29]:
predicted_df.select('season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', \
                     'weathersit', 'temp', 'atemp', 'hum', 'windspeed','cnt','prediction').show(10)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+-----------------+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|       prediction|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+-----------------+
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.38|0.3939| 0.4|   0.2836|91.0|54.08571428571429|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       1.0|0.12|0.1212| 0.5|   0.2836| 5.0|37.77808988764045|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       2.0|0.18|0.2424|0.86|      0.0|19.0|37.77808988764045|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       2.0|0.24|0.2273|0.65|   0.2239| 7.0|37.77808988764045|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       2.0|0.32|0.2879|0.26|   0.4179|10.0|37.77808988764045|
|   1.0| 1.0|0.0|    0.0|    2.0|       1.0|       2.0|0.22|0.2424|0.87|   0.1045|14.0|37.77808988764045|
|   1.0| 1.0|0.0|    0.0|    2.0|       1.0|  

# 22.5	评估模型的准确率

In [30]:
from pyspark.ml.evaluation import RegressionEvaluator

In [31]:
evaluator = RegressionEvaluator(labelCol='cnt',
                                                        predictionCol='prediction',
                                                        metricName="rmse")

In [32]:
predicted_df=dt_pipelineModel.transform(test_df)
rmse = evaluator.evaluate(predicted_df)
rmse

98.3475020549714

# 22.6	使用TrainValidation进行训练评估找出最佳模型

In [40]:
from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

In [41]:
paramGrid = ParamGridBuilder()\
  .addGrid(dt.maxDepth, [ 5,10,15,25])\
  .addGrid(dt.maxBins, [25,35,45,50])\
  .build()

In [35]:
tvs = TrainValidationSplit(estimator=dt,evaluator=evaluator,
                  estimatorParamMaps=paramGrid,trainRatio=0.8)

In [36]:
tvs_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer ,tvs])

In [37]:
tvs_pipelineModel =tvs_pipeline.fit(train_df)

In [38]:
bestModel=tvs_pipelineModel.stages[2].bestModel
print(bestModel.toDebugString[:500])

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_109fbc85c5f7) of depth 10 with 1803 nodes
  If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,22.0,23.0})
   If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0})
    If (feature 2 in {2.0,3.0,4.0,5.0})
     If (feature 4 in {1.0,2.0,3.0,4.0,5.0})
      If (feature 2 in {2.0,3.0,4.0})
       If (feature 2 in {3.0,4.0})
        If (feature 1 in {0.0,1.0,2.0,3.0,11.0})
         If (feature 7 <= 0.41000000000000003)
          If (feature 0 in {0.0,1.0})
    


In [39]:
predictions = tvs_pipelineModel.transform(test_df)
rmse= evaluator.evaluate(predictions)
rmse

81.27945069184695

# 22.7	使用crossValidation进行训练评估找出最佳模型

In [42]:
from pyspark.ml.tuning import CrossValidator

In [43]:
cv = CrossValidator(estimator=dt, evaluator=evaluator, 
                    estimatorParamMaps=paramGrid, numFolds=3)

In [44]:
cv_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer ,cv])

In [45]:
cv_pipelineModel = cv_pipeline.fit(train_df)

In [37]:
predictions = cv_pipelineModel.transform(test_df)
rmse= evaluator.evaluate(predictions)
rmse

81.13273892536569

# 22.8	使用随机森林RandomForestClassifier分类器

In [46]:
from pyspark.ml.regression import RandomForestRegressor
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="oFeatures")
vectorIndexer = VectorIndexer(inputCol="oFeatures", outputCol="features", maxCategories=24)
rf= RandomForestRegressor(labelCol="cnt",featuresCol= 'features', numTrees=20)
rf_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer ,rf])
rf_pipelineModel = rf_pipeline.fit(train_df)
predicted_df=rf_pipelineModel.transform(test_df)
evaluator = RegressionEvaluator(metricName="rmse", labelCol='cnt', predictionCol='prediction')
rmse = evaluator.evaluate(predicted_df)
rmse

103.14404734402089

In [47]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
paramGrid = ParamGridBuilder()\
  .addGrid(rf.maxDepth, [ 5,10,15])\
  .addGrid(rf.maxBins, [25,35,50])\
  .addGrid(rf.numTrees, [10, 20,30])\
  .build()

rftvs = TrainValidationSplit(estimator=rf, evaluator=evaluator,
                                 estimatorParamMaps=paramGrid, trainRatio=0.8)

rftvs_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer, rftvs])
rftvs_pipelineModel =rftvs_pipeline.fit(train_df)
rftvspredictions = rftvs_pipelineModel.transform(test_df)
rmse= evaluator.evaluate(rftvspredictions)
rmse

70.73193772509443

In [40]:
# 22.8	使用GBT

In [48]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(labelCol="cnt",featuresCol= 'features')
gbt_pipeline = Pipeline(stages=[vectorAssembler,vectorIndexer,gbt])

In [49]:
gbt_pipelineModel = gbt_pipeline.fit(train_df)
predicted_df=gbt_pipelineModel.transform(test_df)
rmse = evaluator.evaluate(predicted_df)
rmse

76.9287732767273

In [50]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

paramGrid = ParamGridBuilder() \
  .addGrid(gbt.maxDepth, [ 5,10])\
  .addGrid(gbt.maxBins, [25,40])\
  .addGrid(gbt.maxIter, [10, 50])\
  .build()

cv = CrossValidator(estimator=gbt, evaluator=evaluator, 
                                  estimatorParamMaps=paramGrid, numFolds=3)
cv_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])

In [51]:
cv_pipelineModel = cv_pipeline.fit(train_df)

In [52]:
cvm=cv_pipelineModel.stages[2] 
gbestModel=cvm.bestModel
print(bestModel.toDebugString[:500])

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_109fbc85c5f7) of depth 10 with 1803 nodes
  If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,22.0,23.0})
   If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0})
    If (feature 2 in {2.0,3.0,4.0,5.0})
     If (feature 4 in {1.0,2.0,3.0,4.0,5.0})
      If (feature 2 in {2.0,3.0,4.0})
       If (feature 2 in {3.0,4.0})
        If (feature 1 in {0.0,1.0,2.0,3.0,11.0})
         If (feature 7 <= 0.41000000000000003)
          If (feature 0 in {0.0,1.0})
    


In [53]:
predicted_df=cv_pipelineModel.transform(test_df)
predicted_df.select('season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', \
                     'weathersit', 'temp', 'atemp', 'hum', 'windspeed','cnt','prediction').show(10)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|        prediction|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.38|0.3939| 0.4|   0.2836|91.0| 74.16023695702242|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       1.0|0.12|0.1212| 0.5|   0.2836| 5.0| 12.88746098409448|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       2.0|0.18|0.2424|0.86|      0.0|19.0| 27.49454199939149|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       2.0|0.24|0.2273|0.65|   0.2239| 7.0|20.909606077888743|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       2.0|0.32|0.2879|0.26|   0.4179|10.0|28.601369929027847|
|   1.0| 1.0|0.0|    0.0|    2.0|       1.0|       2.0|0.22|0.2424|0.87|   0.1045|14.0| 25.08655733253785|
|   1.0| 1.0|0.0|    0.0|    2.0|    

In [54]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol='cnt', predictionCol='prediction')
rmse = evaluator.evaluate(predicted_df)
rmse

72.09077346060839