In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
# Pass the same Netty JVM flag to both the driver and the executors.
# NOTE(review): presumably needed for Netty/Arrow reflection access on newer JDKs — confirm.
conf = (
    SparkConf()
    .set("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
    .set("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
)

# Reuse a running session if there is one, otherwise create one configured with `conf`.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# With eager evaluation on, a bare DataFrame expression is rendered as a table by the notebook.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

### 逻辑回归

#### 导入数据

In [2]:
from pyspark.ml.classification import LogisticRegression

# Read the sample LIBSVM file into a DataFrame, declaring the feature-vector size (780) up front.
# NOTE(review): the path is an absolute, machine-specific Windows path.
training = (
    spark.read.format("libsvm")
    .option("numFeatures", "780")
    .load(r"D:\spark\data\mllib\sample_libsvm_data.txt")
)


In [17]:
# Peek at the first five feature vectors without truncating the cell contents.
training.select("features").show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### 模型实例化

In [3]:
# Configure (not yet fit) a logistic-regression estimator.
lr = LogisticRegression(
    maxIter=10,           # upper bound on optimizer iterations
    regParam=0.3,         # regularization strength
    elasticNetParam=0.8,  # elastic-net mixing: 0 = pure L2, 1 = pure L1
)

#### 训练模型

In [4]:
# Fit the estimator `lr` on the `training` DataFrame; the fitted model is kept
# in `lrModel` and is used by the cells that follow.
lrModel = lr.fit(training)

In [6]:
# Display the fitted coefficients in matrix form (last expression = cell output).
lrModel.coefficientMatrix

SparseMatrix(1, 780, [0, 31], [244, 263, 272, 300, 301, 328, 350, 351, ..., 490, 496, 511, 512, 517, 539, 540, 568], [-0.0001, -0.0001, -0.0002, -0.0002, -0.0, -0.0001, 0.0, 0.0, ..., 0.0003, -0.0001, -0.0004, -0.0003, 0.0003, -0.0002, -0.0015, -0.0002], 1)

#### 打印相关参数

In [5]:
# Report the fitted weight vector and the intercept of the binomial model.
print(f"Coefficients: {lrModel.coefficients}")
print(f"Intercept: {lrModel.intercept}")

Coefficients: (780,[244,263,272,300,301,328,350,351,378,379,405,406,407,428,433,434,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-7.353983524188197e-05,-9.102738505589466e-05,-0.00019467430546904298,-0.00020300642473486668,-3.1476183314863995e-05,-6.842977602660743e-05,1.5883626898239883e-05,1.4023497091372047e-05,0.00035432047524968605,0.00011443272898171087,0.00010016712383666666,0.0006014109303795481,0.0002840248179122762,-0.00011541084736508837,0.000385996886312906,0.000635019557424107,-0.00011506412384575676,-0.00015271865864986808,0.0002804933808994214,0.0006070117471191634,-0.0002008459663247437,-0.0001421075579290126,0.0002739010341160883,0.00027730456244968115,-9.838027027269332e-05,-0.0003808522443517704,-0.00025315198008555033,0.00027747714770754307,-0.0002443619763919199,-0.0015394744687597765,-0.00023073328411331293])
Intercept: 0.22456315961250325


#### 另外一种方式

In [8]:
# The multinomial family can also be used for a binary problem; hyper-parameters
# are the same as for `lr`, only `family` differs.
mlr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    family="multinomial",
)

# Fit on the same training DataFrame.
mlrModel = mlr.fit(training)

# With the multinomial family the parameters are exposed as a coefficient
# matrix and an intercept vector.
print(f"Multinomial coefficients: {mlrModel.coefficientMatrix}")
print(f"Multinomial intercepts: {mlrModel.interceptVector}")

Multinomial coefficients: 2 X 780 CSRMatrix
(0,244) 0.0
(0,263) 0.0001
(0,272) 0.0001
(0,300) 0.0001
(0,350) -0.0
(0,351) -0.0
(0,378) -0.0
(0,379) -0.0
(0,405) -0.0
(0,406) -0.0006
(0,407) -0.0001
(0,428) 0.0001
(0,433) -0.0
(0,434) -0.0007
(0,455) 0.0001
(0,456) 0.0001
..
..
Multinomial intercepts: [-0.12065879445860686,0.12065879445860686]


#### 模型训练信息

In [18]:
from pyspark.ml.classification import LogisticRegression

# Training summary attached to the model fitted earlier (`lrModel`).
trainingSummary = lrModel.summary

# Objective value recorded for each iteration, printed one per line.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective_value in objectiveHistory:
    print(objective_value)

objectiveHistory:
0.6833149135741672
0.6662875751473734
0.6217068546034618
0.6127265245887887
0.6060347986802873
0.6031750687571562
0.5969621534836274
0.5940743031983118
0.5906089243339022
0.5894724576491042
0.5882187775729587


#### 计算auc

In [20]:
# Show the ROC curve points (a DataFrame with FPR/TPR columns), then the area under the curve.
# Both are taken from the training summary, i.e. they are measured on the training data.
trainingSummary.roc.show()
print(f"areaUnderROC: {trainingSummary.areaUnderROC}")

+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|0.017543859649122806|
|0.0| 0.03508771929824561|
|0.0| 0.05263157894736842|
|0.0| 0.07017543859649122|
|0.0| 0.08771929824561403|
|0.0| 0.10526315789473684|
|0.0| 0.12280701754385964|
|0.0| 0.14035087719298245|
|0.0| 0.15789473684210525|
|0.0| 0.17543859649122806|
|0.0| 0.19298245614035087|
|0.0| 0.21052631578947367|
|0.0| 0.22807017543859648|
|0.0| 0.24561403508771928|
|0.0|  0.2631578947368421|
|0.0|  0.2807017543859649|
|0.0|  0.2982456140350877|
|0.0|  0.3157894736842105|
|0.0|  0.3333333333333333|
+---+--------------------+
only showing top 20 rows

areaUnderROC: 1.0


#### 选取最优阈值（最大化 F-Measure）

In [21]:
# Pick the decision threshold that maximizes F-Measure on the training summary.
fMeasure = trainingSummary.fMeasureByThreshold

# Single-row result holding the largest F-Measure value.
maxFMeasure = (
    fMeasure.groupBy()
    .max('F-Measure')
    .select('max(F-Measure)')
    .head()
)

# First threshold whose F-Measure equals that maximum.
bestThreshold = (
    fMeasure
    .where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)'])
    .select('threshold')
    .head()['threshold']
)

# NOTE(review): this updates the estimator `lr`; the already-fitted `lrModel`
# presumably keeps its original threshold until `lr` is fitted again — confirm.
lr.setThreshold(bestThreshold)

LogisticRegression_d12268e226af

In [22]:
# Display the threshold / F-Measure DataFrame computed in the previous cell.
fMeasure

threshold,F-Measure
0.7845860015371142,0.0344827586206896
0.7843193344168922,0.0677966101694915
0.7842976092510131,0.1
0.7842531051133191,0.1311475409836065
0.7835792429453297,0.1612903225806451
0.7835223585829078,0.1904761904761905
0.783284563364102,0.21875
0.7832449070254992,0.2461538461538461
0.7830630257264691,0.2727272727272727
0.7830068256743365,0.2985074626865671


In [25]:
# Inspect the Row that holds the maximum F-Measure.
maxFMeasure

Row(max(F-Measure)=1.0)

In [26]:
# Inspect the threshold selected as maximizing F-Measure.
bestThreshold

0.5585022394278357

### 决策树

#### 导入数据

In [28]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Read the LIBSVM-formatted sample file into a DataFrame (feature-vector size fixed at 780).
# NOTE(review): the path is an absolute, machine-specific Windows path.
data = (
    spark.read.format("libsvm")
    .option("numFeatures", "780")
    .load(r"D:\spark\data\mllib\sample_libsvm_data.txt")
)

#### 标签向量化

In [31]:
# Map the raw "label" column to indices in a new "indexedLabel" column.
# The indexer is fitted on the whole dataset so that every label value is included in the index.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(data)

#### 特征向量化

In [32]:
# Detect categorical features automatically and index them into "indexedFeatures".
# maxCategories=4: features with more than 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

#### 拆分训练与测试集

In [37]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the test error reported below,
# the same on every Restart & Run All.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)


#### 模型实例化并建立pipeline

In [35]:
# Configure a decision-tree classifier that reads the indexed label/feature columns.
# Nothing is trained in this cell; fitting happens when the pipeline is fitted.
dt = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
)

# Chain the two indexers and the tree into one Pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

#### 训练模型

In [38]:
# Fit the whole pipeline on the training split. This also runs the indexers,
# so `model` contains every fitted stage.
model = pipeline.fit(trainingData)

#### 做预测

In [46]:
# Make predictions on the held-out test split.
predictions = model.transform(testData)

# Display example rows without truncating cell contents.
# The original call `show(,truncate=False)` was a SyntaxError because the row
# count was missing; 15 matches the "only showing top 15 rows" recorded output.
predictions.select("prediction", "indexedLabel", "probability").show(15, truncate=False)

+----------+------------+-----------+
|prediction|indexedLabel|probability|
+----------+------------+-----------+
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|0.0       |1.0         |[1.0,0.0]  |
|0.0       |1.0         |[1.0,0.0]  |
|1.0       |1.0         |[0.0,1.0]  |
|0.0       |0.0         |[1.0,0.0]  |
+----------+------------+-----------+
only showing top 15 rows



In [43]:
# List the columns of the prediction DataFrame and their types.
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- indexedLabel: double (nullable = false)
 |-- indexedFeatures: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



#### 计算准确率（测试误差）

In [47]:
# Compare predictions with the indexed true labels using accuracy,
# then report the complement (1 - accuracy) as the test error.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.0625 


In [48]:
# Index 2 is the classifier stage: the pipeline stages were given as
# [labelIndexer, featureIndexer, dt].
treeModel = model.stages[2]
# Printing the model gives a one-line summary only, not the full tree.
print(treeModel)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e9633079c0d3, depth=1, numNodes=3, numClasses=2, numFeatures=780


### 随机森林

#### 导入数据集

In [49]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Read the LIBSVM-formatted sample file into a DataFrame (feature-vector size fixed at 780).
# NOTE(review): the path is an absolute, machine-specific Windows path.
data = (
    spark.read.format("libsvm")
    .option("numFeatures", "780")
    .load(r"D:\spark\data\mllib\sample_libsvm_data.txt")
)

#### 标签向量化

In [50]:
# Map the raw "label" column to indices in a new "indexedLabel" column.
# The indexer is fitted on the whole dataset so that every label value is included in the index.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(data)

#### 特征向量化

In [51]:
# Detect categorical features automatically and index them into "indexedFeatures".
# maxCategories=4: features with more than 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

#### 数据拆分

In [52]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the test error reported below,
# the same on every Restart & Run All.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

#### 模型实例化

In [55]:
# Configure a random-forest classifier with 10 trees.
# Nothing is trained in this cell; fitting happens when the pipeline is fitted.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
)

#### 转换索引标签到原始标签

In [56]:
# Translate predicted indices back to the original label values, using the
# label list learned by `labelIndexer`; the result goes to "predictedLabel".
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

#### 建立pipeline

In [57]:
# Assemble the stages in execution order: index labels, index features,
# run the forest, then convert predicted indices back to labels.
pipeline = Pipeline(
    stages=[labelIndexer, featureIndexer, rf, labelConverter],
)

#### 训练模型

In [58]:
# Fit the whole pipeline on the training split. This also runs the indexers,
# so `model` contains every fitted stage.
model = pipeline.fit(trainingData)

#### 模型预测

In [60]:
# Apply the fitted pipeline to the held-out test split.
predictions = model.transform(testData)

# Show five example rows: converted predicted label, original label, and features.
predictions.select("predictedLabel", "label", "features").show(n=5)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(780,[95,96,97,12...|
|           0.0|  0.0|(780,[98,99,100,1...|
|           1.0|  0.0|(780,[100,101,102...|
|           0.0|  0.0|(780,[124,125,126...|
|           0.0|  0.0|(780,[124,125,126...|
+--------------+-----+--------------------+
only showing top 5 rows



#### 效果评估

In [61]:
# Compare predictions with the indexed true labels using accuracy,
# then report the complement (1 - accuracy) as the test error.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0322581


#### 打印模型描述

In [62]:
# Index 2 is the classifier stage: the pipeline stages were given as
# [labelIndexer, featureIndexer, rf, labelConverter].
rfModel = model.stages[2]
print(rfModel)  # summary only

RandomForestClassificationModel: uid=RandomForestClassifier_24c5987ae81b, numTrees=10, numClasses=2, numFeatures=780


### GBDT

#### 导入数据集

In [63]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Read the LIBSVM-formatted sample file into a DataFrame (feature-vector size fixed at 780).
# NOTE(review): the path is an absolute, machine-specific Windows path.
data = (
    spark.read.format("libsvm")
    .option("numFeatures", "780")
    .load(r"D:\spark\data\mllib\sample_libsvm_data.txt")
)

#### 标签向量化

In [64]:
# Map the raw "label" column to indices in a new "indexedLabel" column.
# The indexer is fitted on the whole dataset so that every label value is included in the index.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(data)

#### 特征向量化

In [65]:
# Detect categorical features automatically and index them into "indexedFeatures".
# maxCategories=4: features with more than 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

#### 切分训练与预测

In [66]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the test error reported below,
# the same on every Restart & Run All.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

#### 模型实例化

In [67]:
# Configure a gradient-boosted-trees classifier limited to 10 boosting iterations.
# Nothing is trained in this cell; fitting happens when the pipeline is fitted.
gbt = GBTClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxIter=10,
)

#### 建立pipeline

In [69]:
# Assemble the stages in execution order: index labels, index features, run GBT.
pipeline = Pipeline(
    stages=[labelIndexer, featureIndexer, gbt],
)

#### 模型训练

In [70]:
# Fit the whole pipeline on the training split. This also runs the indexers,
# so `model` contains every fitted stage.
model = pipeline.fit(trainingData)

#### 模型预测

In [71]:
# Apply the fitted pipeline to the held-out test split.
predictions = model.transform(testData)

# Show five example rows: predicted index, true indexed label, and features.
predictions.select("prediction", "indexedLabel", "features").show(n=5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(780,[98,99,100,1...|
|       1.0|         1.0|(780,[121,122,123...|
|       1.0|         1.0|(780,[122,123,124...|
|       1.0|         1.0|(780,[123,124,125...|
|       1.0|         1.0|(780,[124,125,126...|
+----------+------------+--------------------+
only showing top 5 rows



#### 效果评估

In [73]:
# Compare predictions with the indexed true labels using accuracy,
# then report the complement (1 - accuracy) as the test error.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0833333


#### 打印模型参数

In [74]:
# Index 2 is the classifier stage: the pipeline stages were given as
# [labelIndexer, featureIndexer, gbt].
gbtModel = model.stages[2]
print(gbtModel)  # summary only

GBTClassificationModel: uid = GBTClassifier_0ade67c2136d, numTrees=10, numClasses=2, numFeatures=780
