### 导入数据

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import os
# Pass the two XGBoost4J jars to the pyspark shell through --jars in the submit arguments.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars xgboost4j-spark_2.12-1.5.2.jar,xgboost4j_2.12-1.5.2.jar pyspark-shell'

# The same netty JVM flag is applied to both the driver and the executors.
conf = (
    SparkConf()
    .set("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
    .set("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
)

# Reuse an existing session if one is already running, otherwise create it from `conf`.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Enable the eager-evaluation REPL option on the live session.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [2]:
from pyspark.ml.wrapper import JavaPredictionModel

In [3]:
import findspark
# NOTE(review): pyspark has already been imported and the SparkSession created in
# the earlier cells, so this call presumably has no effect at this point -- confirm,
# and consider moving it ahead of the first pyspark import.
findspark.init()

In [4]:
# Register the sparkxgb wrapper archive as a Python dependency of this SparkContext.
# The location can be overridden with the SPARKXGB_ZIP environment variable so the
# notebook is not tied to one machine; the default is the original local path.
spark.sparkContext.addPyFile(os.environ.get('SPARKXGB_ZIP', r"D:\spark\sparkxgb.zip"))

In [5]:
from sklearn import datasets
import pandas as pd
import logging
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import pyspark.ml.tuning as tune
from pyspark.ml import Transformer,Pipeline
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

# Load the scikit-learn breast-cancer dataset as a feature matrix and a target vector.
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target

# Generic feature names f1 .. f30.
columns = [f'f{i}' for i in range(1, 31)]

# Features and target side by side in one pandas frame; the target column is 'label'.
df = pd.concat(
    [
        pd.DataFrame(X, columns=columns),
        pd.DataFrame(y, columns=['label']),
    ],
    axis=1,
)

In [6]:
# Convert the pandas frame to a Spark DataFrame: rows are passed as plain Python
# lists and the pandas column names are used as the schema.
spark_df = spark.createDataFrame(
    df.values.tolist(),
    schema=df.columns.tolist(),
)

In [7]:
# Preview the first five rows without truncating cell contents.
spark_df.show(n=5, truncate=False)

+-----+-----+-----+------+-------+-------+------+-------+------+-------+------+------+-----+-----+--------+-------+-------+-------+-------+--------+-----+-----+-----+------+------+------+------+------+------+-------+-----+
|f1   |f2   |f3   |f4    |f5     |f6     |f7    |f8     |f9    |f10    |f11   |f12   |f13  |f14  |f15     |f16    |f17    |f18    |f19    |f20     |f21  |f22  |f23  |f24   |f25   |f26   |f27   |f28   |f29   |f30    |label|
+-----+-----+-----+------+-------+-------+------+-------+------+-------+------+------+-----+-----+--------+-------+-------+-------+-------+--------+-----+-----+-----+------+------+------+------+------+------+-------+-----+
|17.99|10.38|122.8|1001.0|0.1184 |0.2776 |0.3001|0.1471 |0.2419|0.07871|1.095 |0.9053|8.589|153.4|0.006399|0.04904|0.05373|0.01587|0.03003|0.006193|25.38|17.33|184.6|2019.0|0.1622|0.6656|0.7119|0.2654|0.4601|0.1189 |0.0  |
|20.57|17.77|132.9|1326.0|0.08474|0.07864|0.0869|0.07017|0.1812|0.05667|0.5435|0.7339|3.398|74.08|0.005225|0

### 特征向量化

In [None]:
# Combine the individual feature columns listed in `columns` into a single
# vector column named "features".
vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")

### 建立pipeline

In [None]:
# Single-stage pipeline containing only the vector assembler.
pipeline = Pipeline(stages=[vecAssembler])
# Fit the pipeline on the full Spark DataFrame to obtain a reusable fitted pipeline.
pipelineFit = pipeline.fit(spark_df)

### 对数据集做转换，增加特征列

In [None]:
# Apply the fitted pipeline; the result carries the assembled "features" column
# configured on the vector assembler.
dataset = pipelineFit.transform(spark_df)

### 拆分训练与测试

In [None]:
# 70/30 random split with a fixed seed.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=123)

# Report the size of each partition.
for title, part in (
    ("Training Dataset Count: ", trainingData),
    ("Test Dataset Count: ", testData),
):
    print(title + str(part.count()))

### 模型训练

In [None]:
# Train the model: logistic regression with at most 20 iterations,
# regParam=0.3 and elasticNetParam=0, fitted on the training split.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

### 模型预测

In [None]:
# Score the held-out test split with the fitted logistic-regression model.
prediction = lrModel.transform(testData)

### 模型效果评估

In [None]:
# Evaluator that reads the 'probability' column as the raw score.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability')

# Report both ranking metrics by overriding metricName per call.
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(prediction, {evaluator.metricName: metric}))

# Accuracy: share of rows whose predicted class equals the label.
n_correct = prediction.filter(prediction.label == prediction.prediction).count()
print('accuracy:', n_correct / prediction.count())


### 进一步调参

In [None]:
# Parameter grid for cross-validation:
# 7 regParam values x 3 elasticNetParam values = 21 candidate settings.
grid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1,0.2,0.25,0.3,0.35,0.4, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # elastic-net mixing parameter (0 = ridge)
             .build())
# Candidates are ranked by area under the ROC curve computed from the 'probability' column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability',metricName='areaUnderROC')
# 5-fold CrossValidator (the earlier comment said 3-fold, which did not match numFolds).
# The seed is fixed so the fold assignment, and therefore the selected parameters,
# are repeatable across runs; 123 matches the seed used for the train/test split.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=123)
cvModel = cv.fit(trainingData)

### 输出参数

In [None]:
# Pair every evaluated parameter combination with its average cross-validation metric.
results = []
for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):
    settings = [{param.name: value} for param, value in params.items()]
    results.append((settings, metric))

# Order by metric, best first, and display the top combination.
sorted(results, key=lambda entry: entry[1], reverse=True)[0]

### 运用上述参数再次训练

In [None]:
# Refit logistic regression with fixed hyper-parameters.
# NOTE(review): regParam=0.1 / elasticNetParam=0 are presumably the best values
# reported by the cross-validation cell above -- confirm after re-running it.
lr_new = LogisticRegression(maxIter=20, regParam=0.1, elasticNetParam=0)
lrModel_new = lr_new.fit(trainingData)

### 模型预测与效果评估

In [None]:
# Score the test split with the refitted logistic-regression model.
prediction_new = lrModel_new.transform(testData)

# Report both ranking metrics by overriding metricName per call.
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(prediction_new, {evaluator.metricName: metric}))

# Accuracy: share of rows whose predicted class equals the label.
n_correct = prediction_new.filter(prediction_new.label == prediction_new.prediction).count()
print('accuracy:', n_correct / prediction_new.count())

### 运用随机森林

#### 设置基本参数与网格搜索

In [None]:
# Base random-forest estimator; numTrees, maxDepth and maxBins are overridden by the grid below.
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30, labelCol="label", seed=123)
# Grid of 13 numTrees x 4 maxDepth x 3 maxBins = 156 combinations. With 5 folds this
# is 780 model fits, so this cell is by far the most expensive in the notebook.
grid = (ParamGridBuilder().addGrid(rf.numTrees, [1, 3, 5,10,20,30,40,50,60,70,80,90,100])
                          .addGrid(rf.maxDepth, [3, 5, 7, 10])
                          .addGrid(rf.maxBins, [20, 30, 40])
                          .build())
# Candidates are ranked by area under the ROC curve computed from the 'probability' column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability',metricName='areaUnderROC')
# 5-fold cross-validation. The estimator was already seeded but the fold split was
# not; fixing the CrossValidator seed makes the selected parameters repeatable.
cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=5,
                    seed=123)
cvModel_rf = cv.fit(trainingData)
 

#### 评估指标

In [None]:
# Score the test split with the cross-validated random-forest model and display
# the evaluator's configured metric (areaUnderROC, as set when it was built).
predictions = cvModel_rf.transform(testData)
evaluator.evaluate(predictions)

In [None]:
# Report both ranking metrics by overriding metricName per call.
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(predictions, {evaluator.metricName: metric}))

# Accuracy: share of rows whose predicted class equals the label.
n_correct = predictions.filter(predictions.label == predictions.prediction).count()
print('accuracy:', n_correct / predictions.count())

#### 获得最优参数

In [None]:
# Pair every evaluated parameter combination with its average cross-validation metric.
results = []
for params, metric in zip(cvModel_rf.getEstimatorParamMaps(), cvModel_rf.avgMetrics):
    settings = [{param.name: value} for param, value in params.items()]
    results.append((settings, metric))

# Order by metric, best first, and display the top combination.
sorted(results, key=lambda entry: entry[1], reverse=True)[0]


#### 重新训练

In [None]:
# Refit a random forest with fixed hyper-parameters on the training split.
# NOTE(review): numTrees=60 / maxDepth=7 / maxBins=30 are presumably the best
# combination reported by the grid search above -- confirm after re-running it.
rf_new = RandomForestClassifier(numTrees=60, maxDepth=7, maxBins=30, labelCol="label", seed=123)
rfModel_new = rf_new.fit(trainingData)

In [None]:
# Score the test split with the refitted random forest.
rf_prediction_new = rfModel_new.transform(testData)

# Report both ranking metrics by overriding metricName per call.
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(rf_prediction_new, {evaluator.metricName: metric}))

# Accuracy: share of rows whose predicted class equals the label.
n_correct = rf_prediction_new.filter(rf_prediction_new.label == rf_prediction_new.prediction).count()
print('accuracy:', n_correct / rf_prediction_new.count())

#### 混淆矩阵

In [None]:
# Confusion-matrix counts for the refitted random forest, gathered with a single
# grouped aggregation instead of four separate filter-and-count jobs.
# Keys are (label, prediction) cast to int; combinations that never occur are
# absent from the result and therefore default to 0.
cell_counts = {
    (int(row['label']), int(row['prediction'])): row['count']
    for row in rf_prediction_new.groupBy('label', 'prediction').count().collect()
}
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# A zero denominator (e.g. no positive predictions) now prints nan instead of
# raising ZeroDivisionError.
print ('accuracy is : %f'%_ratio(tp+tn, tp+tn+fp+fn)) # accuracy
print ('recall is : %f'%_ratio(tp, tp+fn)) # recall
print ('precision is : %f'%_ratio(tp, tp+fp)) # precision

In [None]:
# Inspect the column names and types of the scored test set.
rf_prediction_new.printSchema()

In [None]:
# Preview the first five scored rows without truncating cell contents.
rf_prediction_new.show(n=5, truncate=False)

### XGBoostClassifier

#### 因为无法导入sparkxgboost，暂时无法实现

#### 建立数据集

In [None]:
# Rebuild the Spark DataFrame from the pandas frame (same construction as the
# earlier data-loading section): rows as plain lists, pandas column names as schema.
spark_df = spark.createDataFrame(
    df.values.tolist(),
    schema=df.columns.tolist(),
)

#### 建立特征索引

In [None]:
# Combine the individual feature columns listed in `columns` into a single
# vector column named "features".
vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")

#### 创建管道命令

In [None]:
# Single-stage pipeline containing only the vector assembler.
pipeline = Pipeline(stages=[vecAssembler])
# Fit on the full Spark DataFrame, then apply it to obtain the "features" column.
pipelineFit = pipeline.fit(spark_df)
dataset = pipelineFit.transform(spark_df)

#### 设置训练与测试集

In [None]:
# 70/30 random split with a fixed seed.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=123)

# Report the size of each partition.
for title, part in (
    ("Training Dataset Count: ", trainingData),
    ("Test Dataset Count: ", testData),
):
    print(title + str(part.count()))

#### 模型训练

In [None]:

# Create and fit the XGBoost classifier.
# XGBoostClassifier was never imported in this notebook, so this cell raised
# NameError as written. Import it here from the sparkxgb wrapper that was
# registered earlier with addPyFile.
# NOTE(review): assumes the sparkxgb archive exposes XGBoostClassifier at the
# package top level -- confirm against the zip actually in use.
from sparkxgb import XGBoostClassifier

# NOTE(review): numWorkers=10 presumably requires that many concurrent task slots
# and missing=0.0 presumably treats zero feature values as missing -- confirm both
# are intended for this environment and dataset.
xgb = XGBoostClassifier(featuresCol = 'features', labelCol = 'label',predictionCol='prediction',missing=0.0,numRound=50,numWorkers=10)
preModel = xgb.fit(trainingData)

In [None]:
# Score the test split with the fitted XGBoost model.
out1 = preModel.transform(testData)