### 监督学习算法

#### 线性回归

In [16]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this example.
spark = (
    SparkSession.builder
    .appName("LinearRegressionWithElasticNet")
    .getOrCreate()
)

# Load the training data (libsvm format).
training = (
    spark.read.format("libsvm")
    .load("data/mllib/sample_linear_regression_data.txt")
)

# Configure the regression estimator (regularization strength and
# elastic-net mixing parameter) and fit it to the training data.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(training)

# Print the fitted coefficients and intercept.
print("coefficients: ", lr_model.coefficients)
print("intercept: ", lr_model.intercept)

# Training summary: iteration count, objective history, residuals and
# goodness-of-fit metrics.
trainingSummary = lr_model.summary
print('totalIterations: ', trainingSummary.totalIterations)
print('objectiveHistory: ', trainingSummary.objectiveHistory)
# DataFrame.show() prints the table itself and returns None, so it must not
# be passed to print(): doing so emits the table *before* the label and then
# a misleading "residuals:  None" line.
print('residuals: ')
trainingSummary.residuals.show()
print(f"RMSE: {trainingSummary.rootMeanSquaredError:f}")
print(f"r2: {trainingSummary.r2:f}")

coefficients:  [0.0,0.32292516677405936,-0.3438548034562218,1.9156017023458414,0.05288058680386263,0.765962720459771,0.0,-0.15105392669186682,-0.21587930360904642,0.22025369188813426]
intercept:  0.1598936844239736
totalIterations:  7
objectiveHistory:  [0.49999999999999994, 0.4967620357443381, 0.4936361664340463, 0.4936351537897608, 0.4936351214177871, 0.49363512062528014, 0.4936351206216114]
+--------------------+
|           residuals|
+--------------------+
|  -9.889232683103197|
|  0.5533794340053554|
|  -5.204019455758823|
| -20.566686715507508|
|    -9.4497405180564|
|  -6.909112502719486|
|  -10.00431602969873|
|   2.062397807050484|
|  3.1117508432954772|
| -15.893608229419382|
|  -5.036284254673026|
|   6.483215876994333|
|  12.429497299109002|
|  -20.32003219007654|
| -2.0049838218725005|
| -17.867901734183793|
|   7.646455887420495|
| -2.2653482182417406|
|-0.10308920436195645|
|  -1.380034070385301|
+--------------------+
only showing top 20 rows

residuals:  None
RMSE: 10