In [1]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session under the application name 'lin_reg'.
spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Load the CSV, treating the first line as a header and letting Spark
# infer each column's type.
df = spark.read.csv(
    'linear_reg.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the dataset shape as a (row count, column count) tuple.
print((df.count(), len(df.columns)))

(1150, 6)


In [5]:
# Show each column's name and the type Spark inferred while reading the CSV.
df.printSchema()

root
 |-- var1: integer (nullable = true)
 |-- var2: integer (nullable = true)
 |-- var3: integer (nullable = true)
 |-- var4: double (nullable = true)
 |-- var5: double (nullable = true)
 |-- result: double (nullable = true)



In [6]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+-----------------+-----------------+------------------+--------------------+--------------------+--------------------+
|summary|             var1|             var2|              var3|                var4|                var5|              result|
+-------+-----------------+-----------------+------------------+--------------------+--------------------+--------------------+
|  count|             1150|             1150|              1150|                1150|                1150|                1150|
|   mean|643.6973913043478|644.2704347826087| 71.62608695652175|  0.3191382608695646| 0.15598347826086953|  0.3919947826086955|
| stddev|63.10662570038112|64.78680908019999|13.753493658264567|0.030429290211964388|0.010494753386318759|0.034980138199992294|
|    min|              460|              461|                40|               0.161|                0.11|               0.301|
|    max|             1009|             1103|               116|               0.369|               0.19

In [7]:
# Peek at the first three rows, returned as a list of Row objects.
df.head(3)

[Row(var1=634, var2=666, var3=61, var4=0.316, var5=0.159, result=0.416),
 Row(var1=600, var2=600, var3=94, var4=0.31, var5=0.146, result=0.369),
 Row(var1=611, var2=605, var3=93, var4=0.311, var5=0.146, result=0.416)]

In [8]:
from pyspark.sql.functions import corr

In [10]:
# Correlation between the predictor 'var1' and the 'result' column.
df.select(corr('var1','result')).show()

+------------------+
|corr(var1, result)|
+------------------+
| 0.512530311949889|
+------------------+



In [11]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [12]:
# List the column names; every column except 'result' is used as a model input.
df.columns

['var1', 'var2', 'var3', 'var4', 'var5', 'result']

In [13]:
# Assembler that packs the five predictor columns into one vector column
# named 'features'.
vec_assmebler = VectorAssembler(
    inputCols=['var1', 'var2', 'var3', 'var4', 'var5'],
    outputCol='features',
)

In [14]:
# Apply the assembler: the result keeps the original columns and gains the
# 'features' vector column.
features_df=vec_assmebler.transform(df)

In [15]:
# Confirm that the schema now includes the 'features' vector column.
features_df.printSchema()

root
 |-- var1: integer (nullable = true)
 |-- var2: integer (nullable = true)
 |-- var3: integer (nullable = true)
 |-- var4: double (nullable = true)
 |-- var5: double (nullable = true)
 |-- result: double (nullable = true)
 |-- features: vector (nullable = true)



In [16]:
# Display two assembled feature vectors without truncating the cell text.
features_df.select('features').show(2,truncate=False)

+------------------------------+
|features                      |
+------------------------------+
|[634.0,666.0,61.0,0.316,0.159]|
|[600.0,600.0,94.0,0.31,0.146] |
+------------------------------+
only showing top 2 rows



In [17]:
# Keep only the two columns used for modelling: the feature vector and the
# 'result' label.
model_df=features_df.select('features','result')

In [18]:
# Show the first five (features, result) rows at full width.
model_df.show(5, truncate=False)

+------------------------------+------+
|features                      |result|
+------------------------------+------+
|[634.0,666.0,61.0,0.316,0.159]|0.416 |
|[600.0,600.0,94.0,0.31,0.146] |0.369 |
|[611.0,605.0,93.0,0.311,0.146]|0.416 |
|[634.0,606.0,69.0,0.315,0.16] |0.415 |
|[613.0,659.0,61.0,0.301,0.14] |0.366 |
+------------------------------+------+
only showing top 5 rows



In [19]:
# Split roughly 70% / 30% into training and test sets. A fixed seed makes
# the split, and every metric computed from it, repeatable across re-runs
# (given the same input data and partitioning); without a seed each run
# produces a different partition.
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [21]:
# Summary statistics of the training split.
train_df.describe().show()

+-------+-------------------+
|summary|             result|
+-------+-------------------+
|  count|                806|
|   mean| 0.3923002481389583|
| stddev|0.03448916185778715|
|    min|               0.31|
|    max|              0.469|
+-------+-------------------+



In [22]:
# Linear regression estimator that reads inputs from 'features' and the
# label from 'result'.
lin_reg = LinearRegression(
    featuresCol='features',
    labelCol='result',
)

In [23]:
# Fit the estimator on the training split to obtain the trained model.
lr_model=lin_reg.fit(train_df)

In [24]:
# Print the fitted intercept and coefficients. The printed labels are runtime
# strings: '方程截距' = equation intercept, '方程参数系数' = equation coefficients.
print('{}{}'.format('方程截距:',lr_model.intercept))
print('{}{}'.format('方程参数系数:',lr_model.coefficients))  # coefficients of the independent variables in the regression equation

方程截距:-0.0448741221007
方程参数系数:[6.201299484897484e-05,0.00010179604468040109,0.0004519575988047111,0.1427404941053289,1.6236235191044002]


In [25]:
# Print the coefficient vector on its own.
print(lr_model.coefficients)

[6.201299484897484e-05,0.00010179604468040109,0.0004519575988047111,0.1427404941053289,1.6236235191044002]


In [26]:
# Evaluate the model on the training data. Despite the variable name, the
# returned object is used for its summary metrics (meanSquaredError, r2).
training_predictions=lr_model.evaluate(train_df)

In [27]:
# Training-set fit quality. The printed labels are runtime strings and are
# kept as-is: '均方误差' = mean squared error, 'R2判定系数' = R2 coefficient
# of determination.
print('{}{}'.format('均方误差:',training_predictions.meanSquaredError))  # mean of the squared errors
print('{}{}'.format('R2判定系数：',training_predictions.r2))  # R2 indicates how well the fitted model predicts; larger means a better fit

均方误差:0.000493787041997
R2判定系数：0.584363604835


In [28]:
# R2 on the training data, shown as the cell's output value.
training_predictions.r2

0.5843636048352541

In [29]:
# Evaluate the trained model on the held-out test split.
test_results=lr_model.evaluate(test_df)

In [30]:
# Show the first ten residuals from the test-set evaluation.
test_results.residuals.show(10)

+--------------------+
|           residuals|
+--------------------+
| 0.04057100269836972|
|0.020256134733585784|
| 0.01987874430544695|
|-0.01024864302487...|
|0.003389702064237...|
|0.004924344039997364|
|0.024724843364441307|
| 0.04313982591419813|
|0.012915417413048857|
|-0.00913028485932...|
+--------------------+
only showing top 10 rows



In [31]:
# R2 on the test split.
test_results.r2

0.5495490308189562

In [32]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

0.024223963230906198

In [33]:
# Mean squared error on the test split.
test_results.meanSquaredError

0.0005868003946122955