# This is the linear regression example on the MLlib documentation page. 

## Required data file: sample_linear_regression_data.txt

### Resource: 

### Spark MLlib classification and regression documentation
### https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression


### Step 1: create a spark session

In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.ml.regression import LinearRegression

# Create the Spark session (or reuse a running one) under the app name 'lrex'.
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

### Step 2: read data

In [3]:
# Load the LIBSVM-formatted sample file into a DataFrame with a 'label' column
# and a 10-dimensional 'features' vector column.
# No `header` option is passed: that is a CSV-reader setting, and LIBSVM files
# carry no header row.
all_data = (
    spark.read.format('libsvm')
    .option('numFeatures', '10')
    .load('sample_linear_regression_data.txt')
)

### Step 3: split data into training set and test set

In [4]:
# Randomly partition the rows with weights 0.7 / 0.3 (training / test).
# No seed is given, so the exact split differs from run to run.
(train_data, test_data) = all_data.randomSplit(weights=[0.7, 0.3])

In [5]:
# Print summary statistics (count, mean, stddev, min, max) for the training split.
train_data.describe().show()

+-------+--------------------+
|summary|               label|
+-------+--------------------+
|  count|                 330|
|   mean|-0.19453058780512283|
| stddev|  10.551928302622024|
|    min| -28.571478869743427|
|    max|   27.78383192005107|
+-------+--------------------+



In [6]:
# Print the same summary statistics for the test split.
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                171|
|   mean| 1.1280490871074889|
| stddev|  9.822073891920821|
|    min|-26.805483428483072|
|    max|  23.52945433069272|
+-------+-------------------+



### Step 4: build a linear regression model

In [7]:
# Configure the estimator; the input and output column names are spelled out
# explicitly so the expected DataFrame schema is visible at a glance.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [8]:
# Fit the linear regression on the training split only; the test split stays held out.
lrModel = lr.fit(train_data)

In [9]:
# Display the fitted coefficient vector (one weight per feature, 10 in total).
lrModel.coefficients

DenseVector([-0.7402, 1.2976, -2.1174, 3.3696, 1.2225, 0.5393, 0.4601, -0.2842, 0.1744, 0.9593])

In [10]:
# Display the fitted intercept term.
lrModel.intercept

-0.37270194004255297

### Step 5: evaluate the model

In [13]:
# The model's summary holds metrics computed on the data it was fitted on,
# so the RMSE printed here is a training-set figure, not a test-set one.
training_summary = lrModel.summary
print('RMSE: %f' % training_summary.rootMeanSquaredError)

RMSE: 10.246268


In [14]:
# R-squared from the same training summary.
print("r2: %f" % training_summary.r2)

r2: 0.054229


In [15]:
# Evaluate the fitted model on the held-out test split.
test_results = lrModel.evaluate(test_data)

In [16]:
# Show the per-row residuals on the test split (first 20 rows by default).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -28.21880330062654|
|-26.598478233001604|
| -20.91064935067119|
| -19.18710416164714|
|-16.154615011657647|
|-12.824004300906523|
|-19.718944831957206|
| -17.28007992664985|
|-13.242860228769123|
|-18.743278801108758|
| -18.20144719909078|
| -11.16875661565154|
| -9.600630873112701|
| -11.14183898259143|
|-17.568243490384546|
|-14.354398291968206|
|-13.014125302376993|
|-13.913541023263381|
|-13.507689426773645|
|-11.514351824782237|
+-------------------+
only showing top 20 rows



### Step 6: use the model to make predictions on unlabeled test features

In [17]:
# Keep only the 'features' column, dropping the labels, to mimic unseen data.
unlabeled_data = test_data.select('features')

In [18]:
# Apply the fitted model; the result carries the input columns plus a 'prediction' column.
predictions = lrModel.transform(unlabeled_data)

In [18]:
# Print the feature vectors alongside their predicted values.
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  -2.994373361212311|
|(10,[0,1,2,3,4,5,...| -0.6266407791305711|
|(10,[0,1,2,3,4,5,...| -0.9599992242663343|
|(10,[0,1,2,3,4,5,...|  1.3386662681665213|
|(10,[0,1,2,3,4,5,...| -1.0626058597096455|
|(10,[0,1,2,3,4,5,...|   1.942706685103939|
|(10,[0,1,2,3,4,5,...|-0.07351608698263443|
|(10,[0,1,2,3,4,5,...|   2.584602412893303|
|(10,[0,1,2,3,4,5,...|  1.2730637825886497|
|(10,[0,1,2,3,4,5,...| -0.3193704495364642|
|(10,[0,1,2,3,4,5,...|  -2.302411537437591|
|(10,[0,1,2,3,4,5,...| -1.6313114020407997|
|(10,[0,1,2,3,4,5,...|  0.5813510767132104|
|(10,[0,1,2,3,4,5,...|  -3.132419285303051|
|(10,[0,1,2,3,4,5,...|  1.5763353678621195|
|(10,[0,1,2,3,4,5,...|0.003621351125055...|
|(10,[0,1,2,3,4,5,...| 0.06266634034898606|
|(10,[0,1,2,3,4,5,...| -2.2479716905876805|
|(10,[0,1,2,3,4,5,...|  1.5895776283074603|
|(10,[0,1,2,3,4,5,...| 0.4755839