<a href="https://colab.research.google.com/github/SongZhou-Meg/elasticsearch/blob/main/PySpark_DataFrame_SQL_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [5]:
# Load the csv into a dataframe
titanic_df = spark.read.csv("train.csv", header=True, inferSchema=True)
titanic_df

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Tim...",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nich...",female,14.0,1,0,237736,30.0708,,C


# New Section

In [6]:
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
titanic_df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S


In [8]:
titanic_df.select('PassengerId', 'Survived')

PassengerId,Survived
1,0
2,1
3,1
4,1
5,0
6,0
7,0
8,0
9,1
10,1


In [9]:
titanic_df.where((titanic_df.Age > 25) & (titanic_df.Survived == 1)).limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S
12,1,1,"Bonnell, Miss. El...",female,58.0,0,0,113783,26.55,C103,S


In [10]:
titanic_df.agg({'Fare':'avg'})

avg(Fare)
32.2042079685746


In [11]:
titanic_df.groupBy('Pclass').agg({'Fare':'avg'}).orderBy('Pclass', ascending=False)

Pclass,avg(Fare)
3,13.675550101832997
2,20.66218315217391
1,84.15468749999992


In [12]:
titanic_df.filter(titanic_df.Age > 25).agg({'Fare':'avg'})

avg(Fare)
37.61960169491524


In [13]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

def round_float_down(x):
  return int(x)

round_float_down_udf = udf(round_float_down, IntegerType())

titanic_df.select('PassengerId', 'Fare', round_float_down_udf('Fare').alias('Fare Rounded Down')).limit(5)

PassengerId,Fare,Fare Rounded Down
1,7.25,7
2,71.2833,71
3,7.925,7
4,53.1,53
5,8.05,8


In [14]:
titanic_df.createOrReplaceTempView("Titanic")

In [15]:
spark.sql('select * from Titanic')

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Tim...",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nich...",female,14.0,1,0,237736,30.0708,,C


**New Section **


In [16]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf

spark_conf = SparkConf()\
  .setAppName("YourTest")\
  .setMaster("local[*]")

sc = SparkContext.getOrCreate(spark_conf)

In [17]:
sc

In [18]:

nums = list(range(0, 1000001))
len(nums)

1000001

In [19]:

nums_rdd = sc.parallelize(nums)
nums_rdd 

ParallelCollectionRDD[124] at readRDDFromFile at PythonRDD.scala:274

In [20]:
nums_rdd.collect()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [21]:
nums_rdd.take(5)

[0, 1, 2, 3, 4]

In [22]:
squared_nums_rdd = nums_rdd.map(lambda x:x **2)
squared_nums_rdd.take(5)

[0, 1, 4, 9, 16]

In [23]:
pairs = squared_nums_rdd.map(lambda x: (x, len(str(x))))
pairs.take(25)

[(0, 1),
 (1, 1),
 (4, 1),
 (9, 1),
 (16, 2),
 (25, 2),
 (36, 2),
 (49, 2),
 (64, 2),
 (81, 2),
 (100, 3),
 (121, 3),
 (144, 3),
 (169, 3),
 (196, 3),
 (225, 3),
 (256, 3),
 (289, 3),
 (324, 3),
 (361, 3),
 (400, 3),
 (441, 3),
 (484, 3),
 (529, 3),
 (576, 3)]

In [24]:
even_digit_pairs = pairs.filter(lambda x: (x[1] % 2) == 0)
even_digit_pairs.take(25)

[(16, 2),
 (25, 2),
 (36, 2),
 (49, 2),
 (64, 2),
 (81, 2),
 (1024, 4),
 (1089, 4),
 (1156, 4),
 (1225, 4),
 (1296, 4),
 (1369, 4),
 (1444, 4),
 (1521, 4),
 (1600, 4),
 (1681, 4),
 (1764, 4),
 (1849, 4),
 (1936, 4),
 (2025, 4),
 (2116, 4),
 (2209, 4),
 (2304, 4),
 (2401, 4),
 (2500, 4)]

In [25]:
flipped_pairs = even_digit_pairs.map(lambda x: (x[1], x[0]))
flipped_pairs.take(25)

[(2, 16),
 (2, 25),
 (2, 36),
 (2, 49),
 (2, 64),
 (2, 81),
 (4, 1024),
 (4, 1089),
 (4, 1156),
 (4, 1225),
 (4, 1296),
 (4, 1369),
 (4, 1444),
 (4, 1521),
 (4, 1600),
 (4, 1681),
 (4, 1764),
 (4, 1849),
 (4, 1936),
 (4, 2025),
 (4, 2116),
 (4, 2209),
 (4, 2304),
 (4, 2401),
 (4, 2500)]

In [26]:
grouped = flipped_pairs.groupByKey()
grouped.take(25)

[(2, <pyspark.resultiterable.ResultIterable at 0x7f5ba929f700>),
 (4, <pyspark.resultiterable.ResultIterable at 0x7f5ba929f7f0>),
 (6, <pyspark.resultiterable.ResultIterable at 0x7f5ba929fa60>),
 (8, <pyspark.resultiterable.ResultIterable at 0x7f5ba929fc10>),
 (10, <pyspark.resultiterable.ResultIterable at 0x7f5ba929f730>),
 (12, <pyspark.resultiterable.ResultIterable at 0x7f5ba929fe80>)]

In [27]:
grouped = grouped.map(lambda x : (x[0], list(x[1])))
grouped.take(2)

[(2, [16, 25, 36, 49, 64, 81]),
 (4,
  [1024,
   1089,
   1156,
   1225,
   1296,
   1369,
   1444,
   1521,
   1600,
   1681,
   1764,
   1849,
   1936,
   2025,
   2116,
   2209,
   2304,
   2401,
   2500,
   2601,
   2704,
   2809,
   2916,
   3025,
   3136,
   3249,
   3364,
   3481,
   3600,
   3721,
   3844,
   3969,
   4096,
   4225,
   4356,
   4489,
   4624,
   4761,
   4900,
   5041,
   5184,
   5329,
   5476,
   5625,
   5776,
   5929,
   6084,
   6241,
   6400,
   6561,
   6724,
   6889,
   7056,
   7225,
   7396,
   7569,
   7744,
   7921,
   8100,
   8281,
   8464,
   8649,
   8836,
   9025,
   9216,
   9409,
   9604,
   9801])]

In [28]:
averaged = grouped.map(lambda x : (x[0], sum(x[1]) / len(x[1])))
averaged.collect()

[(2, 45.166666666666664),
 (4, 4675.5),
 (6, 471838.0),
 (8, 47204941.666666664),
 (10, 4720705565.0),
 (12, 472075391214.1667)]

 regression with spark
 

In [29]:
import pandas as pd
import numpy as np
from sklearn import datasets


In [30]:
temp = datasets.load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [31]:
X = temp.data
y = temp.target
temp.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [32]:
temp.feature_names.tolist()

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT']

In [33]:
boston_df = pd.DataFrame(X, columns = temp.feature_names)
boston_df['target'] = y

In [34]:
mydf = spark.createDataFrame(boston_df)

In [35]:
mydf.show(5)

+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|target|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+
|0.00632|18.0| 2.31| 0.0|0.538|6.575|65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98|  24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421|78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14|  21.6|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185|61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03|  34.7|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998|45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94|  33.4|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147|54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33|  36.2|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+
only showing top 5 rows



In [36]:
mydf.where('ZN == 0').show(5)
mydf.where('ZN == 0').where('NOX == 0.469').show(5)
mydf.groupBy('ZN').count().show(5)

+-------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+
|   CRIM| ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|target|
+-------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+
|0.02731|0.0| 7.07| 0.0|0.469|6.421|78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14|  21.6|
|0.02729|0.0| 7.07| 0.0|0.469|7.185|61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03|  34.7|
|0.03237|0.0| 2.18| 0.0|0.458|6.998|45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94|  33.4|
|0.06905|0.0| 2.18| 0.0|0.458|7.147|54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33|  36.2|
|0.02985|0.0| 2.18| 0.0|0.458| 6.43|58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21|  28.7|
+-------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+
only showing top 5 rows

+-------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+
|   CRIM| ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|target|
+-------+---+-----

In [37]:
from pyspark.ml.feature import VectorAssembler

In [38]:
assembler = VectorAssembler(inputCols = temp.feature_names.tolist(), outputCol= 'features')
v_mydf = assembler.transform(mydf)
v_mydf.show(5)


+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+--------------------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|target|            features|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+--------------------+
|0.00632|18.0| 2.31| 0.0|0.538|6.575|65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98|  24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421|78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14|  21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185|61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03|  34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998|45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94|  33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147|54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33|  36.2|[0.06905,0.0,2.18...|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+------+--------------------+
only showing top 5 

In [39]:
v_mydf = v_mydf.select(['features','target'])
v_mydf.show(5)
(train_df, test_df) = v_mydf.randomSplit([0.7,0.3])

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.00632,18.0,2.3...|  24.0|
|[0.02731,0.0,7.07...|  21.6|
|[0.02729,0.0,7.07...|  34.7|
|[0.03237,0.0,2.18...|  33.4|
|[0.06905,0.0,2.18...|  36.2|
+--------------------+------+
only showing top 5 rows



In [40]:
train_df.show(5)
test_df.show(5)
print(train_df.count())
print(test_df.count())

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.0136,75.0,4.0,...|  18.9|
|[0.01381,80.0,0.4...|  50.0|
|[0.01778,95.0,1.4...|  32.9|
|[0.02009,95.0,2.6...|  50.0|
|[0.02055,85.0,0.7...|  24.7|
+--------------------+------+
only showing top 5 rows

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.00632,18.0,2.3...|  24.0|
|[0.01311,90.0,1.2...|  35.4|
|[0.01432,100.0,1....|  31.6|
|[0.01439,60.0,2.9...|  29.1|
|[0.01951,17.5,1.3...|  33.0|
+--------------------+------+
only showing top 5 rows

341
165


In [52]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [44]:
lr = LinearRegression(featuresCol = 'features', labelCol = 'target', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)

Exception ignored in: <function JavaWrapper.__del__ at 0x7f5b7c1a6700>
Traceback (most recent call last):
  File "/content/spark-3.1.1-bin-hadoop3.2/python/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LinearRegression' object has no attribute '_java_obj'


In [45]:
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [-0.06513829700541508,0.011808541369885104,-0.01432080849106297,2.521937309460358,-3.6901952656124446,4.334188509066453,0.0,-0.7088871625755429,0.004162500777967572,0.0,-0.6908239326965988,0.009448553420305685,-0.5886346971726636]
Intercept: 16.858533687150754


In [46]:
lr_model.summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x7f5b7b55ddf0>

In [47]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 4.966485
r2: 0.734126


In [48]:
train_df.describe().show()

+-------+------------------+
|summary|            target|
+-------+------------------+
|  count|               341|
|   mean|22.345454545454547|
| stddev|  9.64603018841025|
|    min|               5.0|
|    max|              50.0|
+-------+------------------+



In [49]:
test_df.describe().show()

+-------+-----------------+
|summary|           target|
+-------+-----------------+
|  count|              165|
|   mean|            22.92|
| stddev|8.206178517047904|
|    min|              7.0|
|    max|             50.0|
+-------+-----------------+



In [53]:
lr_predictions = lr_model.transform(test_df)

In [54]:
lr_evaluator = RegressionEvaluator(labelCol="target", predictionCol="prediction", metricName="rmse")
rmse = lr_evaluator.evaluate(lr_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 4.73151


In [50]:
# decision tree regression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'target')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="target", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 4.64495


In [51]:
# gradient-boosted tree regression
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'target', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'target', 'features').show(5)

gbt_evaluator = RegressionEvaluator(
    labelCol="target", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

+------------------+------+--------------------+
|        prediction|target|            features|
+------------------+------+--------------------+
| 26.23548890391768|  24.0|[0.00632,18.0,2.3...|
| 33.14484291623149|  35.4|[0.01311,90.0,1.2...|
| 31.74994136839635|  31.6|[0.01432,100.0,1....|
|27.267718481699312|  29.1|[0.01439,60.0,2.9...|
|29.812150344937198|  33.0|[0.01951,17.5,1.3...|
+------------------+------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 4.38582
