In [0]:
# File location and type
file_location = "/FileStore/tables/tips.csv"
file_type = "csv"
df=spark.read.csv(file_location,header=True,inferSchema=True)

In [0]:
df.head(5)

Out[4]: [Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=10.34, tip=1.66, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=21.01, tip=3.5, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=23.68, tip=3.31, sex='Male', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=24.59, tip=3.61, sex='Female', smoker='No', day='Sun', time='Dinner', size=4)]

In [0]:
df.describe()

Out[27]: DataFrame[summary: string, total_bill: string, tip: string, sex: string, smoker: string, day: string, time: string, size: string]

In [0]:
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [0]:
df.columns

Out[8]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
indexer=StringIndexer(inputCols=[ 'sex', 'smoker', 'day', 'time'],outputCols=["Sex","Smoker","Day","Time"])
dfnew=indexer.fit(df).transform(df)
dfnew.show()

+----------+----+---+------+---+----+----+
|total_bill| tip|Sex|Smoker|Day|Time|size|
+----------+----+---+------+---+----+----+
|     16.99|1.01|1.0|   0.0|1.0| 0.0|   2|
|     10.34|1.66|0.0|   0.0|1.0| 0.0|   3|
|     21.01| 3.5|0.0|   0.0|1.0| 0.0|   3|
|     23.68|3.31|0.0|   0.0|1.0| 0.0|   2|
|     24.59|3.61|1.0|   0.0|1.0| 0.0|   4|
|     25.29|4.71|0.0|   0.0|1.0| 0.0|   4|
|      8.77| 2.0|0.0|   0.0|1.0| 0.0|   2|
|     26.88|3.12|0.0|   0.0|1.0| 0.0|   4|
|     15.04|1.96|0.0|   0.0|1.0| 0.0|   2|
|     14.78|3.23|0.0|   0.0|1.0| 0.0|   2|
|     10.27|1.71|0.0|   0.0|1.0| 0.0|   2|
|     35.26| 5.0|1.0|   0.0|1.0| 0.0|   4|
|     15.42|1.57|0.0|   0.0|1.0| 0.0|   2|
|     18.43| 3.0|0.0|   0.0|1.0| 0.0|   4|
|     14.83|3.02|1.0|   0.0|1.0| 0.0|   2|
|     21.58|3.92|0.0|   0.0|1.0| 0.0|   2|
|     10.33|1.67|1.0|   0.0|1.0| 0.0|   3|
|     16.29|3.71|0.0|   0.0|1.0| 0.0|   3|
|     16.97| 3.5|1.0|   0.0|1.0| 0.0|   3|
|     20.65|3.35|0.0|   0.0|0.0| 0.0|   3|
+----------

In [0]:
from pyspark.ml.feature import VectorAssembler


In [0]:
assembler=VectorAssembler(inputCols=["tip","Sex","Smoker","Day","Time","size"],outputCol="Features")
dfnew=assembler.transform(dfnew)

In [0]:
dfnew=dfnew.select("total_bill","Features")
dfnew.show()

+----------+--------------------+
|total_bill|            Features|
+----------+--------------------+
|     16.99|[1.01,1.0,0.0,1.0...|
|     10.34|[1.66,0.0,0.0,1.0...|
|     21.01|[3.5,0.0,0.0,1.0,...|
|     23.68|[3.31,0.0,0.0,1.0...|
|     24.59|[3.61,1.0,0.0,1.0...|
|     25.29|[4.71,0.0,0.0,1.0...|
|      8.77|[2.0,0.0,0.0,1.0,...|
|     26.88|[3.12,0.0,0.0,1.0...|
|     15.04|[1.96,0.0,0.0,1.0...|
|     14.78|[3.23,0.0,0.0,1.0...|
|     10.27|[1.71,0.0,0.0,1.0...|
|     35.26|[5.0,1.0,0.0,1.0,...|
|     15.42|[1.57,0.0,0.0,1.0...|
|     18.43|[3.0,0.0,0.0,1.0,...|
|     14.83|[3.02,1.0,0.0,1.0...|
|     21.58|[3.92,0.0,0.0,1.0...|
|     10.33|[1.67,1.0,0.0,1.0...|
|     16.29|[3.71,0.0,0.0,1.0...|
|     16.97|[3.5,1.0,0.0,1.0,...|
|     20.65|(6,[0,5],[3.35,3.0])|
+----------+--------------------+
only showing top 20 rows



In [0]:
from pyspark.ml.regression import LinearRegression
traindata,testdata=dfnew.randomSplit([0.75,0.25])

In [0]:
regressor=LinearRegression(featuresCol='Features', labelCol='total_bill')
regressor=regressor.fit(traindata)

In [0]:
pred=regressor.evaluate(testdata)
pred.predictions.show()

+----------+--------------------+------------------+
|total_bill|            Features|        prediction|
+----------+--------------------+------------------+
|      5.75|[1.0,1.0,1.0,3.0,...|12.446957628979021|
|      8.35|[1.5,1.0,0.0,2.0,...|10.557164470795547|
|       9.6|[4.0,1.0,1.0,1.0,...|22.278532493835574|
|      9.78|[1.73,0.0,0.0,2.0...|12.414378728744122|
|      9.94|[1.56,0.0,0.0,1.0...|13.087747396554589|
|     10.33|[2.0,1.0,0.0,2.0,...|12.103393596405246|
|     11.17|[1.5,1.0,0.0,2.0,...|10.557164470795547|
|     11.35|[2.5,1.0,1.0,3.0,...| 17.08564500580812|
|     11.61|(6,[0,5],[3.39,2.0])|19.024046051885268|
|     11.87|[1.63,1.0,0.0,2.0...|10.959184043454067|
|     12.02|(6,[0,5],[1.97,2.0])|14.632755335153718|
|     12.26|[2.0,1.0,0.0,2.0,...|12.103393596405246|
|     12.46|[1.5,0.0,0.0,3.0,...| 12.34799979028307|
|      13.0|[2.0,1.0,1.0,2.0,...| 14.89452942087901|
|     13.03|[2.0,0.0,0.0,2.0,...| 13.24934245657336|
|     13.51|[2.0,0.0,1.0,2.0,...|16.0404782810

In [0]:
pred.r2,pred.meanAbsoluteError,pred.meanSquaredError

Out[26]: (0.602425802974444, 3.805543310040635, 29.281741547401662)