In [3]:
from pyspark.sql import SparkSession

In [5]:
# Start (or reuse) a Spark session registered under the application name 'Practice'.
session_builder = SparkSession.builder.appName('Practice')
spark = session_builder.getOrCreate()

In [6]:
# Read tips.csv, taking column names from the first row and letting Spark
# infer each column's type instead of loading everything as strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv('tips.csv')

In [7]:
# Print the first rows of the loaded DataFrame as a text table.
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [8]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [9]:
# List the column names of the loaded DataFrame.
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [10]:
# Handling Categorical Feature
from pyspark.ml.feature import StringIndexer

In [12]:
# A single column can be indexed with the singular arguments, e.g.
#   StringIndexer(inputCol='sex', outputCol='sex_inx')
# Here all four string columns are indexed in one pass; each input column
# is paired positionally with the '<name>_id' output column below.
categorical_cols = ['sex', 'smoker', 'day', 'time']
indexed_cols = ['sex_id', 'smoker_id', 'day_id', 'time_id']
indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)

In [13]:
# Fit the indexer on df to learn the string-to-index mapping, then apply
# that mapping to the same frame to append the numeric *_id columns.
indexer_model = indexer.fit(df)
trfm_df = indexer_model.transform(df)
trfm_df.show()

+----------+----+------+------+---+------+----+------+---------+------+-------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_id|smoker_id|day_id|time_id|
+----------+----+------+------+---+------+----+------+---------+------+-------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   1.0|      0.0|   1.0|    0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|   0.0|      0.0|   1.0|    0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|   1.0|      0.0|   1.0|    0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|   0.0|      0.0|   1.0|    0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|   0.0|      0.0|   1.0|    0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|   0.0|      0.0|   1.0|    0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|   0.0|      0.0|   1.0|    0.0|
|     14.78|3.23|  Male|    No|Sun|Dinne

In [14]:
# Confirm the four *_id index columns were appended after the original seven.
trfm_df.columns

['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_id',
 'smoker_id',
 'day_id',
 'time_id']

In [15]:
from pyspark.ml.feature import VectorAssembler

In [16]:
# Numeric inputs to pack, in this order, into the single 'ipFeatures' vector
# column. 'tip' is left out of this list.
feature_cols = ['total_bill', 'size', 'sex_id', 'smoker_id', 'day_id', 'time_id']
featureAssembler = VectorAssembler(inputCols=feature_cols, outputCol='ipFeatures')

In [17]:
# Run the assembler over the indexed frame; it appends the 'ipFeatures' vector column.
output = featureAssembler.transform(trfm_df)
# Preview the frame with the new vector column.
output.show()

+----------+----+------+------+---+------+----+------+---------+------+-------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_id|smoker_id|day_id|time_id|          ipFeatures|
+----------+----+------+------+---+------+----+------+---------+------+-------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   1.0|      0.0|   1.0|    0.0|[16.99,2.0,1.0,0....|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|[10.34,3.0,0.0,0....|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|[21.01,3.0,0.0,0....|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|   0.0|      0.0|   1.0|    0.0|[23.68,2.0,0.0,0....|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|   1.0|      0.0|   1.0|    0.0|[24.59,4.0,1.0,0....|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|   0.0|      0.0|   1.0|    0.0|[25.29,4.0,0.0,0....|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|   0.0|      0.0|   1.0|    0.0|[8.77,2.0,0.

In [18]:
# Keep only the assembled feature vector and the 'tip' column for modelling.
finel = output.select('ipFeatures', 'tip')

In [20]:
# Preview the two-column modelling frame (feature vector and tip).
finel.show()

+--------------------+----+
|          ipFeatures| tip|
+--------------------+----+
|[16.99,2.0,1.0,0....|1.01|
|[10.34,3.0,0.0,0....|1.66|
|[21.01,3.0,0.0,0....| 3.5|
|[23.68,2.0,0.0,0....|3.31|
|[24.59,4.0,1.0,0....|3.61|
|[25.29,4.0,0.0,0....|4.71|
|[8.77,2.0,0.0,0.0...| 2.0|
|[26.88,4.0,0.0,0....|3.12|
|[15.04,2.0,0.0,0....|1.96|
|[14.78,2.0,0.0,0....|3.23|
|[10.27,2.0,0.0,0....|1.71|
|[35.26,4.0,1.0,0....| 5.0|
|[15.42,2.0,0.0,0....|1.57|
|[18.43,4.0,0.0,0....| 3.0|
|[14.83,2.0,1.0,0....|3.02|
|[21.58,2.0,0.0,0....|3.92|
|[10.33,3.0,1.0,0....|1.67|
|[16.29,3.0,0.0,0....|3.71|
|[16.97,3.0,1.0,0....| 3.5|
|(6,[0,1],[20.65,3...|3.35|
+--------------------+----+
only showing top 20 rows



In [21]:
from pyspark.ml.regression import LinearRegression

In [23]:
# Hold out roughly 25% of the rows for evaluation. Without an explicit seed,
# randomSplit picks a new random one on every call, so the split — and the
# coefficients and metrics derived from it — would change on each re-run.
# Pinning the seed keeps the results repeatable.
train_data, test_data = finel.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so `regreressor` only ever
# refers to the fitted model that the following cells inspect.
lr_estimator = LinearRegression(featuresCol='ipFeatures', labelCol='tip')
regreressor = lr_estimator.fit(train_data)

In [25]:
# Learned weights of the fitted model; the output has one entry per column
# given to the assembler.
regreressor.coefficients

DenseVector([0.0869, 0.211, -0.0777, -0.1879, 0.1234, -0.2137])

In [26]:
# Learned intercept term of the fitted model.
regreressor.intercept

0.7506843400532649

In [27]:
# Evaluate the fitted model on the held-out split; the returned summary
# exposes the predictions and error metrics read in the following cells.
result = regreressor.evaluate(test_data)

In [28]:
# Show the test rows with the model's 'prediction' column next to the actual tip.
result.predictions.show()



+--------------------+----+------------------+
|          ipFeatures| tip|        prediction|
+--------------------+----+------------------+
|(6,[0,1],[13.28,2...|2.72|2.3272236568266216|
|(6,[0,1],[16.04,3...|2.24| 2.778152255571271|
|(6,[0,1],[16.31,3...| 2.0| 2.801627020030897|
|(6,[0,1],[17.59,3...|2.64|2.9129147922839373|
|(6,[0,1],[17.78,2...|3.27| 2.718469731153716|
|(6,[0,1],[21.7,2.0])| 4.3|3.0592885336786515|
|(6,[0,1],[48.27,4...|6.73| 5.791307923298028|
|[7.51,2.0,0.0,0.0...| 2.0|1.8585825496907775|
|[7.56,2.0,0.0,0.0...|1.44|1.8629297282944122|
|[9.94,2.0,0.0,0.0...|1.56|2.1602046688081176|
|[10.09,2.0,1.0,1....| 2.0|1.9406417721288283|
|[10.27,2.0,0.0,0....|1.71|2.1888960475921047|
|[10.29,2.0,1.0,0....| 2.6| 2.112886478217417|
|[10.59,2.0,1.0,1....|1.61| 1.827717711737335|
|[10.65,2.0,1.0,0....| 1.5|2.0538369251828756|
|[12.16,2.0,0.0,1....| 2.2| 2.198363407135433|
|[12.46,2.0,0.0,0....| 1.5| 2.626047555839836|
|[12.54,2.0,0.0,0....| 2.5|2.3862579561971056|
|[12.6,2.0,0.

In [29]:
# R^2 (coefficient of determination) on the test split.
result.r2

0.556056329073833

In [30]:
# Mean absolute error on the test split.
result.meanAbsoluteError

0.7462382570443066

In [31]:
# Mean squared error on the test split.
result.meanSquaredError

1.0913192677714059