In [0]:
from pyspark.sql import SparkSession
file_location = "/FileStore/tables/tips.csv"

spark = SparkSession.builder.appName('linear_regression_ml').getOrCreate()
tips_csv = spark.read.csv(file_location, header=True, inferSchema=True)
tips_csv.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [0]:
tips_csv.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [0]:
# In the above dataframe, independent features are -> tip, sex, smoker, day, time, size & dependent features are -> total_bill only.
# Now handling categorical features ->
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCols=["sex", "smoker", "day", "time"], outputCols=["sex_indexed", "smoker_indexed", "day_indexed", "time_indexed"])

tips_csv_r = indexer.fit(tips_csv).transform(tips_csv)
tips_csv_r.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [0]:
tips_csv_r.columns

Out[6]: ['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_indexed',
 'smoker_indexed',
 'day_indexed',
 'time_indexed']

In [0]:
from pyspark.ml.feature import VectorAssembler

featureAssembler = VectorAssembler(inputCols=["tip", "size", "sex_indexed", "smoker_indexed", "day_indexed", "time_indexed"], outputCol="Independent Features")
output = featureAssembler.transform(tips_csv_r)
finalized_data = output.select('Independent Features', 'total_bill')

In [0]:
from pyspark.ml.regression import LinearRegression

train_data, test_data = finalized_data.randomSplit([0.75, 0.25])
regressor = LinearRegression(featuresCol="Independent Features", labelCol="total_bill")
predicted_data = regressor.fit(train_data).evaluate(test_data)

In [0]:
predicted_data.predictions.show()

+--------------------+----------+------------------+
|Independent Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.75,2.0])|     17.82|13.625922479762599|
| (6,[0,1],[3.0,4.0])|     20.45|23.953946986119902|
| (6,[0,1],[5.0,3.0])|     31.27| 26.96418930591419|
|(6,[0,1],[6.73,4.0])|     48.27|35.569173484096694|
|[1.0,1.0,1.0,1.0,...|      3.07| 9.627049593532771|
|[1.0,2.0,1.0,1.0,...|      5.75| 13.59554392080264|
|[1.1,2.0,1.0,1.0,...|      12.9|13.156210932116833|
|[1.44,2.0,0.0,0.0...|      7.56|12.132906310176326|
|[1.5,2.0,1.0,0.0,...|     11.17|11.281190454727888|
|[1.58,2.0,0.0,1.0...|     13.42|15.412056486382623|
|[1.63,2.0,1.0,0.0...|     11.87|11.686010681198928|
|[1.64,2.0,0.0,1.0...|     15.36|15.876327832816232|
|[1.67,3.0,1.0,0.0...|     10.33|15.806251932429273|
|[1.73,2.0,0.0,0.0...|      9.78|13.035966815380954|
|[1.83,1.0,1.0,0.0...|     10.07| 9.091049865240345|
|[1.92,1.0,0.0,1.0...|      8.58|13.2530559143

In [0]:
# Performance Metrics
print((predicted_data.r2, predicted_data.meanAbsoluteError, predicted_data.meanSquaredError))

(0.5900755234997408, 4.816564021279628, 41.10285366699412)
