# PySpark Multiple Linear Regression Implementation
## Starting the Spark Session

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Liner Regression').getOrCreate()

In [3]:
spark

# Reading Data Set

In [5]:
df_spark = spark.read.csv('Tips.csv',header=True,inferSchema=True)
df_spark.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [7]:
df_spark.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [10]:
df_spark.describe()

DataFrame[summary: string, total_bill: string, tip: string, sex: string, smoker: string, day: string, time: string, size: string]

In [11]:
df_spark.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

## Data Pre-Processing

In [13]:
## Handling categorial feature
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCols=['sex','smoker','day','time'],
                        outputCols=['sex_indexed','smoker_indexed','day_indexed','time_indexed'])
df_r = indexer.fit(df_spark).transform(df_spark)
df_r.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [14]:
# Grouping featues
from pyspark.ml.feature import VectorAssembler

feature_assembler = VectorAssembler(inputCols=['tip','size','sex_indexed','smoker_indexed','day_indexed','time_indexed'],
                        outputCol='Independent_feature')

output = feature_assembler.transform(df_r)

In [15]:
output.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed| Independent_feature|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|[1.01,2.0,1.0,0.0...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|[1.66,3.0,0.0,0.0...|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|[3.5,3.0,0.0,0.0,...|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|[3.31,2.0,0.0,0.0...|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|[3.61,4.0,1.0,0.0...|
|     25.29|4.71|  Male|    No|S

In [17]:
output.select('Independent_feature').show()

+--------------------+
| Independent_feature|
+--------------------+
|[1.01,2.0,1.0,0.0...|
|[1.66,3.0,0.0,0.0...|
|[3.5,3.0,0.0,0.0,...|
|[3.31,2.0,0.0,0.0...|
|[3.61,4.0,1.0,0.0...|
|[4.71,4.0,0.0,0.0...|
|[2.0,2.0,0.0,0.0,...|
|[3.12,4.0,0.0,0.0...|
|[1.96,2.0,0.0,0.0...|
|[3.23,2.0,0.0,0.0...|
|[1.71,2.0,0.0,0.0...|
|[5.0,4.0,1.0,0.0,...|
|[1.57,2.0,0.0,0.0...|
|[3.0,4.0,0.0,0.0,...|
|[3.02,2.0,1.0,0.0...|
|[3.92,2.0,0.0,0.0...|
|[1.67,3.0,1.0,0.0...|
|[3.71,3.0,0.0,0.0...|
|[3.5,3.0,1.0,0.0,...|
|(6,[0,1],[3.35,3.0])|
+--------------------+
only showing top 20 rows



In [18]:
## Final_data
final_data = output.select('Independent_feature','total_bill')
final_data.show()

+--------------------+----------+
| Independent_feature|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
|[3.31,2.0,0.0,0.0...|     23.68|
|[3.61,4.0,1.0,0.0...|     24.59|
|[4.71,4.0,0.0,0.0...|     25.29|
|[2.0,2.0,0.0,0.0,...|      8.77|
|[3.12,4.0,0.0,0.0...|     26.88|
|[1.96,2.0,0.0,0.0...|     15.04|
|[3.23,2.0,0.0,0.0...|     14.78|
|[1.71,2.0,0.0,0.0...|     10.27|
|[5.0,4.0,1.0,0.0,...|     35.26|
|[1.57,2.0,0.0,0.0...|     15.42|
|[3.0,4.0,0.0,0.0,...|     18.43|
|[3.02,2.0,1.0,0.0...|     14.83|
|[3.92,2.0,0.0,0.0...|     21.58|
|[1.67,3.0,1.0,0.0...|     10.33|
|[3.71,3.0,0.0,0.0...|     16.29|
|[3.5,3.0,1.0,0.0,...|     16.97|
|(6,[0,1],[3.35,3.0])|     20.65|
+--------------------+----------+
only showing top 20 rows



# Model Building

In [20]:
from pyspark.ml.regression import LinearRegression

# Train Test Split
train_data, test_data = final_data.randomSplit([0.75,0.25])
regression = LinearRegression(featuresCol='Independent_feature',labelCol='total_bill')
regression = regression.fit(train_data)

In [22]:
regression.coefficients

DenseVector([3.4074, 3.2967, -0.702, 2.6008, -0.2327, -1.2318])

In [23]:
regression.intercept

0.9718060540241152

In [26]:
## Prediction
prediction = regression.evaluate(test_data)

In [27]:
# Final Prediction Column
prediction.predictions.show()

+--------------------+----------+------------------+
| Independent_feature|total_bill|        prediction|
+--------------------+----------+------------------+
| (6,[0,1],[2.0,2.0])|     12.69|  14.3798439718735|
|(6,[0,1],[2.01,2.0])|     20.23|14.413917572494936|
|(6,[0,1],[2.31,3.0])|     18.69|18.732784487918984|
|(6,[0,1],[3.27,2.0])|     17.78|  18.7071912507961|
|(6,[0,1],[3.35,3.0])|     20.65|22.276438952548514|
| (6,[0,1],[3.6,3.0])|     24.06| 23.12827896808446|
|(6,[0,1],[3.76,2.0])|     18.24| 20.37679768124655|
| (6,[0,1],[5.0,3.0])|     31.27| 27.89858305508575|
|(6,[0,1],[5.92,3.0])|     29.03| 31.03335431225803|
|(6,[0,1],[7.58,4.0])|     39.42| 39.98623091219761|
|[1.36,3.0,1.0,0.0...|     18.64|13.096656176293882|
|[1.44,2.0,0.0,0.0...|      7.56|10.774560383871886|
|[1.5,2.0,0.0,0.0,...|     19.08|10.979001987600514|
|[1.5,2.0,0.0,1.0,...|     15.69|15.044282648296708|
|[1.5,2.0,0.0,1.0,...|     12.03|14.578910326856997|
|[1.5,2.0,1.0,0.0,...|     26.41|11.9741896414

In [28]:
prediction.r2

0.4476052662356732

In [30]:
prediction.meanAbsoluteError

3.821602421653584