### Dependencies

In [42]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

### Dataset

#### Initialize Spark Session

In [18]:
# Create the Spark session for this notebook, or reuse one that already
# exists under the same builder; the application name is 'Practice'.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

# Bare last expression: display the session object as the cell output.
spark

In [35]:
# Load the CSV with its first line treated as the header and with column
# types inferred from the data rather than defaulting to strings.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('test2.csv')
)

# Preview the loaded rows.
training.show()

+----+---+----------+----+
|Name|Age|Experience|Wage|
+----+---+----------+----+
|   A| 20|         3|  30|
|   B| 30|         2|  27|
|   C| 40|         5|  45|
|   D| 23|         2|  28|
|   E| 43|         6|  75|
|   F| 27|         2|  32|
+----+---+----------+----+



In [36]:
# Print each column's name, inferred type and nullability for `training`.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Wage: integer (nullable = true)



In [37]:
# Assembler that packs the numeric predictor columns into one vector
# column, which is the input format the regressor is configured to read.
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Variables',
)

In [38]:
# Apply the assembler to the loaded data; the result keeps the original
# columns and adds the 'Independent Variables' vector column.
output = featureassembler.transform(training)

In [39]:
# Preview the assembled frame to check the new vector column.
output.show()

+----+---+----------+----+---------------------+
|Name|Age|Experience|Wage|Independent Variables|
+----+---+----------+----+---------------------+
|   A| 20|         3|  30|           [20.0,3.0]|
|   B| 30|         2|  27|           [30.0,2.0]|
|   C| 40|         5|  45|           [40.0,5.0]|
|   D| 23|         2|  28|           [23.0,2.0]|
|   E| 43|         6|  75|           [43.0,6.0]|
|   F| 27|         2|  32|           [27.0,2.0]|
+----+---+----------+----+---------------------+



In [41]:
# Keep only what the model needs: the feature vector and the label (Wage).
final_data = output.select('Independent Variables', 'Wage')

# Preview the modeling frame.
final_data.show()

+---------------------+----+
|Independent Variables|Wage|
+---------------------+----+
|           [20.0,3.0]|  30|
|           [30.0,2.0]|  27|
|           [40.0,5.0]|  45|
|           [23.0,2.0]|  28|
|           [43.0,6.0]|  75|
|           [27.0,2.0]|  32|
+---------------------+----+



### Modeling

In [43]:
# Hold out roughly 20% of the rows for testing. randomSplit assigns rows
# randomly, so without a seed the split (and therefore the fitted model and
# the metrics reported below) changes on every run. Pinning the seed keeps
# the notebook reproducible under Restart & Run All.
train, test = final_data.randomSplit([0.80, 0.20], seed=42)

In [44]:
# Linear regression estimator: predicts 'Wage' from the assembled
# 'Independent Variables' vector column.
regressor = LinearRegression(
    featuresCol='Independent Variables',
    labelCol='Wage',
)

In [45]:
# Fit the linear regression on the training split.
model = regressor.fit(train)

In [48]:
# Score the fitted model on the held-out split. NOTE: despite its name,
# `predictions` holds the evaluation result returned by evaluate(); the
# per-row prediction table is its `.predictions` attribute, and the error
# metrics printed later are read from the same object.
predictions = model.evaluate(test)

# Show the features, the actual Wage and the predicted value per test row.
predictions.predictions.show()

+---------------------+----+-----------------+
|Independent Variables|Wage|       prediction|
+---------------------+----+-----------------+
|           [30.0,2.0]|  27|35.77103960396052|
|           [40.0,5.0]|  45|65.45792079207926|
+---------------------+----+-----------------+



In [49]:
# Report the hold-out error metrics on one line, space-separated:
# mean absolute error first, then mean squared error.
print(
    predictions.meanAbsoluteError,
    predictions.meanSquaredError,
)

14.61448019801989 247.72882943461647
