In [8]:
from pyspark.sql import SparkSession

# Get the current Spark session, or create one whose application name is
# 'Practice' if none exists yet.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [9]:
# Load test4.csv, using its first line as the column names and letting Spark
# infer each column's type from the values.
training = spark.read.csv(
    'test4.csv',
    header=True,
    inferSchema=True,
)
# Print the loaded rows as a text table.
training.show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|  ali| 23|         5| 50000|
|abbas| 21|         4| 40000|
|yasin| 20|         5| 30000|
| wali| 23|         6| 20000|
|jawed| 30|         7| 25000|
|shura| 40|         9| 60000|
| bula| 60|        18| 90000|
+-----+---+----------+------+



In [13]:
from pyspark.ml.feature import VectorAssembler

# Configure an assembler that packs the Age and Experience columns into one
# vector column named 'Independent Features'.
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Features',
)

In [12]:
# Run the assembler over the loaded DataFrame; the result is stored in
# `output` and printed so the added 'Independent Features' column can be
# inspected next to the original columns.
output = featureassembler.transform(training)
output.show()

+-----+---+----------+------+--------------------+
| Name|Age|Experience|Salary|Independent Features|
+-----+---+----------+------+--------------------+
|  ali| 23|         5| 50000|          [23.0,5.0]|
|abbas| 21|         4| 40000|          [21.0,4.0]|
|yasin| 20|         5| 30000|          [20.0,5.0]|
| wali| 23|         6| 20000|          [23.0,6.0]|
|jawed| 30|         7| 25000|          [30.0,7.0]|
|shura| 40|         9| 60000|          [40.0,9.0]|
| bula| 60|        18| 90000|         [60.0,18.0]|
+-----+---+----------+------+--------------------+



In [17]:
# Keep only the two columns the regression needs: the assembled feature
# vector and the Salary column used as the label.
finalized_data = output.select(
    'Independent Features',
    'Salary',
)
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [23.0,5.0]| 50000|
|          [21.0,4.0]| 40000|
|          [20.0,5.0]| 30000|
|          [23.0,6.0]| 20000|
|          [30.0,7.0]| 25000|
|          [40.0,9.0]| 60000|
|         [60.0,18.0]| 90000|
+--------------------+------+



In [27]:
from pyspark.ml.regression import LinearRegression

# Train/test split: about 75% of the rows for fitting, the rest held out for
# evaluation. The split sizes are only approximate on a frame this small.
# The fixed seed makes the partition repeatable, so the coefficients and error
# metrics computed further down do not change from one run to the next.
training_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)
# Show which rows ended up in the training split.
training_data.show()

# 'Independent Features' is the predictor vector and 'Salary' is the value
# being predicted.
lr_estimator = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
)
# The unfitted estimator and the fitted model get separate names so that one
# name never refers to two different kinds of object. `regressor` is the
# fitted model.
regressor = lr_estimator.fit(training_data)

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [21.0,4.0]| 40000|
|          [23.0,5.0]| 50000|
|          [23.0,6.0]| 20000|
|          [30.0,7.0]| 25000|
|          [40.0,9.0]| 60000|
|         [60.0,18.0]| 90000|
+--------------------+------+



In [30]:
# Learned weights of the fitted model, one per entry of the
# 'Independent Features' vector (assembled from Age, then Experience).
regressor.coefficients

DenseVector([2353.7647, -2695.1491])

In [29]:
# Constant (bias) term of the fitted linear model.
regressor.intercept

-7771.555795437236

In [31]:
# Evaluate the fitted model on the held-out rows, then print each row's
# actual Salary next to the model's prediction.
pred_result = regressor.evaluate(test_data)
pred_result.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [20.0,5.0]| 30000|25827.992173796512|
+--------------------+------+------------------+



In [32]:
# Error metrics on the held-out rows, displayed as a
# (mean absolute error, mean squared error) pair.
(pred_result.meanAbsoluteError, pred_result.meanSquaredError)

(4172.007826203488, 17405649.30190315)