In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session for this notebook under the app name 'PySparkML'.
spark = (
    SparkSession.builder
    .appName('PySparkML')
    .getOrCreate()
)

In [3]:
## Read dataset
## header=True takes the column names from the first line of the CSV;
## inferSchema=True lets Spark derive the column types from the data.

training = spark.read.csv(
    'employee-salary.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the loaded rows as a text table to eyeball the raw data.
training.show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|     Emma| 27| 60000|         3|
|  Michael| 30| 45000|         7|
|   Sophia| 28| 55000|         6|
|   Oliver| 35| 70000|        10|
|      Ava| 25| 45000|         2|
|  William| 40| 65000|        12|
|Charlotte| 22| 40000|         1|
|    David| 45| 80000|        13|
+---------+---+------+----------+



In [5]:
# Print each column's name, type and nullability to check the loaded schema.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [6]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Salary', 'Experience']

## In typical ML workflows we do a train-test split on separate feature columns, but in PySpark the input features must first be grouped into a single column, which acts as the independent feature

For example:

[Age, Experience] ---> new feature ---> independent feature

The grouped columns form one new feature column, which is called the independent feature.

In [7]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the numeric input columns into one vector column.
# The output column name is reused by the later select and by the regressor,
# so it must stay in sync with those cells.
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol='independent feature',
)

In [8]:
# Apply the assembler: keeps the original columns and appends the
# 'independent feature' vector column built from Age and Experience.
output = featureassembler.transform(training)

In [9]:
# Display the assembled DataFrame to confirm the new vector column was added.
output.show()

+---------+---+------+----------+-------------------+
|     Name|Age|Salary|Experience|independent feature|
+---------+---+------+----------+-------------------+
|     John| 25| 50000|         5|         [25.0,5.0]|
|     Emma| 27| 60000|         3|         [27.0,3.0]|
|  Michael| 30| 45000|         7|         [30.0,7.0]|
|   Sophia| 28| 55000|         6|         [28.0,6.0]|
|   Oliver| 35| 70000|        10|        [35.0,10.0]|
|      Ava| 25| 45000|         2|         [25.0,2.0]|
|  William| 40| 65000|        12|        [40.0,12.0]|
|Charlotte| 22| 40000|         1|         [22.0,1.0]|
|    David| 45| 80000|        13|        [45.0,13.0]|
+---------+---+------+----------+-------------------+



In [10]:
# List the column names after assembly (original columns plus the vector column).
output.columns

['Name', 'Age', 'Salary', 'Experience', 'independent feature']

In [11]:
# Keep only what the model needs: the feature vector (input) and Salary (label).
finalized_data = output.select(
    "independent feature",
    "Salary",
)

In [12]:
# Display the two-column modelling dataset (feature vector and Salary label).
finalized_data.show()

+-------------------+------+
|independent feature|Salary|
+-------------------+------+
|         [25.0,5.0]| 50000|
|         [27.0,3.0]| 60000|
|         [30.0,7.0]| 45000|
|         [28.0,6.0]| 55000|
|        [35.0,10.0]| 70000|
|         [25.0,2.0]| 45000|
|        [40.0,12.0]| 65000|
|         [22.0,1.0]| 40000|
|        [45.0,13.0]| 80000|
+-------------------+------+



In [14]:
from pyspark.ml.regression import LinearRegression

## Train test split
## 75% of the rows are used for training and 25% are held out for evaluation.
## randomSplit assigns rows at random, so a fixed seed is passed to keep the
## split -- and therefore the fitted coefficients and the error metrics in the
## cells below -- the same from one run of the notebook to the next.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

## Unfitted linear-regression estimator: reads its inputs from the
## 'independent feature' vector column and its target from 'Salary'.
regressor = LinearRegression(featuresCol='independent feature', labelCol='Salary')

In [15]:
# Fit the linear regression on the training split.
# This rebinds `regressor` from the unfitted estimator to the fitted model,
# which the following cells read coefficients and the intercept from.
# NOTE(review): because the name is reused, re-running this cell without first
# re-running the cell that creates the estimator will likely fail -- confirm,
# and consider a separate name for the fitted model.
regressor=regressor.fit(train_data)

In [16]:
## Coefficients
## One fitted weight per input feature, in the assembler's input order (Age, Experience).
regressor.coefficients

DenseVector([1038.8453, 1102.9936])

In [18]:
## Intercept
## Constant term of the fitted linear model.

regressor.intercept

15956.878118321734

In [19]:
## Prediction
## Evaluate the fitted model on the held-out test split. The returned object
## exposes the per-row predictions and the error metrics read in the next cells.

pred_results = regressor.evaluate(test_data)

In [20]:
# Show the test rows with the model's prediction next to the actual Salary.
pred_results.predictions.show()

+-------------------+------+------------------+
|independent feature|Salary|        prediction|
+-------------------+------+------------------+
|         [25.0,2.0]| 45000|44133.998574482495|
|         [27.0,3.0]| 60000| 47314.68282252239|
+-------------------+------+------------------+



In [21]:
# Error metrics on the test split, displayed as a (MAE, MSE) tuple.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(6775.659301497559, 80833615.18110348)