In [1]:
from pyspark.sql import SparkSession
# Obtain a Spark session for this notebook under a fixed application name.
spark = (
    SparkSession.builder
    .appName("MachineLearningModel")
    .getOrCreate()
)
spark  # last expression: show the session object as the cell output

In [2]:
# Read the CSV, taking column names from the first row and letting Spark
# infer each column's type instead of reading everything as strings.
data = spark.read.csv(
    "data/experienceSalary.csv",
    header=True,
    inferSchema=True,
)
data  # last expression: show the DataFrame's column/type summary

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [4]:
# Print the loaded rows as a text table to sanity-check the data.
data.show() 

+----------------+---+----------+------+
|            Name|Age|Experience|Salary|
+----------------+---+----------+------+
|        John Doe| 30|         5| 70000|
|      Jane Smith| 28|         3| 60000|
| Michael Johnson| 35|         8| 90000|
|     Emily Davis| 25|         2| 50000|
| Christopher Lee| 40|        12|120000|
|  Samantha White| 32|         6| 75000|
|      Daniel Kim| 28|         4| 65000|
|Amanda Rodriguez| 27|         3| 58000|
|     David Brown| 38|        10|100000|
|     Emma Taylor| 29|         5| 72000|
|    Joshua Moore| 33|         7| 80000|
|   Olivia Harris| 26|         3| 55000|
|  Matthew Turner| 45|        15|130000|
|   Sophia Miller| 31|         6| 72000|
|   Andrew Wilson| 34|         8| 85000|
|  Grace Martinez| 23|         1| 48000|
|  Robert Jackson| 36|         9| 95000|
|    Ava Anderson| 30|         4| 68000|
|    Ethan Taylor| 29|         6| 72000|
|   Isabella Hall| 27|         5| 70000|
+----------------+---+----------+------+



In [4]:
# Print every column's name, inferred type, and nullability.
data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



## Starting the ML Algorithm

#### 1- Group the input features

In [5]:
# Names of the columns that will be combined into the model's feature vector.
inputColumns = ["Age", "Experience"]

#### 2- Import and group the input features that we will use

In [6]:
from pyspark.ml.feature import VectorAssembler
# Assembler that packs the selected input columns into a single vector column
# named "IndependentFeatures".
fa = VectorAssembler(
    inputCols=inputColumns,
    outputCol="IndependentFeatures",
)
fa  # last expression: show the assembler's identifier

VectorAssembler_d630e54c4889

In [8]:
# Apply the assembler: the result keeps the original columns and gains an
# "IndependentFeatures" vector column built from Age and Experience.
output = fa.transform(data)
output.show()

+----------------+---+----------+------+-------------------+
|            Name|Age|Experience|Salary|IndependentFeatures|
+----------------+---+----------+------+-------------------+
|        John Doe| 30|         5| 70000|         [30.0,5.0]|
|      Jane Smith| 28|         3| 60000|         [28.0,3.0]|
| Michael Johnson| 35|         8| 90000|         [35.0,8.0]|
|     Emily Davis| 25|         2| 50000|         [25.0,2.0]|
| Christopher Lee| 40|        12|120000|        [40.0,12.0]|
|  Samantha White| 32|         6| 75000|         [32.0,6.0]|
|      Daniel Kim| 28|         4| 65000|         [28.0,4.0]|
|Amanda Rodriguez| 27|         3| 58000|         [27.0,3.0]|
|     David Brown| 38|        10|100000|        [38.0,10.0]|
|     Emma Taylor| 29|         5| 72000|         [29.0,5.0]|
|    Joshua Moore| 33|         7| 80000|         [33.0,7.0]|
|   Olivia Harris| 26|         3| 55000|         [26.0,3.0]|
|  Matthew Turner| 45|        15|130000|        [45.0,15.0]|
|   Sophia Miller| 31|  

In [9]:
# Keep only the two columns the regression needs: the assembled feature
# vector and the Salary column that is used as the label.
final_data = output.select("IndependentFeatures", "Salary")
final_data.show()

+-------------------+------+
|IndependentFeatures|Salary|
+-------------------+------+
|         [30.0,5.0]| 70000|
|         [28.0,3.0]| 60000|
|         [35.0,8.0]| 90000|
|         [25.0,2.0]| 50000|
|        [40.0,12.0]|120000|
|         [32.0,6.0]| 75000|
|         [28.0,4.0]| 65000|
|         [27.0,3.0]| 58000|
|        [38.0,10.0]|100000|
|         [29.0,5.0]| 72000|
|         [33.0,7.0]| 80000|
|         [26.0,3.0]| 55000|
|        [45.0,15.0]|130000|
|         [31.0,6.0]| 72000|
|         [34.0,8.0]| 85000|
|         [23.0,1.0]| 48000|
|         [36.0,9.0]| 95000|
|         [30.0,4.0]| 68000|
|         [29.0,6.0]| 72000|
|         [27.0,5.0]| 70000|
+-------------------+------+



### Split the Data and Train the Model

In [10]:
from pyspark.ml.regression import LinearRegression

In [11]:
# Randomly split the rows into roughly 75% training and 25% test data.
# A fixed seed is passed so the same split (and therefore the same fitted
# model and metrics) is produced when the notebook is re-run.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to the fitted model, which the following cells inspect and evaluate.
lr_estimator = LinearRegression(featuresCol='IndependentFeatures', labelCol='Salary')
regressor = lr_estimator.fit(train_data)
regressor  # last expression: show the fitted model's summary line

LinearRegressionModel: uid=LinearRegression_db026570f835, numFeatures=2

In [22]:
# Print the training rows, then the test rows. The calls are separate
# statements: joining them with a comma builds a tuple of their None return
# values, which the notebook would echo as a stray "(None, None)" output.
train_data.show()
test_data.show()

+-------------------+------+
|IndependentFeatures|Salary|
+-------------------+------+
|         [23.0,1.0]| 48000|
|         [26.0,3.0]| 55000|
|         [27.0,3.0]| 58000|
|         [28.0,4.0]| 65000|
|         [29.0,5.0]| 72000|
|         [29.0,6.0]| 72000|
|         [30.0,4.0]| 68000|
|         [30.0,5.0]| 70000|
|         [33.0,7.0]| 80000|
|         [34.0,8.0]| 85000|
|         [35.0,8.0]| 90000|
|         [36.0,9.0]| 95000|
|        [38.0,10.0]|100000|
|        [40.0,12.0]|120000|
+-------------------+------+

+-------------------+------+
|IndependentFeatures|Salary|
+-------------------+------+
|         [25.0,2.0]| 50000|
|         [27.0,5.0]| 70000|
|         [28.0,3.0]| 60000|
|         [31.0,6.0]| 72000|
|         [32.0,6.0]| 75000|
|        [45.0,15.0]|130000|
+-------------------+------+



(None, None)

In [23]:
# Fitted weights of the linear model: one value per entry of the feature
# vector (Age first, then Experience).
regressor.coefficients

DenseVector([1261.7762, 4314.3855])

In [24]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

11329.9460146008

In [27]:
# Evaluate the fitted model on the held-out test rows. The returned summary
# exposes a predictions DataFrame that places a "prediction" column next to
# the true Salary for each test row.
# NOTE(review): "predected" is a typo for "predicted"; the name is kept
# because a later cell reads the metric from this same variable.
predected_results = regressor.evaluate(test_data)
predected_results.predictions.show()

+-------------------+------+------------------+
|IndependentFeatures|Salary|        prediction|
+-------------------+------+------------------+
|         [25.0,2.0]| 50000| 51503.12268445009|
|         [27.0,5.0]| 70000| 66969.83169259998|
|         [28.0,3.0]| 60000| 59602.83687943309|
|         [31.0,6.0]| 72000| 76331.32211283999|
|         [32.0,6.0]| 75000| 77593.09833809701|
|        [45.0,15.0]|130000|132825.65893934577|
+-------------------+------+------------------+



In [32]:
# Report the mean absolute error of the predictions on the test rows.
print(predected_results.meanAbsoluteError)

2446.755583783299


In [None]:
# 01:30:0