* There are multiple ways to use Apache Spark; one of them is the DataFrame API.
### Examples of PySpark ML

In [2]:
from pyspark.sql import SparkSession
# Entry point for the DataFrame API. getOrCreate() hands back the already
# running session when there is one, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName("DF")
    .getOrCreate()
)

In [3]:
# Load the CSV: the first row supplies the column names and Spark infers
# each column's type from the data instead of reading everything as string.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("test.csv")
)

In [5]:
# Print the column names and inferred types to confirm the numeric columns
# were not read as strings.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [19]:
# Pack the numeric predictors (Age, Experience) into a single vector column;
# that column is the independent-feature input handed to the model.
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [20]:
# Run the assembler over the loaded rows: the result keeps the original
# columns and adds the "Independent Features" vector column. Preview it.
output = featureassembler.transform(training)
output.show()

+------+---+----------+------+--------------------+
|  Name|Age|Experience|Salary|Independent Features|
+------+---+----------+------+--------------------+
|   Ali| 31|         4| 30000|          [31.0,4.0]|
|   Ali| 24|         6| 25000|          [24.0,6.0]|
| Fatma| 55|         7| 59994|          [55.0,7.0]|
|Mehmet| 23|         2| 13231|          [23.0,2.0]|
|  Ayse| 54|         6| 93211|          [54.0,6.0]|
| Harun| 32|         7| 13231|          [32.0,7.0]|
+------+---+----------+------+--------------------+



In [21]:
# Keep only what the regression needs: the assembled feature vector and the
# Salary column that serves as the label.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [31.0,4.0]| 30000|
|          [24.0,6.0]| 25000|
|          [55.0,7.0]| 59994|
|          [23.0,2.0]| 13231|
|          [54.0,6.0]| 93211|
|          [32.0,7.0]| 13231|
+--------------------+------+



In [22]:
from pyspark.ml.regression import LinearRegression

# Train/test split (75% / 25%). The seed pins the random row assignment so
# that, for the same input data, the coefficients and metrics computed in the
# following cells are repeatable instead of changing on every run.
# NOTE(review): the preview above shows only six rows, so either side of the
# split can end up with very few rows -- treat the resulting fit as a demo.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Estimator configuration: features come from the assembled vector column,
# the label is Salary.
regressor = LinearRegression(featuresCol="Independent Features", labelCol="Salary")

# fit() returns the trained model. The name `regressor` is deliberately
# rebound from the estimator to that fitted model because the cells below
# read `regressor.coefficients`, `regressor.intercept` and call
# `regressor.evaluate(...)`.
regressor = regressor.fit(train_data)


In [23]:
# Fitted weights, one per entry of the feature vector (assembled in the
# order Age, Experience).
regressor.coefficients

DenseVector([7883.2593, -41100.2593])

In [24]:
# Constant (bias) term of the fitted linear model.
regressor.intercept

-85883.4444444461

In [25]:
# Score the fitted model on the held-out split; the returned object carries
# the per-row predictions and the error metrics inspected in the next cells.
pred_result = regressor.evaluate(test_data)

In [26]:
# Show each held-out row next to the model's prediction for it.
pred_result.predictions.show()

+--------------------+------+-------------------+
|Independent Features|Salary|         prediction|
+--------------------+------+-------------------+
|          [24.0,6.0]| 25000|-143286.77777778186|
|          [31.0,4.0]| 30000| -5903.444444445398|
|          [32.0,7.0]| 13231|-121320.96296296685|
+--------------------+------+-------------------+



In [27]:
# Mean absolute error and mean squared error on the held-out rows, displayed
# as a (MAE, MSE) tuple.
pred_result.meanAbsoluteError, pred_result.meanSquaredError

(112914.06172839804, 15904575878.330505)