In [14]:
import os

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already
# active in this process (getOrCreate).
spark = (
    SparkSession.builder
    .appName("5")
    .getOrCreate()
)
spark

In [4]:
# Location of the input CSV. The default is the original author's local path;
# set the RECORDS_CSV environment variable to run the notebook on another
# machine without editing this cell.
RECORDS_CSV = os.environ.get("RECORDS_CSV", r"C:\Stack overflow\Py-Spark\records.csv")

# header=True takes the column names from the first row; inferSchema=True asks
# Spark to derive column types from the data rather than reading every column
# as a string.
df = spark.read.csv(RECORDS_CSV, header=True, inferSchema=True)
df.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
| Tharun| 31|        10| 30000|
| Naveen| 30|         8| 25000|
|  Aqeel| 29|         4| 20000|
| Aariff| 24|         3| 20000|
| Harish| 21|         1| 15000|
|Praveen| 23|         2| 18000|
+-------+---+----------+------+



In [7]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [9]:
# Assembler that packs the two numeric predictor columns into a single vector
# column named "IndependentFeature", which the regression below reads as its
# features column.
features = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="IndependentFeature",
)

In [11]:
# Apply the assembler to df; the result keeps the original columns and gains
# the IndependentFeature vector column.
output = features.transform(df)

In [12]:
# Display the assembled frame to check the new vector column against Age/Experience.
output.show()

+-------+---+----------+------+------------------+
|   Name|Age|Experience|Salary|IndependentFeature|
+-------+---+----------+------+------------------+
| Tharun| 31|        10| 30000|       [31.0,10.0]|
| Naveen| 30|         8| 25000|        [30.0,8.0]|
|  Aqeel| 29|         4| 20000|        [29.0,4.0]|
| Aariff| 24|         3| 20000|        [24.0,3.0]|
| Harish| 21|         1| 15000|        [21.0,1.0]|
|Praveen| 23|         2| 18000|        [23.0,2.0]|
+-------+---+----------+------+------------------+



In [13]:
# Keep only what the regression needs: the assembled feature vector and the
# Salary label.
final = output.select("IndependentFeature", "Salary")
final.show()

+------------------+------+
|IndependentFeature|Salary|
+------------------+------+
|       [31.0,10.0]| 30000|
|        [30.0,8.0]| 25000|
|        [29.0,4.0]| 20000|
|        [24.0,3.0]| 20000|
|        [21.0,1.0]| 15000|
|        [23.0,2.0]| 18000|
+------------------+------+



In [15]:
# Fixed seed so the train/test split -- and therefore the fitted coefficients
# and the error metrics in the cells below -- do not change from run to run.
# NOTE: randomSplit assigns rows probabilistically, so with a dataset this
# small the 80/20 proportions are only approximate.
SPLIT_SEED = 42
train, test = final.randomSplit([0.80, 0.20], seed=SPLIT_SEED)

# Keep the unfitted estimator and the fitted model under separate names.
# `regressor` stays bound to the fitted model because later cells read
# `regressor.coefficients` and call `regressor.evaluate(test)`.
estimator = LinearRegression(featuresCol="IndependentFeature", labelCol="Salary")
regressor = estimator.fit(train)

In [16]:
# Fitted coefficients, one per entry of the feature vector; the order should
# follow the assembler's inputCols (Age, Experience).
regressor.coefficients

DenseVector([-714.2857, 3485.7143])

In [17]:
# Evaluate the fitted model on the held-out split; the returned summary
# provides the predictions and the error metric read in the following cells.
pred = regressor.evaluate(test)

In [20]:
# Show the test rows with the model's prediction next to the actual Salary.
pred.predictions.show()

+------------------+------+-----------------+
|IndependentFeature|Salary|       prediction|
+------------------+------+-----------------+
|        [30.0,8.0]| 25000|33314.28571428435|
|       [31.0,10.0]| 30000|39571.42857142652|
+------------------+------+-----------------+



In [21]:
# Mean absolute error over the test split, in the same units as Salary.
pred.meanAbsoluteError

8942.857142855435