In [28]:
pip install pyspark



In [29]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession.builder
    .appName('practise')
    .getOrCreate()
)

In [9]:
# Load the CSV, treating the first row as column names and letting Spark
# infer each column's type from the data.
df_pyspark = spark.read.csv(
    'data5.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Print the loaded rows to check the parsed columns (Name, age, experience, salary).
df_pyspark.show()

+-------+---+----------+------+
|   Name|age|experience|salary|
+-------+---+----------+------+
| roheth| 31|        10| 30000|
|roshaan| 30|         8| 25000|
|   banu| 29|         4| 20000|
|  dhoni| 24|         3| 20000|
|  virat| 21|         1| 15000|
|  raina| 23|         2| 18000|
+-------+---+----------+------+



In [24]:
# Combine the predictor columns [age, experience] into one vector column,
# which serves as the independent-feature input for the model.
from pyspark.ml.feature import VectorAssembler

# The output column must be named "Independent Features": the later cells
# select that column and pass it to LinearRegression as featuresCol.
featureassembler = VectorAssembler(
    inputCols=["age", "experience"],
    outputCol="Independent Features",
)

In [30]:
# Apply the assembler: keeps the original columns and adds the feature-vector column.
output = featureassembler.transform(df_pyspark)

In [32]:
# Print the rows together with their assembled feature vector.
output.show()

+-------+---+----------+------+--------------------+
|   Name|age|experience|salary|Independent Features|
+-------+---+----------+------+--------------------+
| roheth| 31|        10| 30000|         [31.0,10.0]|
|roshaan| 30|         8| 25000|          [30.0,8.0]|
|   banu| 29|         4| 20000|          [29.0,4.0]|
|  dhoni| 24|         3| 20000|          [24.0,3.0]|
|  virat| 21|         1| 15000|          [21.0,1.0]|
|  raina| 23|         2| 18000|          [23.0,2.0]|
+-------+---+----------+------+--------------------+



In [33]:
# Model input  -> "Independent Features" (the assembled vector)
# Model output -> "salary" (the label to predict)

finalized_data = output.select(
    "Independent Features",
    "salary",
)

In [34]:
# Print the two-column (features, label) frame that is fed to the regressor.
finalized_data.show()

+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [37]:
## Train/test split and model fit
from pyspark.ml.regression import LinearRegression

# A fixed seed makes the 75/25 split repeatable for the same input data and
# partitioning, so the coefficients and metrics shown below do not change on
# every Restart & Run All.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name; `regressor` is the fitted
# model that the following cells inspect, evaluate and use for prediction.
linear_regression = LinearRegression(featuresCol='Independent Features', labelCol='salary')
regressor = linear_regression.fit(train_data)

In [38]:
# Learned weight per input feature, in the assembled vector's order (age, experience).
regressor.coefficients

DenseVector([-526.3158, 2105.2632])

In [39]:
# Learned intercept (bias term) of the fitted linear model.
regressor.intercept

24868.42105263305

In [40]:
# Score the held-out test split; the returned summary carries the predictions
# and the error metrics read in the following cells.
pred_results=regressor.evaluate(test_data)

In [41]:
# Print each test row's actual salary next to the model's prediction.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|salary|       prediction|
+--------------------+------+-----------------+
|          [23.0,2.0]| 18000|16973.68421052627|
|          [29.0,4.0]| 20000|18026.31578947337|
+--------------------+------+-----------------+



In [42]:
# Test-set error metrics as a (mean absolute error, mean squared error) pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(1500.00000000018, 2474376.731302606)

In [51]:
from pyspark.ml.linalg import Vectors

# New (age, experience) pairs to score with the fitted model.
input_data = [(31, 10), (28, 2)]
input_df = spark.createDataFrame(input_data, ["age", "experience"])

# Build the feature vector under the same column name the model was fitted on.
input_assembler = VectorAssembler(
    inputCols=["age", "experience"],
    outputCol="Independent Features",
)
input_output = input_assembler.transform(input_df)

# Only the feature vector column is needed for prediction.
input_features = input_output.select("Independent Features")

# Print the raw inputs alongside their assembled vector.
input_output.show()


+---+----------+--------------------+
|age|experience|Independent Features|
+---+----------+--------------------+
| 31|        10|         [31.0,10.0]|
| 28|         2|          [28.0,2.0]|
+---+----------+--------------------+



In [56]:
# Run the fitted model on the new feature vectors; transform() adds a
# "prediction" column to the input frame.
predictions = regressor.transform(input_features)

# Print each feature vector next to its predicted salary.
predictions.show()

+--------------------+------------------+
|Independent Features|        prediction|
+--------------------+------------------+
|         [31.0,10.0]| 29605.26315789479|
|          [28.0,2.0]|14342.105263157482|
+--------------------+------------------+

