## Example of PySpark ML

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session (or reuse one that is already running) that every
# DataFrame and ML call in this notebook goes through.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [3]:
# Load the sample CSV into a Spark DataFrame.
# The first row is treated as the header, and inferSchema asks Spark to derive
# the column types from the data (the schema printed further down shows the
# numeric columns as integers).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
csv_path = r'C:\Users\SAMEER\OneDrive\Desktop\Data Science 6pm\Spark\sample.csv'
reader = spark.read.option('header', 'true')
training = reader.csv(csv_path, inferSchema=True)

In [4]:
# Print the loaded rows as a text table to eyeball the raw data.
training.show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
| John| 38|         9| 35000|
|Roman| 37|         7| 30000|
| Rock| 40|        10| 40000|
|orton| 37|         8| 33000|
| Seth| 30|         5| 26000|
| Dean| 29|         4| 25000|
+-----+---+----------+------+



In [6]:
# Print each column's name, type and nullability to check the inferred schema.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
# List the column names; Age and Experience are used as features and Salary as
# the label in the cells that follow.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [8]:
from pyspark.ml.feature import VectorAssembler

# The regression further down reads its predictors from a single column, so
# pack Age and Experience into one vector column named 'Independent Features'.
feature_columns = ["Age", "Experience"]
featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='Independent Features',
)

In [9]:
# Apply the assembler: the result keeps the original columns and adds the
# 'Independent Features' vector column for every row.
output = featureassembler.transform(training)

In [10]:
# Inspect the assembled DataFrame, including the new feature-vector column.
output.show()

+-----+---+----------+------+--------------------+
| Name|Age|Experience|Salary|Independent Features|
+-----+---+----------+------+--------------------+
| John| 38|         9| 35000|          [38.0,9.0]|
|Roman| 37|         7| 30000|          [37.0,7.0]|
| Rock| 40|        10| 40000|         [40.0,10.0]|
|orton| 37|         8| 33000|          [37.0,8.0]|
| Seth| 30|         5| 26000|          [30.0,5.0]|
| Dean| 29|         4| 25000|          [29.0,4.0]|
+-----+---+----------+------+--------------------+



In [12]:
# Keep only what the model needs: the feature vector and the Salary label.
model_columns = ('Independent Features', 'Salary')
finalized_data = output.select(*model_columns)

In [13]:
# Confirm the modelling DataFrame holds just the features and the label.
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [38.0,9.0]| 35000|
|          [37.0,7.0]| 30000|
|         [40.0,10.0]| 40000|
|          [37.0,8.0]| 33000|
|          [30.0,5.0]| 26000|
|          [29.0,4.0]| 25000|
+--------------------+------+



In [14]:
from pyspark.ml.regression import LinearRegression

# Train/test split (75% / 25%). A fixed seed makes randomSplit produce the same
# partition on every run, so the fitted coefficients and the error metrics are
# reproducible after a kernel restart instead of changing each time.
# NOTE(review): the dataset has only six rows, so either side of the split can
# be very small; check the split sizes before trusting the metrics.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator under its own name so that `regressor` always
# refers to one kind of object: the fitted model that the later cells query.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor = lr_estimator.fit(train_data)

In [15]:
# Fitted coefficients, one per assembled input feature (Age, Experience).
regressor.coefficients

DenseVector([-392.1335, 3162.0977])

In [16]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

22711.56138259817

In [17]:
# Evaluate the fitted model on the held-out test split. The returned summary
# exposes the per-row predictions and the error metrics read in the cells below.
pred_results = regressor.evaluate(test_data)

In [18]:
# Show the test rows next to the model's predicted Salary.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [37.0,8.0]| 33000|33499.40405244339|
+--------------------+------+-----------------+



In [19]:
# Test-set error metrics, displayed as a (mean absolute error, mean squared error) tuple.
mae = pred_results.meanAbsoluteError
mse = pred_results.meanSquaredError
(mae, mse)

(499.4040524433876, 249404.40759687786)