# Example of PySpark ML

In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that already
# exists); the application is registered under the name 'ML01'.
spark = (
    SparkSession.builder
    .appName('ML01')
    .getOrCreate()
)



In [2]:
# Load the dataset: the first CSV row supplies the column names and the
# column types are inferred from the data.
training = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('Test.csv')
)
training.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Owais| 34|         7| 40000|
|Zubair| 35|        10| 45000|
| Aijaz| 34|         2| 12000|
| Bilal| 35|         8| 70000|
|Ishfaq| 28|         4| 22000|
|Sameer| 25|         1| 15000|
|Ikhlaq| 46|         9| 41000|
|  Sonu| 36|         2| 23000|
+------+---+----------+------+



In [3]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Experience', 'Salary']

- In other techniques we do a train/test split and assign the independent variables to X and the dependent variable to Y.
# For pyspark we do things differently 
- [Age, Experience] -----> grouped together -----> new feature -----> independent feature
- To group [Age, Experience] in PySpark we use `VectorAssembler`
- Here we take the numeric input columns (Age and Experience in our case) and combine them
  into a single independent feature [Age, Experience] using `VectorAssembler`. Salary is not part of this group; it is the value we predict.

In [10]:
from pyspark.ml.feature import VectorAssembler

# Pack the Age and Experience columns into one vector column named
# "Independent Features"; that column is what the regression below is
# given as its featuresCol.
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [11]:
# Run the assembler over the loaded DataFrame. `output` keeps the original
# columns and gains the "Independent Features" vector column.
output=featureassembler.transform(training)

In [12]:
# A new "Independent Features" vector column has been added, as shown below.
output.show()

+------+---+----------+------+--------------------+
|  Name|Age|Experience|Salary|Independent Features|
+------+---+----------+------+--------------------+
| Owais| 34|         7| 40000|          [34.0,7.0]|
|Zubair| 35|        10| 45000|         [35.0,10.0]|
| Aijaz| 34|         2| 12000|          [34.0,2.0]|
| Bilal| 35|         8| 70000|          [35.0,8.0]|
|Ishfaq| 28|         4| 22000|          [28.0,4.0]|
|Sameer| 25|         1| 15000|          [25.0,1.0]|
|Ikhlaq| 46|         9| 41000|          [46.0,9.0]|
|  Sonu| 36|         2| 23000|          [36.0,2.0]|
+------+---+----------+------+--------------------+



In [13]:
# Column names after assembling: the original four plus "Independent Features".
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [14]:
# Print the schema to confirm that "Independent Features" has the vector type.
output.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Independent Features: vector (nullable = true)



In [16]:
# Keep only what the model needs: the assembled feature vector and the
# Salary column that serves as the label.
finalized_data = output.select(["Independent Features", "Salary"])
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [34.0,7.0]| 40000|
|         [35.0,10.0]| 45000|
|          [34.0,2.0]| 12000|
|          [35.0,8.0]| 70000|
|          [28.0,4.0]| 22000|
|          [25.0,1.0]| 15000|
|          [46.0,9.0]| 41000|
|          [36.0,2.0]| 23000|
+--------------------+------+



In [18]:
# Now we will do linear regression
from pyspark.ml.regression import LinearRegression

# Train/test split. randomSplit is random, so the seed is pinned: without it
# every run produces a different split and therefore different coefficients
# and error metrics.
# NOTE(review): with this few rows either side of the split can still come out
# very small -- check the test set is non-empty before trusting the metrics.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Configure the estimator and fit it on the training rows in one step, so
# `regressor` is only ever bound to the fitted model. The following cells
# read its coefficients and intercept and call evaluate() on it.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
).fit(train_data)

In [19]:
# Coefficients of the fitted model, one per input feature (Age, Experience)
regressor.coefficients

DenseVector([13.6238, 3509.8436])

In [20]:
# Intercept of the fitted model
regressor.intercept

10273.288205917817

In [21]:
## Prediction
# Evaluate the fitted model on the held-out test split. The returned summary
# is used in the next cells for its `predictions` DataFrame and its
# meanAbsoluteError / meanSquaredError values.
pred_results=regressor.evaluate(test_data)

In [23]:
# Show the test rows with the model's `prediction` column next to the actual Salary.
pred_results.predictions.show()



+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [35.0,8.0]| 70000|38828.86871344091|
+--------------------+------+-----------------+



In [24]:
# Error metrics on the test split, displayed as (mean absolute error, mean squared error).
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(31171.131286559088, 971639425.6839029)