## ***In this tutorial we will learn about Spark MLlib.***

In [None]:
!pip install pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('mlib').getOrCreate()
spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Read the CSV with its first row as the header and let Spark infer the
# column types instead of loading everything as strings.
training = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('test1.csv')
)
training.show()

+-------+---+----------+-------+
|   name|age|experience| salary|
+-------+---+----------+-------+
| Uttam | 21|        10|1000000|
| Karan | 23|         4| 400000|
|Krishna| 28|        50|5000000|
|  Arjun| 24|         9| 900000|
|  Bhim | 26|         7| 700000|
+-------+---+----------+-------+



In [None]:
# List every column as a (name, type) pair to confirm the inferred types.
training.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int'), ('salary', 'int')]

In [None]:
# Print the schema as a tree, including the nullability of each column.
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [None]:
# Column names available for feature selection.
training.columns

['name', 'age', 'experience', 'salary']

To create a training set we need to group all our independent features into a vector format. To do that we will import VectorAssembler.

['independent_v1','independent_v2',..] --> [New Feature] --> [Final Independent Feature Vector]

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric predictors into a single vector column, which is the input
# format the regressor expects. The output column is spelled exactly the way
# the later cells refer to it ('Independent feature'), so the notebook does not
# depend on Spark's case-insensitive column resolution to find it.
featureassembler = VectorAssembler(
    inputCols=['age', 'experience'],
    outputCol='Independent feature',
)

In [None]:
# Apply the assembler: keeps the original columns and appends the vector column.
output = featureassembler.transform(training)
output.show()

+-------+---+----------+-------+-------------------+
|   name|age|experience| salary|independent feature|
+-------+---+----------+-------+-------------------+
| Uttam | 21|        10|1000000|        [21.0,10.0]|
| Karan | 23|         4| 400000|         [23.0,4.0]|
|Krishna| 28|        50|5000000|        [28.0,50.0]|
|  Arjun| 24|         9| 900000|         [24.0,9.0]|
|  Bhim | 26|         7| 700000|         [26.0,7.0]|
+-------+---+----------+-------+-------------------+



In [None]:
# Keep only what the model needs: the assembled feature vector and the label.
final_data = output.select('Independent feature','salary')
final_data.show()

+-------------------+-------+
|Independent feature| salary|
+-------------------+-------+
|        [21.0,10.0]|1000000|
|         [23.0,4.0]| 400000|
|        [28.0,50.0]|5000000|
|         [24.0,9.0]| 900000|
|         [26.0,7.0]| 700000|
+-------------------+-------+



We have created our final data. Now we will build an ML model.

In [None]:
from pyspark.ml.regression import LinearRegression

# Train/test split:
# An unusual 60% train / 40% test split is used because the dataset is tiny;
# the usual 80/20 split did not work on it.
# With a large dataset use 80/20, 75/25, or whatever ratio suits the problem.
#
# The seed makes the split repeatable: without it every run trains and tests
# on different rows. If a split comes out empty on a dataset this small,
# pick a different SPLIT_SEED.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.6, 0.4], seed=SPLIT_SEED)

# Keep the unfitted estimator and the fitted model under separate names, so
# `regressor` only ever refers to the trained model used by the cells below.
lr_estimator = LinearRegression(featuresCol='Independent feature', labelCol='salary')
regressor = lr_estimator.fit(train_data)

Exception ignored in: <function JavaWrapper.__del__ at 0x7f3917f2da70>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LinearRegression' object has no attribute '_java_obj'


In [None]:
# Rows that ended up in the training split.
train_data.show()

+-------------------+-------+
|Independent feature| salary|
+-------------------+-------+
|        [21.0,10.0]|1000000|
|         [24.0,9.0]| 900000|
|         [26.0,7.0]| 700000|
+-------------------+-------+



In [None]:
# Rows held out for evaluation.
test_data.show()

+-------------------+-------+
|Independent feature| salary|
+-------------------+-------+
|         [23.0,4.0]| 400000|
|        [28.0,50.0]|5000000|
+-------------------+-------+



In [None]:
# Coefficients: one fitted weight per assembled input feature
# (inputs were assembled in the order age, experience).

regressor.coefficients

DenseVector([-0.0, 100000.0])

In [None]:
# Intercept: the constant term of the fitted linear model.

regressor.intercept

4.097163364774425e-07

In [None]:
# Prediction: score the held-out rows with the fitted model. The result
# carries both the per-row predictions and the summary error metrics.

pred_result = regressor.evaluate(test_data)
pred_result.predictions.show()

+-------------------+-------+-----------------+
|Independent feature| salary|       prediction|
+-------------------+-------+-----------------+
|         [23.0,4.0]| 400000|400000.0000000883|
|        [28.0,50.0]|5000000|4999999.999999235|
+-------------------+-------+-----------------+



In [None]:
# Test-set error metrics, in order: mean absolute error, mean squared error,
# root mean squared error.
pred_result.meanAbsoluteError, pred_result.meanSquaredError, pred_result.rootMeanSquaredError

(4.2645842768251896e-07, 2.9621722182602856e-13, 5.442584145661218e-07)