In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell below (getOrCreate returns the
# builder's session), registered under the application name "MLlib".
spark = (
    SparkSession.builder
    .appName("MLlib")
    .getOrCreate()
)

In [2]:
# Load the employee CSV into a Spark DataFrame and preview it.
# The file location can be overridden with the EMP_CSV_PATH environment variable so
# the notebook is not tied to a single machine; when the variable is unset, the
# original local path is used, so existing behaviour is unchanged.
import os

emp_csv_path = os.environ.get(
    "EMP_CSV_PATH",
    r"C:\Users\bhupe\OneDrive\Desktop\Portfolio Project\SQL Project\Interview Prep\emp2.csv",
)

# header=True: the first line holds the column names.
# inferSchema=True: let Spark infer column types instead of reading everything as string.
training = spark.read.csv(emp_csv_path, header=True, inferSchema=True)
training.show()

+------+-------+----+-------+---+---+----------+
|emp_id|   name|dept|salary |age|sex|experience|
+------+-------+----+-------+---+---+----------+
|     1|  argha|sale|  10000| 25|  M|         2|
|     2|  sohag|tech|  20000| 25|  M|         3|
|     7|  soura|tech|  30000| 25|  M|         2|
|     4| shivam|tech|  10000| 26|  F|        10|
|     5|  lohit|  hr|  40000| 26|  F|         5|
|     6|prakash|  hr|  50000| 26|  M|         6|
|     8|sourabh|tech|  60000| 24|  F|         5|
|    10|  biswa|sale|  50000| 27|  M|         2|
|     7|  soura|tech|  30000| 25|  M|         3|
|     6|prakash|  hr|  50000| 26|  M|         2|
|     4| shivam|tech|  10000| 26|  F|         1|
+------+-------+----+-------+---+---+----------+



In [3]:
# Print the column names and the types Spark inferred for them.
# Note in the output that the salary column is named "salary " (trailing space).
training.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary : integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- experience: integer (nullable = true)



In [5]:
# The CSV header carries a trailing space in "salary "; rename the column to the
# clean "salary" so later cells can refer to it without the stray whitespace.
training = training.withColumnRenamed(existing="salary ", new="salary")

In [6]:
# Check the rename: the list should now contain "salary" with no trailing space.
training.columns

['emp_id', 'name', 'dept', 'salary', 'age', 'sex', 'experience']

In [7]:
## Create the independent-feature vector column using VectorAssembler
from pyspark.ml.feature import VectorAssembler

In [8]:
# Combine the two numeric predictors into one vector column. The column is named
# "Independant feature" (spelling kept as-is because later cells reference it).
assemble_feature = VectorAssembler(
    inputCols=["experience", "age"],
    outputCol="Independant feature",
)

In [9]:
# Apply the assembler: returns the training rows with the extra
# "Independant feature" vector column built from experience and age.
output = assemble_feature.transform(training)

In [10]:
# Preview the rows together with the newly assembled feature vector.
output.show()

+------+-------+----+------+---+---+----------+-------------------+
|emp_id|   name|dept|salary|age|sex|experience|Independant feature|
+------+-------+----+------+---+---+----------+-------------------+
|     1|  argha|sale| 10000| 25|  M|         2|         [2.0,25.0]|
|     2|  sohag|tech| 20000| 25|  M|         3|         [3.0,25.0]|
|     7|  soura|tech| 30000| 25|  M|         2|         [2.0,25.0]|
|     4| shivam|tech| 10000| 26|  F|        10|        [10.0,26.0]|
|     5|  lohit|  hr| 40000| 26|  F|         5|         [5.0,26.0]|
|     6|prakash|  hr| 50000| 26|  M|         6|         [6.0,26.0]|
|     8|sourabh|tech| 60000| 24|  F|         5|         [5.0,24.0]|
|    10|  biswa|sale| 50000| 27|  M|         2|         [2.0,27.0]|
|     7|  soura|tech| 30000| 25|  M|         3|         [3.0,25.0]|
|     6|prakash|  hr| 50000| 26|  M|         2|         [2.0,26.0]|
|     4| shivam|tech| 10000| 26|  F|         1|         [1.0,26.0]|
+------+-------+----+------+---+---+----------+-------------------+

In [12]:
## Keep only what the model needs: the feature vector and the label (salary)
finalized_data = output.select("Independant feature", "salary")
finalized_data.show()

+-------------------+------+
|Independant feature|salary|
+-------------------+------+
|         [2.0,25.0]| 10000|
|         [3.0,25.0]| 20000|
|         [2.0,25.0]| 30000|
|        [10.0,26.0]| 10000|
|         [5.0,26.0]| 40000|
|         [6.0,26.0]| 50000|
|         [5.0,24.0]| 60000|
|         [2.0,27.0]| 50000|
|         [3.0,25.0]| 30000|
|         [2.0,26.0]| 50000|
|         [1.0,26.0]| 10000|
+-------------------+------+



In [14]:
## Use linear regression to predict salary based on the independent features
from pyspark.ml.regression import LinearRegression


In [56]:
# Hold out roughly 25% of the rows for evaluation and fit on the remainder.
# randomSplit assigns rows at random, so without a seed every run produces a
# different train/test membership and therefore different coefficients and error
# metrics. Fixing the seed keeps the split, and everything derived from it,
# stable across Restart & Run All.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Linear regression reading its inputs from the assembled vector column and
# using salary as the label to predict.
regression = LinearRegression(featuresCol="Independant feature", labelCol="salary")
regressor = regression.fit(train_data)

In [57]:
# Show which rows landed in the training split.
train_data.show()

+-------------------+------+
|Independant feature|salary|
+-------------------+------+
|         [2.0,25.0]| 10000|
|         [2.0,25.0]| 30000|
|         [2.0,26.0]| 50000|
|         [2.0,27.0]| 50000|
|         [5.0,26.0]| 40000|
|         [6.0,26.0]| 50000|
+-------------------+------+



In [48]:
# Fitted weights, one per entry of the feature vector (assembled as [experience, age]).
regressor.coefficients

DenseVector([5813.9535, -1162.7907])

In [49]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

48139.5348837087

In [50]:
# Evaluate the fitted model on the held-out split. The returned summary exposes a
# `predictions` DataFrame: the test rows plus a `prediction` column.
predict_result = regressor.evaluate(test_data)
predict_result.predictions.show()

+-------------------+------+------------------+
|Independant feature|salary|        prediction|
+-------------------+------+------------------+
|         [1.0,26.0]| 10000|23720.930232558516|
|         [2.0,27.0]| 50000|28372.093023256653|
|         [3.0,25.0]| 20000|36511.627906976595|
|         [3.0,25.0]| 30000|36511.627906976595|
|        [10.0,26.0]| 10000|  76046.5116279072|
+-------------------+------+------------------+



In [51]:
# (mean absolute error, mean squared error) on the held-out split.
# MAE is in salary units; MSE is in salary units squared.
predict_result.meanAbsoluteError, predict_result.meanSquaredError

(24883.72093023245, 1066641427.7988094)

In [63]:
# Print every integer from -5 through 5, one per line (the stop value 6 is exclusive).
# NOTE(review): this cell looks like scratch work unrelated to the regression above.
for i in range(-5, 6):
    print(i)

-5
-4
-3
-2
-1
0
1
2
3
4
5
