In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('Varun').getOrCreate()

In [3]:
df3=spark.read.csv('C:/Users/acer/Downloads/test1.csv',header=True,inferSchema=True)

In [4]:
df3.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
df3.limit(4).show() 

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
+---------+---+----------+------+



In [6]:
#retrieving independent features
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=["age","Experience"],outputCol="Independent Features")

In [7]:
output=featureassembler.transform(df3)

In [8]:
output.show()

+---------+---+----------+------+--------------------+
|     Name|age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



In [9]:
output.columns

['Name', 'age', 'Experience', 'Salary', 'Independent Features']

In [10]:
#final data for prediction
final_data=output.select("Independent Features","Salary")

In [11]:
final_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [12]:
from pyspark.ml.regression import LinearRegression
#splitting
train_data,test_data=final_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor=regressor.fit(train_data)

In [13]:
regressor.coefficients  

DenseVector([-263.7076, 1767.624])

In [14]:
res=regressor.evaluate(train_data)

In [15]:
res.r2

0.9676033106753748

In [16]:
#Intercepts
regressor.intercept

19919.060052212404

In [17]:
test_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [29.0,4.0]| 20000|
+--------------------+------+



In [18]:
pred_results=regressor.evaluate(test_data)

In [19]:
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [29.0,4.0]| 20000|19342.03655352618|
+--------------------+------+-----------------+



In [20]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(657.9634464738192, 432915.89689570636)

In [21]:
#without labels
x=test_data.select('Independent Features')

In [22]:
x.show()

+--------------------+
|Independent Features|
+--------------------+
|          [29.0,4.0]|
+--------------------+



In [23]:
pred2=regressor.transform(x)
pred2.show()

+--------------------+-----------------+
|Independent Features|       prediction|
+--------------------+-----------------+
|          [29.0,4.0]|19342.03655352618|
+--------------------+-----------------+



Manual calcution

In [35]:
import numpy as np

X = np.array([[1, 31, 10],[1, 30, 8],[1, 29, 4], [1, 24, 3]])
y = np.array([30000, 25000, 20000, 20000])
#normal equation
theta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
print(theta)
b, w1, w2 = theta

[23998.33055092  -383.97328881  1711.18530885]


In [36]:
X_train = np.array([[1,31.0, 10.0],[1,30.0, 8.0],[1,29.0, 4.0],[1,24.0, 3.0],[1,21.0, 1.0],[1,23.0, 2.0]])
y_train = np.array([30000, 25000, 20000, 20000, 15000, 18000])

In [37]:
#theta calculation
theta = np.linalg.inv(X_train.T.dot(X_train)).dot(X_train.T).dot(y_train)

In [41]:
b, w1, w2 = theta
print('intercept-',b,'\ncoeffients for age:',w1,' & Experience: ',w2)

intercept- 16904.79523976019 
coeffients for age: -115.85579278960506  & Experience:  1602.730136506825


In [42]:
#prediction on test data
X_test = np.array([29.0, 4.0])
salary = b + w1 * X_test[0] + w2 * X_test[1]
salary

19955.897794888944

In [43]:
#vales from the model 
b=19919.060052212404
w1,w2=-263.7076,1767.624
X_test = np.array([29.0, 4.0])
salary = b + w1 * X_test[0] + w2 * X_test[1]
salary

19342.035652212402