In [104]:
import os

from pyspark.sql import SparkSession
# Start the Spark session (or reuse one that is already running) that backs
# every DataFrame created in this notebook.
spark = (
    SparkSession.builder
    .appName('Pyspark ML 1')
    .getOrCreate()
)

In [105]:
# Directory that holds the input CSV. It defaults to the original author's local
# folder; set the PYSPARK_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get("PYSPARK_DATA_DIR", "C:/Users/Acer/Desktop/Pyspark data sets")

# header=True takes the column names from the first row; inferSchema=True asks
# Spark to infer column types instead of reading every column as a string.
df = spark.read.csv(f"{DATA_DIR}/test4.csv", inferSchema=True, header=True)

In [106]:
# Preview the loaded rows (the file has 10; show() prints at most 20 by default).
df.show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   A| 24|         6| 24900|
|   B| 26|         7| 27050|
|   C| 22|        10| 23500|
|   D| 29|        10| 30500|
|   E| 25|        10| 26500|
|   F| 24|         5| 24750|
|   G| 22|        10| 23500|
|   H| 21|         7| 22050|
|   I| 23|         8| 24200|
|   J| 23|         8| 24200|
+----+---+----------+------+



In [107]:
# Check the types picked by inferSchema: Age, Experience and Salary should come
# back as integers and Name as a string.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [108]:
# Column names in order; these are the names used for inputCols / labelCol below.
df.columns

['Name', 'Age', 'Experience', 'Salary']

In [109]:
from pyspark.ml.feature import VectorAssembler

In [110]:
# Build a transformer that packs the two numeric predictors into one vector
# column, in the order listed (Age first, then Experience).
feature_assembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Features',
)

In [111]:
# Reminder: VectorAssembler takes `inputCols` (plural, a list of columns) but `outputCol` (singular).
# It combines Age and Experience into one vector column, which serves as the model's predictors (independent features).

In [112]:
# Append the assembled 'Independent Features' vector column; the original
# columns stay in place alongside it.
df_1 = feature_assembler.transform(df)

In [113]:
# Inspect the result: each row now carries an [Age, Experience] vector.
df_1.show()

+----+---+----------+------+--------------------+
|Name|Age|Experience|Salary|Independent Features|
+----+---+----------+------+--------------------+
|   A| 24|         6| 24900|          [24.0,6.0]|
|   B| 26|         7| 27050|          [26.0,7.0]|
|   C| 22|        10| 23500|         [22.0,10.0]|
|   D| 29|        10| 30500|         [29.0,10.0]|
|   E| 25|        10| 26500|         [25.0,10.0]|
|   F| 24|         5| 24750|          [24.0,5.0]|
|   G| 22|        10| 23500|         [22.0,10.0]|
|   H| 21|         7| 22050|          [21.0,7.0]|
|   I| 23|         8| 24200|          [23.0,8.0]|
|   J| 23|         8| 24200|          [23.0,8.0]|
+----+---+----------+------+--------------------+



In [114]:
# Keep only what the regression needs: the feature vector and the label.
# The assembler created the column as 'Independent Features' (capital F), so it is
# referenced by that exact name rather than relying on Spark's default
# case-insensitive column resolution (spark.sql.caseSensitive=false). It is then
# aliased to 'Independent features', the spelling the later cells use as featuresCol.
f_df = df_1.select(
    df_1['Independent Features'].alias('Independent features'),
    'Salary',
)

In [115]:
# Confirm the modelling frame holds just the feature vector and the Salary label.
f_df.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [24.0,6.0]| 24900|
|          [26.0,7.0]| 27050|
|         [22.0,10.0]| 23500|
|         [29.0,10.0]| 30500|
|         [25.0,10.0]| 26500|
|          [24.0,5.0]| 24750|
|         [22.0,10.0]| 23500|
|          [21.0,7.0]| 22050|
|          [23.0,8.0]| 24200|
|          [23.0,8.0]| 24200|
+--------------------+------+



In [189]:
from pyspark.ml.regression import LinearRegression

In [192]:
# randomSplit expects the weights as ONE list ([train, test]); passing them as
# separate positional arguments raises an error.
# A fixed seed makes the split, and every metric derived from it, reproducible
# across kernel restarts. NOTE(review): with only 10 rows a particular seed can
# leave test_data empty; if that happens, pick a different seed.
SPLIT_SEED = 42
train_data, test_data = f_df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [193]:
# Print both partitions, training rows first, to check how the rows were divided.
for split in (train_data, test_data):
    split.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [21.0,7.0]| 22050|
|         [22.0,10.0]| 23500|
|         [22.0,10.0]| 23500|
|          [23.0,8.0]| 24200|
|          [23.0,8.0]| 24200|
|          [24.0,6.0]| 24900|
|          [26.0,7.0]| 27050|
|         [29.0,10.0]| 30500|
+--------------------+------+

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [24.0,5.0]| 24750|
|         [25.0,10.0]| 26500|
+--------------------+------+



In [194]:
# Configure (but do not yet train) a linear regression estimator: the assembled
# vector column is the input and Salary is the target.
regressor = LinearRegression(
    featuresCol='Independent features',
    labelCol='Salary',
)

In [195]:
# fit() does not train the estimator in place; it returns a new fitted model
# (a LinearRegressionModel). The name `regressor` is rebound from the estimator
# to that model, which is what the later cells read coefficients/intercept from.
# NOTE(review): because of this rebinding, re-running this cell without first
# re-running the cell that creates the estimator is expected to fail, since the
# fitted model has no fit(). A separate name for the model would avoid that but
# would require updating the cells below.
regressor = regressor.fit(train_data)    # fitted model now bound to `regressor`

In [196]:
# Display the fitted model to confirm training succeeded. `coefficients` and
# `intercept` exist only on this fitted LinearRegressionModel, not on the unfitted
# LinearRegression estimator, which is why reading them before assigning the
# result of fit() raised an error.
regressor

LinearRegressionModel: uid=LinearRegression_039c41961c10, numFeatures=2

## Coefficients

In [197]:
# Fitted weights, one per entry of the feature vector, in assembler order: [Age, Experience].
regressor.coefficients

DenseVector([1000.0, 150.0])

## Intercept

In [198]:
# Fitted constant term; here it is effectively zero (floating-point noise).
regressor.intercept

2.0436233656461416e-10

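### Note: `fit()` does not modify the estimator in place; it returns a new fitted model. The result must be assigned (here `regressor = regressor.fit(train_data)`); otherwise `regressor` is still the unfitted estimator, and accessing `coefficients` or `intercept` raises an error.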

## Prediction

In [199]:
# evaluate() scores the fitted model on the held-out rows. Despite the name,
# `pred` is a summary object rather than a DataFrame of predictions: the
# predictions are under `pred.predictions` and the error metrics are attributes
# such as `pred.meanAbsoluteError`.
pred = regressor.evaluate(test_data)

In [200]:
# Held-out rows with the model output appended as a `prediction` column next to
# the true Salary.
pred.predictions.show()

+--------------------+------+------------------+
|Independent features|Salary|        prediction|
+--------------------+------+------------------+
|          [24.0,5.0]| 24750|24749.999999999996|
|         [25.0,10.0]| 26500|26499.999999999996|
+--------------------+------+------------------+



In [202]:
# Show the held-out rows again for comparison. The predictions match Salary
# almost exactly because the data set was constructed to be exactly linear:
# Salary = 1000 * Age + 150 * Experience.
test_data.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|          [24.0,5.0]| 24750|
|         [25.0,10.0]| 26500|
+--------------------+------+



In [203]:
# Mean absolute error and mean squared error on the held-out rows. Both are ~0
# (floating-point noise) because the data follow an exact linear relationship.
pred.meanAbsoluteError, pred.meanSquaredError

(3.637978807091713e-12, 1.3234889800848446e-23)