In [186]:
import pandas as pd

# Schema of the toy salary dataset; each row below follows this order.
columns = ["name", "age", "experience", "salary"]

# One record per employee: (name, age, years of experience, salary).
data = [
    ("Alice", 30, 5, 70000),
    ("Bob", 25, 2, 50000),
    ("Charlie", 35, 10, 100000),
    ("Diana", 28, 4, 60000),
    ("Ethan", 40, 15, 120000),
]

# Build the frame, then persist it without the pandas index so the CSV
# contains only the four data columns.
df = pd.DataFrame(data=data, columns=columns)
df.to_csv(path_or_buf="mlLib.csv", index=False)

# Last expression of the cell: rendered as the cell output.
df


Unnamed: 0,name,age,experience,salary
0,Alice,30,5,70000
1,Bob,25,2,50000
2,Charlie,35,10,100000
3,Diana,28,4,60000
4,Ethan,40,15,120000


In [187]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("MLLib Example")
    .getOrCreate()
)

In [188]:
# Read the CSV into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
training = spark.read.csv(
    "mlLib.csv",
    header=True,
    inferSchema=True,
)
training.show()         # preview the rows
training.printSchema()  # print the inferred column types
training.columns        # last expression: displayed as the cell output

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  Alice| 30|         5| 70000|
|    Bob| 25|         2| 50000|
|Charlie| 35|        10|100000|
|  Diana| 28|         4| 60000|
|  Ethan| 40|        15|120000|
+-------+---+----------+------+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



['name', 'age', 'experience', 'salary']

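Combine the input columns `['age', 'experience']` into a single new vector column, `independent features`, which the model uses as its independent (predictor) features.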

In [189]:
from pyspark.ml.feature import VectorAssembler
# Pack the two numeric predictor columns into a single vector column.
featureassembler = VectorAssembler(
    inputCols=["age", "experience"],
    outputCol="independent features",
)

# transform() returns a new DataFrame with the vector column appended.
df_assembled = featureassembler.transform(training)
df_assembled.show()

+-------+---+----------+------+--------------------+
|   name|age|experience|salary|independent features|
+-------+---+----------+------+--------------------+
|  Alice| 30|         5| 70000|          [30.0,5.0]|
|    Bob| 25|         2| 50000|          [25.0,2.0]|
|Charlie| 35|        10|100000|         [35.0,10.0]|
|  Diana| 28|         4| 60000|          [28.0,4.0]|
|  Ethan| 40|        15|120000|         [40.0,15.0]|
+-------+---+----------+------+--------------------+



In [190]:
# Keep only what the regression needs: the feature vector and the label.
finalised_data = df_assembled.select(
    "independent features",
    "salary",
)
finalised_data.show()

+--------------------+------+
|independent features|salary|
+--------------------+------+
|          [30.0,5.0]| 70000|
|          [25.0,2.0]| 50000|
|         [35.0,10.0]|100000|
|          [28.0,4.0]| 60000|
|         [40.0,15.0]|120000|
+--------------------+------+



In [191]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so the train/test split -- and therefore the fitted coefficients
# and the error metrics computed from it -- is the same on every
# Restart & Run All. Without a seed, randomSplit draws a new split each run.
# NOTE(review): the split is random per row, so with only five rows the
# 75/25 proportions are approximate and test_data may even be empty; pick a
# different seed if that happens.
train_data, test_data = finalised_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name instead of rebinding one
# variable from estimator to model. `regresser` remains the fitted model
# because later cells read `regresser.coefficients`, `regresser.intercept`
# and call `regresser.evaluate(...)`.
lr_estimator = LinearRegression(featuresCol="independent features", labelCol="salary")
regresser = lr_estimator.fit(train_data)

25/06/27 23:44:41 WARN Instrumentation: [98c4a12f] regParam is zero, which might cause numerical instability and overfitting.


In [192]:
# Fitted coefficients of the linear model, one per entry of the
# "independent features" vector (assembled from age, then experience).
regresser.coefficients

DenseVector([2439.8625, 2628.866])

In [193]:
# Fitted intercept (constant term) of the linear model.
regresser.intercept

-17113.402061865876

In [194]:
# Evaluate the fitted model on the held-out split; the result exposes the
# per-row predictions as well as the error metrics read in the next cell.
pred_results = regresser.evaluate(test_data)
# Show features, actual salary and predicted salary side by side.
pred_results.predictions.show()

+--------------------+------+-----------------+
|independent features|salary|       prediction|
+--------------------+------+-----------------+
|         [35.0,10.0]|100000|94570.44673539535|
+--------------------+------+-----------------+



In [195]:
# Mean absolute error and mean squared error on the test split, displayed as
# a tuple. NOTE(review): in the recorded run the test split held a single
# row, so these figures describe one prediction only.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(5429.553264604649, 29480048.653178997)