In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook, running on the
# "local[5]" master under the application name "Linear Regression".
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("Linear Regression")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/17 11:45:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Column names, listed in the same order as the values inside each row.
schema = ["Name", "Age", "Experience", "Salary"]

# Five hand-written rows that serve as the toy dataset for this notebook.
sample_data = [
    ["A", 45, 12, 5000],
    ["B", 40, 7, 4300],
    ["C", 50, 16, 5600],
    ["D", 34, 5, 3800],
    ["E", 42, 6, 4100],
]

# Only column names are supplied, so the column types come from the Python
# values themselves (see the printed schema).
df = spark.createDataFrame(sample_data, schema=schema)
df.printSchema()
df.show()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Experience: long (nullable = true)
 |-- Salary: long (nullable = true)



                                                                                

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   A| 45|        12|  5000|
|   B| 40|         7|  4300|
|   C| 50|        16|  5600|
|   D| 34|         5|  3800|
|   E| 42|         6|  4100|
+----+---+----------+------+



*In PySpark ML, the independent variables (features) must be combined into a single vector column.*


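*The features `[x1, x2, x3, x4, ..., xn]` go into one vector column and the label `[y]` stays in its own column — use `VectorAssembler` to build the feature vector.*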

*VectorAssembler merges multiple columns into a vector*

In [4]:
from pyspark.ml.feature import VectorAssembler

# Pack the two predictor columns into one vector column; the regression cell
# further down reads that column through its featuresCol argument.
independent_features = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)
output = independent_features.transform(df)  # df plus the new vector column
output.show()

                                                                                

+----+---+----------+------+--------------------+
|Name|Age|Experience|Salary|Independent Features|
+----+---+----------+------+--------------------+
|   A| 45|        12|  5000|         [45.0,12.0]|
|   B| 40|         7|  4300|          [40.0,7.0]|
|   C| 50|        16|  5600|         [50.0,16.0]|
|   D| 34|         5|  3800|          [34.0,5.0]|
|   E| 42|         6|  4100|          [42.0,6.0]|
+----+---+----------+------+--------------------+



In [6]:
# Keep only the two columns the model consumes.
data = output.select(
    "Independent Features",  # assembled feature vector
    "Salary",                # regression label
)
data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [45.0,12.0]|  5000|
|          [40.0,7.0]|  4300|
|         [50.0,16.0]|  5600|
|          [34.0,5.0]|  3800|
|          [42.0,6.0]|  4100|
+--------------------+------+



In [31]:
# Train/test split (roughly 60% / 40%).
# A fixed seed makes the random split repeatable across "Restart & Run All";
# without it every run trains and evaluates on different rows, so the
# coefficients and metrics shown further down could not be reproduced.
# NOTE(review): randomSplit assigns rows probabilistically, so with only five
# rows the split sizes are approximate — inspect the splits in the next cell and
# pick another seed if either side ends up empty.
train_data, test_data = data.randomSplit([0.6, 0.4], seed=42)

In [32]:
# Print the training rows first, then the held-out test rows.
for _split in (train_data, test_data):
    _split.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [45.0,12.0]|  5000|
|          [40.0,7.0]|  4300|
|          [42.0,6.0]|  4100|
+--------------------+------+

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [50.0,16.0]|  5600|
|          [34.0,5.0]|  3800|
+--------------------+------+



In [33]:
# LinearRegression
from pyspark.ml.regression import LinearRegression

# Configure a linear regression that reads the assembled vector column as its
# features and Salary as its label, then fit it on the training split only.
lin_reg = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Salary",
)
lin_reg_fit = lin_reg.fit(train_data)

23/12/17 12:08:16 WARN Instrumentation: [4f3ed6f1] regParam is zero, which might cause numerical instability and overfitting.


In [34]:
# Fitted weights, one per assembled input column (Age, Experience — in that order).
lin_reg_fit.coefficients

DenseVector([-20.0, 160.0])

In [35]:
# Fitted intercept (constant) term of the model.
lin_reg_fit.intercept

3980.000000000845

In [37]:
# Evaluate the fitted model on the held-out split. The returned object exposes
# the scored rows via `.predictions` (the input columns plus a `prediction`
# column) and is reused by the metrics cell that follows.
pred = lin_reg_fit.evaluate(test_data)
pred.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|         [50.0,16.0]|  5600|5539.999999999936|
|          [34.0,5.0]|  3800|4100.000000000142|
+--------------------+------+-----------------+



In [38]:
# Mean squared error and R² of the evaluation on test_data, shown together as a tuple.
# NOTE(review): the displayed test split holds only two rows, so treat these numbers as illustrative.
pred.meanSquaredError, pred.r2

(46800.000000046384, 0.942222222222165)

In [39]:
# Shut down the Spark session started in the first cell.
spark.stop()