In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create SparkSession

In [2]:
spark = (SparkSession.builder
                     .appName("MLLib")
                     .getOrCreate())

23/07/12 19:27:02 WARN Utils: Your hostname, myThinkPad resolves to a loopback address: 127.0.1.1; using 192.168.29.222 instead (on interface wlp3s0)
23/07/12 19:27:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/12 19:27:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/12 19:27:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/07/12 19:27:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/07/12 19:27:04 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/07/12 19:27:04 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [3]:
path = 'Data_Science_Bootcamp/Regression_Algorithms/Simple_Linear_Regression/Student_Grades_Data.csv'


# Read in the student data
df = spark.read.csv(path, header=True, inferSchema=True)
print('-----Read the raw csv--------')
df.limit(5).show(truncate=False)
df.printSchema()




-----Read the raw csv--------
+-------------+------+
|Time_to_Study|Grades|
+-------------+------+
|1            |1.5   |
|5            |2.7   |
|7            |3.1   |
|3            |2.1   |
|2            |1.8   |
+-------------+------+

root
 |-- Time_to_Study: integer (nullable = true)
 |-- Grades: double (nullable = true)



# Create a Feature array by omitting the last column

In [4]:

feature_cols = df.columns[:-1] 
print(feature_cols)

from pyspark.ml.feature import VectorAssembler
vect_assembler = VectorAssembler(inputCols=feature_cols,outputCol="features")
print(vect_assembler)
print('***********************************************')
#Utilize Assembler created above in order to add the feature column
data_w_features = vect_assembler.transform(df)
data_w_features.show()

['Time_to_Study']
VectorAssembler_32038c50123c
***********************************************
+-------------+------+--------+
|Time_to_Study|Grades|features|
+-------------+------+--------+
|            1|   1.5|   [1.0]|
|            5|   2.7|   [5.0]|
|            7|   3.1|   [7.0]|
|            3|   2.1|   [3.0]|
|            2|   1.8|   [2.0]|
|            9|   3.9|   [9.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|           11|   4.3|  [11.0]|
|            2|   1.8|   [2.0]|
|            4|   2.4|   [4.0]|
|            8|   3.5|   [8.0]|
|           13|   4.8|  [13.0]|
|            9|   3.9|   [9.0]|
|           14|   5.0|  [14.0]|
|           10|   4.1|  [10.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|            1|   1.5|   [1.0]|
|            4|   2.4|   [4.0]|
+-------------+------+--------+
only showing top 20 rows



Clean the Data

In [5]:
## Display the data having additional column named features. Had it been multiple linear regression problem, 
## you could see all the independent variable values combined in one list
data_w_features.show(5)

+-------------+------+--------+
|Time_to_Study|Grades|features|
+-------------+------+--------+
|            1|   1.5|   [1.0]|
|            5|   2.7|   [5.0]|
|            7|   3.1|   [7.0]|
|            3|   2.1|   [3.0]|
|            2|   1.8|   [2.0]|
+-------------+------+--------+
only showing top 5 rows



In [6]:
#Select only Features and Label from previous dataset as we need these two entities for building machine learning model

finalized_data = data_w_features.select("features","Grades")
finalized_data.show()

+--------+------+
|features|Grades|
+--------+------+
|   [1.0]|   1.5|
|   [5.0]|   2.7|
|   [7.0]|   3.1|
|   [3.0]|   2.1|
|   [2.0]|   1.8|
|   [9.0]|   3.9|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|  [11.0]|   4.3|
|   [2.0]|   1.8|
|   [4.0]|   2.4|
|   [8.0]|   3.5|
|  [13.0]|   4.8|
|   [9.0]|   3.9|
|  [14.0]|   5.0|
|  [10.0]|   4.1|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|   [1.0]|   1.5|
|   [4.0]|   2.4|
+--------+------+
only showing top 20 rows



# Split the data into train and test dataset

In [7]:
#Split the data into training and test model with 70% obs. going in training and 30% in testing
train_dataset, test_dataset = finalized_data.randomSplit([0.8, 0.2])
train_dataset.describe().show()
test_dataset.describe().show()

+-------+------------------+
|summary|            Grades|
+-------+------------------+
|  count|                40|
|   mean|            3.2875|
| stddev|1.1216259946144578|
|    min|               1.5|
|    max|               5.0|
+-------+------------------+

+-------+------------------+
|summary|            Grades|
+-------+------------------+
|  count|                10|
|   mean|              2.96|
| stddev|1.0479609831583532|
|    min|               1.8|
|    max|               4.8|
+-------+------------------+



# Perform Linear Regression

In [8]:
#Import Linear Regression class called LinearRegression
from pyspark.ml.regression import LinearRegression

#Create the Linear Regression object named having feature column as features and Label column as Time_to_Study
LinReg = LinearRegression(featuresCol="features", labelCol="Grades")

#Train the model on the training using fit() method.
model = LinReg.fit(train_dataset)

#Predict the Grades using the evulate method
pred = model.evaluate(test_dataset)

#Show the predicted Grade values along side actual Grade values
pred.predictions.show()

23/07/10 10:30:04 WARN Instrumentation: [69e2434a] regParam is zero, which might cause numerical instability and overfitting.
+--------+------+------------------+
|features|Grades|        prediction|
+--------+------+------------------+
|   [2.0]|   1.8| 1.828686653477401|
|   [2.0]|   1.8| 1.828686653477401|
|   [3.0]|   2.1| 2.101362045350784|
|   [4.0]|   2.4| 2.374037437224167|
|   [5.0]|   2.7|  2.64671282909755|
|   [6.0]|   2.9| 2.919388220970933|
|   [7.0]|   3.1|3.1920636128443163|
|   [8.0]|   3.5|3.4647390047176994|
|  [12.0]|   4.5| 4.555440572211232|
|  [13.0]|   4.8| 4.828115964084615|
+--------+------+------------------+



In [9]:
#Find out coefficient value
coefficient = model.coefficients
print ("The coefficient of the model is : %a" %coefficient)

#Find out intercept Value
intercept = model.intercept
print ("The Intercept of the model is : %f" %intercept)

#Evaluate the model using metric like Mean Absolute Error(MAE), Root Mean Square Error(RMSE) and R-Square
from pyspark.ml.evaluation import RegressionEvaluator
evaluation = RegressionEvaluator(labelCol="Grades", predictionCol="prediction")
print(evaluation)

# Root Mean Square Error
rmse = evaluation.evaluate(pred.predictions, {evaluation.metricName: "rmse"})
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = evaluation.evaluate(pred.predictions, {evaluation.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluation.evaluate(pred.predictions, {evaluation.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluation.evaluate(pred.predictions, {evaluation.metricName: "r2"})
print("r2: %.3f" %r2)

The coefficient of the model is : DenseVector([0.2727])
The Intercept of the model is : 1.283336
RegressionEvaluator_6d1fd336fcfa
RMSE: 0.044
MSE: 0.002
MAE: 0.037
r2: 0.998


In [10]:
#Create Unlabeled dataset  to contain only feature column
unlabeled_dataset = test_dataset.select('features')

#Display the content of unlabeled_dataset
unlabeled_dataset.show()

+--------+
|features|
+--------+
|   [2.0]|
|   [2.0]|
|   [3.0]|
|   [4.0]|
|   [5.0]|
|   [6.0]|
|   [7.0]|
|   [8.0]|
|  [12.0]|
|  [13.0]|
+--------+



In [11]:
#Predict the model output for fresh & unseen test data using transform() method
new_predictions = model.transform(unlabeled_dataset)

In [12]:
#Display the new prediction values
new_predictions.show()

+--------+------------------+
|features|        prediction|
+--------+------------------+
|   [2.0]| 1.828686653477401|
|   [2.0]| 1.828686653477401|
|   [3.0]| 2.101362045350784|
|   [4.0]| 2.374037437224167|
|   [5.0]|  2.64671282909755|
|   [6.0]| 2.919388220970933|
|   [7.0]|3.1920636128443163|
|   [8.0]|3.4647390047176994|
|  [12.0]| 4.555440572211232|
|  [13.0]| 4.828115964084615|
+--------+------------------+



In [17]:
Predict_Grade = LinReg.prediction(10)
help(LinearRegression)

AttributeError: 'LinearRegression' object has no attribute 'prediction'

In [18]:
help(LinReg)

Help on LinearRegression in module pyspark.ml.regression object:

class LinearRegression(_JavaRegressor, _LinearRegressionParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
 |  LinearRegression(*, featuresCol: str = 'features', labelCol: str = 'label', predictionCol: str = 'prediction', maxIter: int = 100, regParam: float = 0.0, elasticNetParam: float = 0.0, tol: float = 1e-06, fitIntercept: bool = True, standardization: bool = True, solver: str = 'auto', weightCol: Optional[str] = None, aggregationDepth: int = 2, loss: str = 'squaredError', epsilon: float = 1.35, maxBlockSizeInMB: float = 0.0)
 |  
 |  Linear regression.
 |  
 |  The learning objective is to minimize the specified loss function, with regularization.
 |  This supports two kinds of loss:
 |  
 |  * squaredError (a.k.a squared loss)
 |  * huber (a hybrid of squared error for relatively small errors and absolute error for     relatively large ones, and we estimate the scale parameter from training dat