## Setting up Colab

In [1]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# set your spark folder to your system path environment.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# install findspark using pip
!pip install -q findspark

In [2]:
# Initialise findspark before importing pyspark.
# NOTE(review): presumably this locates Spark through the SPARK_HOME
# environment variable set in the setup cell - confirm.
import findspark
findspark.init()

In [3]:
# Build the SparkSession used by every later cell. getOrCreate() hands back
# an already-running session if there is one, otherwise it starts a new one
# with the "local[*]" master.
from collections.abc import MutableMapping  # not referenced in the cells shown here
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Disabled alternative data source (recipes dataset), kept for reference:
# !wget http://raw-recipes-clean-upgrad.s3.amazonaws.com/RAW_recipes_cleaned.csv

# df = spark.read.csv('RAW_recipes_cleaned.csv', inferSchema = True, header = True)
# df.show()

In [4]:
# Upload Student_Grades_Data.csv from the local machine to the Colab runtime.
# files.upload() is the last expression in the cell, so its return value
# (a dict mapping each uploaded file name to its raw bytes) is echoed in the
# cell output.
from google.colab import files
files.upload()

Saving Student_Grades_Data.csv to Student_Grades_Data.csv


{'Student_Grades_Data.csv': b'Time_to_Study,Grades\n1,1.5\n5,2.7\n7,3.1\n3,2.1\n2,1.8\n9,3.9\n6,2.9\n12,4.5\n11,4.3\n2,1.8\n4,2.4\n8,3.5\n13,4.8\n9,3.9\n14,5\n10,4.1\n6,2.9\n12,4.5\n1,1.5\n4,2.4\n14,5\n10,4.1\n11,4.3\n4,2.4\n5,2.7\n8,3.5\n1,1.5\n2,1.8\n3,2.1\n7,3.1\n8,3.5\n14,5\n7,3.1\n8,3.5\n1,1.5\n2,1.8\n3,2.1\n4,2.4\n5,2.7\n6,2.9\n7,3.1\n8,3.5\n9,3.9\n10,4.1\n11,4.3\n12,4.5\n13,4.8\n14,5\n8,3.5\n2,1.8\n'}

## Reading Dataset

In [5]:
# Read the CSV uploaded in the previous step into a Spark DataFrame.
# The first line of the file is treated as the header row and the column
# types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Student_Grades_Data.csv')
)

In [6]:
# Print the column names together with the types inferred while reading the CSV
data.printSchema()

root
 |-- Time_to_Study: integer (nullable = true)
 |-- Grades: double (nullable = true)



In [7]:
# Display the first rows of the data (the output is cut off after 20 rows)
data.show()

+-------------+------+
|Time_to_Study|Grades|
+-------------+------+
|            1|   1.5|
|            5|   2.7|
|            7|   3.1|
|            3|   2.1|
|            2|   1.8|
|            9|   3.9|
|            6|   2.9|
|           12|   4.5|
|           11|   4.3|
|            2|   1.8|
|            4|   2.4|
|            8|   3.5|
|           13|   4.8|
|            9|   3.9|
|           14|   5.0|
|           10|   4.1|
|            6|   2.9|
|           12|   4.5|
|            1|   1.5|
|            4|   2.4|
+-------------+------+
only showing top 20 rows



## Implementing regression using the Spark MLlib library (decision tree regressor)

In [8]:
from pyspark.ml.feature import VectorAssembler

# Every column except the label ("Grades") is used as a feature. The label is
# excluded by name rather than by position, so the selection stays correct
# even if the label is not the last column of the CSV.
feature_cols = [name for name in data.columns if name != "Grades"]

# The assembler packs the feature columns into a single vector column named
# "features".
vect_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [9]:
# Apply the assembler created above: the result is the original data plus a
# new "features" vector column
data_w_features = vect_assembler.transform(data)

In [10]:

# Display the data with the additional "features" column. With a single
# independent variable each vector holds one value; with several independent
# variables, all of their values would be combined into the one vector.
data_w_features.show()

+-------------+------+--------+
|Time_to_Study|Grades|features|
+-------------+------+--------+
|            1|   1.5|   [1.0]|
|            5|   2.7|   [5.0]|
|            7|   3.1|   [7.0]|
|            3|   2.1|   [3.0]|
|            2|   1.8|   [2.0]|
|            9|   3.9|   [9.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|           11|   4.3|  [11.0]|
|            2|   1.8|   [2.0]|
|            4|   2.4|   [4.0]|
|            8|   3.5|   [8.0]|
|           13|   4.8|  [13.0]|
|            9|   3.9|   [9.0]|
|           14|   5.0|  [14.0]|
|           10|   4.1|  [10.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|            1|   1.5|   [1.0]|
|            4|   2.4|   [4.0]|
+-------------+------+--------+
only showing top 20 rows



In [11]:
# Keep only the feature vector and the label ("Grades"); these are the two
# columns needed to build the machine learning model
finalized_data = data_w_features.select("features","Grades")

# Preview the reduced dataset
finalized_data.show()

+--------+------+
|features|Grades|
+--------+------+
|   [1.0]|   1.5|
|   [5.0]|   2.7|
|   [7.0]|   3.1|
|   [3.0]|   2.1|
|   [2.0]|   1.8|
|   [9.0]|   3.9|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|  [11.0]|   4.3|
|   [2.0]|   1.8|
|   [4.0]|   2.4|
|   [8.0]|   3.5|
|  [13.0]|   4.8|
|   [9.0]|   3.9|
|  [14.0]|   5.0|
|  [10.0]|   4.1|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|   [1.0]|   1.5|
|   [4.0]|   2.4|
+--------+------+
only showing top 20 rows



In [12]:
# Split the data into training and test sets, with roughly 70% of the
# observations going to training and 30% to testing. The fixed seed makes the
# split (and therefore the metrics reported below) repeatable across runs.
train_dataset, test_dataset = finalized_data.randomSplit([0.7, 0.3], seed=42)

In [13]:
# Import the regressor class used here: DecisionTreeRegressor
from pyspark.ml.regression import DecisionTreeRegressor

# Create the decision tree regressor with "features" as the feature column
# and "Grades" as the label column
DecReg = DecisionTreeRegressor(featuresCol="features", labelCol="Grades")

# Train the model on the training dataset using the fit() method
model = DecReg.fit(train_dataset)

In [16]:
# Predict the Grades for the test dataset using the model's transform() method,
# which adds a "prediction" column
pred = model.transform(test_dataset)

# Show the predicted Grade values alongside actual Grade values
pred.select("Grades", "prediction").show()

+------+-----------------+
|Grades|       prediction|
+------+-----------------+
|   1.5|              1.5|
|   1.5|              1.5|
|   1.8|              1.8|
|   2.1|              2.1|
|   2.4|              2.4|
|   2.9|              2.9|
|   4.1|3.900000000000001|
|   4.1|3.900000000000001|
|   4.1|3.900000000000001|
|   4.3|              4.3|
|   4.5|4.500000000000001|
|   5.0|5.000000000000001|
+------+-----------------+



In [20]:
# Score the predictions against the true "Grades" values with four metrics.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="Grades", predictionCol="prediction")

# One evaluator is reused for every metric by overriding metricName per call:
#   rmse - root mean square error
#   mse  - mean square error
#   mae  - mean absolute error
#   r2   - coefficient of determination
rmse, mse, mae, r2 = (
    evaluator.evaluate(pred, {evaluator.metricName: metric})
    for metric in ("rmse", "mse", "mae", "r2")
)

print("RMSE: %.3f" % rmse)
print("MSE: %.3f" % mse)
print("MAE: %.3f" % mae)
print("r2: %.3f" % r2)

RMSE: 0.100
MSE: 0.010
MAE: 0.050
r2: 0.993
