## Setting up Colab

In [1]:
# Install Java 8 (headless JDK); the apt-get output is discarded
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Spark 3.0.0 built for Hadoop 3.2 (change the version number if needed;
# the tar command and SPARK_HOME below must then be updated to the same name)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# Extract the Spark archive into the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# Point JAVA_HOME and SPARK_HOME at the installs made above.
# NOTE(review): SPARK_HOME assumes the archive was extracted under /content
# (i.e. the notebook's working directory is /content) — confirm.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# Install findspark using pip (quiet mode)
!pip install -q findspark

In [2]:
# Initialise findspark before importing pyspark in the following cell.
# NOTE(review): presumably this locates Spark through the SPARK_HOME variable
# set in the setup cell — confirm against the findspark documentation.
import findspark
findspark.init()

In [3]:
# Create a PySpark session
from collections.abc import MutableMapping  # NOTE(review): not referenced in the visible cells — confirm whether it is still needed
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) a session whose master is "local[*]"
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Disabled leftover example that downloads and shows a different dataset
# (RAW_recipes_cleaned.csv); it is not used by the rest of this notebook.
# !wget http://raw-recipes-clean-upgrad.s3.amazonaws.com/RAW_recipes_cleaned.csv

# df = spark.read.csv('RAW_recipes_cleaned.csv', inferSchema = True, header = True)
# df.show()

In [4]:
# Upload Iris.csv from the local system to the remote Colab runtime.
# files.upload() is interactive; as the cell output shows, it saves the chosen
# file and returns a dict mapping each file name to its raw bytes.
from google.colab import files
files.upload()

Saving Iris.csv to Iris.csv


{'Iris.csv': b'Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species\n1,5.1,3.5,1.4,0.2,Iris-setosa\n2,4.9,3.0,1.4,0.2,Iris-setosa\n3,4.7,3.2,1.3,0.2,Iris-setosa\n4,4.6,3.1,1.5,0.2,Iris-setosa\n5,5.0,3.6,1.4,0.2,Iris-setosa\n6,5.4,3.9,1.7,0.4,Iris-setosa\n7,4.6,3.4,1.4,0.3,Iris-setosa\n8,5.0,3.4,1.5,0.2,Iris-setosa\n9,4.4,2.9,1.4,0.2,Iris-setosa\n10,4.9,3.1,1.5,0.1,Iris-setosa\n11,5.4,3.7,1.5,0.2,Iris-setosa\n12,4.8,3.4,1.6,0.2,Iris-setosa\n13,4.8,3.0,1.4,0.1,Iris-setosa\n14,4.3,3.0,1.1,0.1,Iris-setosa\n15,5.8,4.0,1.2,0.2,Iris-setosa\n16,5.7,4.4,1.5,0.4,Iris-setosa\n17,5.4,3.9,1.3,0.4,Iris-setosa\n18,5.1,3.5,1.4,0.3,Iris-setosa\n19,5.7,3.8,1.7,0.3,Iris-setosa\n20,5.1,3.8,1.5,0.3,Iris-setosa\n21,5.4,3.4,1.7,0.2,Iris-setosa\n22,5.1,3.7,1.5,0.4,Iris-setosa\n23,4.6,3.6,1.0,0.2,Iris-setosa\n24,5.1,3.3,1.7,0.5,Iris-setosa\n25,4.8,3.4,1.9,0.2,Iris-setosa\n26,5.0,3.0,1.6,0.2,Iris-setosa\n27,5.0,3.4,1.6,0.4,Iris-setosa\n28,5.2,3.5,1.5,0.2,Iris-setosa\n29,5.2,3.4,1.4,0.2,Iris-setosa\n

## Reading Dataset

In [5]:
# Load the Iris.csv file uploaded in the previous step into a Spark DataFrame.
# header=True takes column names from the first row; inferSchema=True lets
# Spark infer the column types (see the printed schema in the next cell).
data = spark.read.csv('Iris.csv', header=True, inferSchema=True)

In [6]:
# Print the column names and inferred types of the loaded DataFrame
data.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [7]:
# Display the first few rows of the loaded data
data.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [8]:
# Encode the string label column (Species) into a numeric column named "label".
# NOTE: `data` is reassigned in place, so this cell is not idempotent — once the
# following cell has dropped Species, re-running this cell will fail because
# the input column no longer exists.
from pyspark.ml.feature import StringIndexer
label_indexer = StringIndexer(inputCol="Species", outputCol="label")
# fit() learns the Species -> index mapping; transform() appends the "label" column
data = label_indexer.fit(data).transform(data)

In [9]:
# Drop the original string label (Species) and the row identifier (Id), leaving
# the four measurement columns plus the numeric "label" column
data = data.drop('Species', 'Id')
data.show()

+-------------+------------+-------------+------------+-----+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|label|
+-------------+------------+-------------+------------+-----+
|          5.1|         3.5|          1.4|         0.2|  0.0|
|          4.9|         3.0|          1.4|         0.2|  0.0|
|          4.7|         3.2|          1.3|         0.2|  0.0|
|          4.6|         3.1|          1.5|         0.2|  0.0|
|          5.0|         3.6|          1.4|         0.2|  0.0|
|          5.4|         3.9|          1.7|         0.4|  0.0|
|          4.6|         3.4|          1.4|         0.3|  0.0|
|          5.0|         3.4|          1.5|         0.2|  0.0|
|          4.4|         2.9|          1.4|         0.2|  0.0|
|          4.9|         3.1|          1.5|         0.1|  0.0|
|          5.4|         3.7|          1.5|         0.2|  0.0|
|          4.8|         3.4|          1.6|         0.2|  0.0|
|          4.8|         3.0|          1.4|         0.1|  0.0|
|       

## Implementing a Random Forest classifier using the Spark MLlib library

In [10]:
from pyspark.ml.feature import VectorAssembler

# Use every column except the target ("label") as a model feature. Selecting by
# name, rather than slicing off the last column, keeps this correct even if the
# column order of `data` changes.
feature_cols = [col_name for col_name in data.columns if col_name != "label"]

# Assembler that packs the feature columns into a single vector column "features"
vect_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [11]:
# Apply the assembler created above to add the "features" vector column
data_w_features = vect_assembler.transform(data)

In [12]:
# Display the data with the additional "features" column, in which the four
# measurement values of each row are combined into one vector
data_w_features.show()

+-------------+------------+-------------+------------+-----+-----------------+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|label|         features|
+-------------+------------+-------------+------------+-----+-----------------+
|          5.1|         3.5|          1.4|         0.2|  0.0|[5.1,3.5,1.4,0.2]|
|          4.9|         3.0|          1.4|         0.2|  0.0|[4.9,3.0,1.4,0.2]|
|          4.7|         3.2|          1.3|         0.2|  0.0|[4.7,3.2,1.3,0.2]|
|          4.6|         3.1|          1.5|         0.2|  0.0|[4.6,3.1,1.5,0.2]|
|          5.0|         3.6|          1.4|         0.2|  0.0|[5.0,3.6,1.4,0.2]|
|          5.4|         3.9|          1.7|         0.4|  0.0|[5.4,3.9,1.7,0.4]|
|          4.6|         3.4|          1.4|         0.3|  0.0|[4.6,3.4,1.4,0.3]|
|          5.0|         3.4|          1.5|         0.2|  0.0|[5.0,3.4,1.5,0.2]|
|          4.4|         2.9|          1.4|         0.2|  0.0|[4.4,2.9,1.4,0.2]|
|          4.9|         3.1|          1.

In [13]:
# Keep only the "features" and "label" columns; these are the two columns the
# classifier is configured to read when the model is built
finalized_data = data_w_features.select("features","label")

# Preview the model-ready data
finalized_data.show()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
|[5.4,3.7,1.5,0.2]|  0.0|
|[4.8,3.4,1.6,0.2]|  0.0|
|[4.8,3.0,1.4,0.1]|  0.0|
|[4.3,3.0,1.1,0.1]|  0.0|
|[5.8,4.0,1.2,0.2]|  0.0|
|[5.7,4.4,1.5,0.4]|  0.0|
|[5.4,3.9,1.3,0.4]|  0.0|
|[5.1,3.5,1.4,0.3]|  0.0|
|[5.7,3.8,1.7,0.3]|  0.0|
|[5.1,3.8,1.5,0.3]|  0.0|
+-----------------+-----+
only showing top 20 rows



In [14]:
# Split the data into training and test sets, with roughly 70% of the rows going
# to training and 30% to testing. The fixed seed keeps the split repeatable when
# the notebook is re-run on the same data.
train_dataset, test_dataset = finalized_data.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Import the random forest classifier from Spark ML
from pyspark.ml.classification  import RandomForestClassifier

# Create the classifier, reading inputs from the "features" column and the
# target from the "label" column. An explicit seed makes training repeatable.
rand = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)

# Train the model on the training set using the fit() method.
model = rand.fit(train_dataset)

In [16]:
# Predict the class label of each test row; transform() returns the test data
# with the model's output columns (including "prediction") appended
pred = model.transform(test_dataset)

# Show the predicted labels alongside actual labels
pred.select("label", "prediction", "features").show()

+-----+----------+-----------------+
|label|prediction|         features|
+-----+----------+-----------------+
|  0.0|       0.0|[4.4,3.2,1.3,0.2]|
|  0.0|       0.0|[4.5,2.3,1.3,0.3]|
|  0.0|       0.0|[4.6,3.1,1.5,0.2]|
|  0.0|       0.0|[4.8,3.0,1.4,0.1]|
|  0.0|       0.0|[4.8,3.0,1.4,0.3]|
|  0.0|       0.0|[4.8,3.4,1.6,0.2]|
|  0.0|       0.0|[4.9,3.1,1.5,0.1]|
|  1.0|       1.0|[5.0,2.3,3.3,1.0]|
|  0.0|       0.0|[5.0,3.2,1.2,0.2]|
|  0.0|       0.0|[5.0,3.4,1.5,0.2]|
|  0.0|       0.0|[5.1,3.3,1.7,0.5]|
|  0.0|       0.0|[5.3,3.7,1.5,0.2]|
|  1.0|       1.0|[5.4,3.0,4.5,1.5]|
|  0.0|       0.0|[5.4,3.4,1.5,0.4]|
|  0.0|       0.0|[5.4,3.4,1.7,0.2]|
|  0.0|       0.0|[5.4,3.9,1.3,0.4]|
|  0.0|       0.0|[5.4,3.9,1.7,0.4]|
|  1.0|       1.0|[5.5,2.3,4.0,1.3]|
|  1.0|       1.0|[5.5,2.5,4.0,1.3]|
|  0.0|       0.0|[5.5,3.5,1.3,0.2]|
+-----+----------+-----------------+
only showing top 20 rows



In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# A single evaluator is reused for every metric; the metric is chosen per call
# by overriding metricName in the param map passed to evaluate().
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Score the test-set predictions: accuracy, weighted precision, weighted recall, F1
accuracy, precision, recall, f1 = (
    evaluator.evaluate(pred, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
)

# Report each metric rounded to three decimals
print("Accuracy: %.3f" % accuracy)
print("Precision: %.3f" % precision)
print("Recall: %.3f" % recall)
print("F1 Score: %.3f" % f1)

# ## Summary

# Print the textual description of the trained model (its trees and split rules)
print("Learned Random Tree Model:\n", model.toDebugString)

Accuracy: 0.962
Precision: 0.962
Recall: 0.962
F1 Score: 0.962
Learned Random Tree Model:
 RandomForestClassificationModel: uid=RandomForestClassifier_ef9ac7ba5b0c, numTrees=20, numClasses=3, numFeatures=4
  Tree 0 (weight 1.0):
    If (feature 3 <= 0.8)
     Predict: 0.0
    Else (feature 3 > 0.8)
     If (feature 3 <= 1.55)
      If (feature 2 <= 4.95)
       Predict: 1.0
      Else (feature 2 > 4.95)
       Predict: 2.0
     Else (feature 3 > 1.55)
      If (feature 2 <= 4.85)
       Predict: 1.0
      Else (feature 2 > 4.85)
       Predict: 2.0
  Tree 1 (weight 1.0):
    If (feature 2 <= 2.45)
     Predict: 0.0
    Else (feature 2 > 2.45)
     If (feature 1 <= 2.95)
      If (feature 3 <= 1.55)
       If (feature 1 <= 2.6500000000000004)
        If (feature 1 <= 2.55)
         Predict: 1.0
        Else (feature 1 > 2.55)
         Predict: 2.0
       Else (feature 1 > 2.6500000000000004)
        Predict: 1.0
      Else (feature 3 > 1.55)
       Predict: 2.0
     Else (feature 1 > 2.