<a href="https://colab.research.google.com/github/Psancs05/dataset/blob/main/SparkApplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and configuring Spark


In [None]:
# Refresh the apt package index (-qq keeps the output minimal).
!apt-get update -qq

In [None]:
# Install the headless OpenJDK 8 JDK; stdout is discarded to keep the cell output short.
# NOTE(review): JAVA_HOME is later set to /usr/lib/jvm/java-8-openjdk-amd64, presumably
# the location this package installs to - confirm on the target image.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
!wget -q https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz

In [None]:
# Unpack the downloaded Spark archive into the current working directory.
# NOTE(review): SPARK_HOME is later set to /content/spark-3.2.0-bin-hadoop3.2, so this
# assumes the working directory is /content (the Colab default) - confirm elsewhere.
!tar xf /content/spark-3.2.0-bin-hadoop3.2.tgz

In [None]:
# Install findspark quietly; it is imported and initialised further down.
!pip install -q findspark

In [None]:
import os

# Tell Spark where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.0-bin-hadoop3.2",
    }
)

In [None]:
# Initialise findspark so that the pyspark imports in the following cells can succeed.
# NOTE(review): presumably relies on the SPARK_HOME environment variable set above.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session named "Colab" with a "local" master,
# and set the Spark UI port to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [None]:
# Display the session object as the cell output to confirm it was created.
spark

## Random Forest Classifier Application

PySpark and ML dependencies

In [None]:
import numpy as np

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, HashingTF, Tokenizer
from pyspark.ml.feature import VectorAssembler

In [None]:
# Download the Iris CSV from the GitHub repository into the current working directory.
!wget -q https://raw.githubusercontent.com/Psancs05/dataset/main/Iris.csv


Understanding the data

In [None]:
# Load the CSV, taking column names from the header row and letting Spark infer the types.
df_raw = spark.read.csv(
    "Iris.csv",
    header=True,
    inferSchema=True,
)

# Preview the first rows; the row count is left as the cell's displayed value.
df_raw.show(5)
df_raw.count()

In [None]:
# Keep only the label column and the four measurement columns.
selected_columns = [
    "Species",
    "SepalLengthCm",
    "SepalWidthCm",
    "PetalLengthCm",
    "PetalWidthCm",
]
df_raw = df_raw.select(*selected_columns)

# The label column is called "index" from here on.
df = df_raw.withColumnRenamed("Species", "index")
df.show(5)

In [None]:
# Combine the four measurement columns into a single vector column named "features".
featureCols = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
]
assembler = VectorAssembler(
    inputCols=featureCols,
    outputCol="features",
)
df2 = assembler.transform(df)
df2.show(5)

In [None]:
# List the distinct values of the label column.
print('Unique species')
unique_species = df2.select('index').distinct()
unique_species.show()

Split data into train and test

In [None]:
# Report the size of the full dataset before splitting.
print("Total DF: ", df2.count())

# 80/20 random split with a fixed seed.
splits = df2.randomSplit(weights=[0.8, 0.2], seed=200)
train, test = splits

print("Ejemplos usados para entrenar: ", train.count())
print("Ejemplos usados para test: ", test.count())

# Replace null values with empty strings in both splits.
train, test = train.na.fill(""), test.na.fill("")

Data preprocessing and model

In [None]:
# Fit the label indexer on the full dataset (df2): it maps the string column
# "index" to the numeric column "indexedLabel".
indexer_stage = StringIndexer(inputCol="index", outputCol="indexedLabel")
labelIndexer = indexer_stage.fit(df2)

# Random forest classifier with 10 trees over the assembled feature vector.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    numTrees=10,
)

# Two-stage pipeline: index the labels, then train/apply the forest.
pipeline = Pipeline(stages=[labelIndexer, rf])

In [None]:
# Fit the pipeline on the training split to obtain the trained model.
model = pipeline.fit(train)

In [None]:
# Show up to 20 rows of the test split.
test.show(20)

Predictions

In [None]:
# Apply the fitted pipeline to the test split.
predictionsDf = model.transform(test)

# Summary statistics of the true label index next to the predicted one.
label_vs_prediction = predictionsDf.select("indexedLabel", "prediction")
label_vs_prediction.describe().show()

In [None]:
# Show the first 10 rows of the prediction DataFrame.
predictionsDf.show(10)

In [None]:
# Show which numeric label index corresponds to which species name.
print('Label-value relationship')
label_mapping = predictionsDf.select('index', 'indexedLabel').distinct()
label_mapping.show()

Model evaluation

In [None]:
# MulticlassClassificationEvaluator defaults to the F1 metric, so the metric
# must be set explicitly for the printed "Accuracy" / "Test Error" figures to
# really be accuracy-based.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictionsDf)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

In [None]:
# Expose the predictions to Spark SQL under the name "Predictions".
predictionsDf.createOrReplaceTempView("Predictions")


def count_predictions(condition):
    """Return a one-row DataFrame whose `cnt` column counts the rows of the
    `Predictions` view that satisfy the SQL boolean expression `condition`."""
    return spark.sql("SELECT count(*) AS cnt FROM Predictions WHERE " + condition)


# NOTE(review): the literals 0/1/2 assume the label indexer mapped
# setosa -> 0, versicolor -> 1, virginica -> 2; verify against the
# label-value relationship table printed earlier.

# Correct predictions: predicted index equals the true label index.
true_setosa = count_predictions("prediction = 0 AND indexedLabel = 0")
true_versicolor = count_predictions("prediction = 1 AND indexedLabel = 1")
true_virginica = count_predictions("prediction = 2 AND indexedLabel = 2")

# Wrong predictions: the predicted index belongs to a different true label.
false_setosa = count_predictions("prediction = 0 AND (indexedLabel = 1 OR indexedLabel = 2)")
false_versicolor = count_predictions("prediction = 1 AND (indexedLabel = 0 OR indexedLabel = 2)")
false_virginica = count_predictions("prediction = 2 AND (indexedLabel = 1 OR indexedLabel = 0)")

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

labels = ['true setosa', 'true versicolor', 'true virginica', 'false setosa', 'false versicolor', 'false virginica']
sizes = [np.array(true_setosa.select('cnt').collect()), np.array(true_versicolor.select('cnt').collect()),\
         np.array(true_virginica.select('cnt').collect()), np.array(false_setosa.select('cnt').collect()), \
         np.array(false_versicolor.select('cnt').collect()), np.array(false_virginica.select('cnt').collect())]
colors = ['blue', 'red', 'green', 'yellow', 'grey', 'pink']
plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors)
plt.axis('equal')