In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Read the Iris CSV from DBFS: the first row supplies the column names and
# column types are inferred from the data instead of defaulting to string.
iris_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
df = iris_reader.csv("dbfs:/FileStore/IRIS.csv")
display(df)

sepal_length,sepal_width,petal_length,petal_width,species
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa


In [0]:
# Show the column names and inferred types of the loaded DataFrame.
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [0]:
# Encode the string label column "species" as a numeric column "speciesIndex",
# which is what the classifier consumes as its label.
species_indexer = StringIndexer(inputCol="species", outputCol="speciesIndex")
df_string_indexed = species_indexer.fit(df).transform(df)

# Pick the feature columns by name (everything except the label) instead of by
# position, so a reordered CSV cannot silently turn the label into a feature
# or drop a real feature.
features = [column for column in df.columns if column != "species"]

# Pack the individual feature columns into a single vector column "features".
vect_assembler = VectorAssembler(inputCols=features, outputCol="features")
data_vector_assembled = vect_assembler.transform(df_string_indexed)

# Keep only the two columns the model needs: the feature vector and the label.
final_data = data_vector_assembled.select("features", "speciesIndex")
display(final_data)

features,speciesIndex
"Map(vectorType -> dense, length -> 4, values -> List(5.1, 3.5, 1.4, 0.2))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.0, 1.4, 0.2))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(4.7, 3.2, 1.3, 0.2))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.1, 1.5, 0.2))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.6, 1.4, 0.2))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.9, 1.7, 0.4))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.4, 1.4, 0.3))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.4, 1.5, 0.2))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(4.4, 2.9, 1.4, 0.2))",0.0
"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.1, 1.5, 0.1))",0.0


In [0]:
# Single seed shared by every stochastic step in this cell so the split and
# the fitted forest are repeatable across kernel restarts.
SEED = 42

# 80/20 train/test split of the assembled data.
train_dataset, test_dataset = final_data.randomSplit([0.8, 0.2], seed=SEED)

# Random forests are randomized (bootstrap sampling and feature subsetting);
# without an explicit seed the fitted model depends on PySpark's default seed.
rf = RandomForestClassifier(
    labelCol="speciesIndex",
    featuresCol="features",
    seed=SEED,
)
model = rf.fit(train_dataset)

In [0]:
# Score the held-out rows with the fitted model.
predictions = model.transform(test_dataset)

# Display only the predicted class index for each test row.
predicted_classes = predictions.select("prediction")
display(predicted_classes)

prediction
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
