In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *


## create spark session
to use spark functionalities, we need to create spark session

In [43]:
sp = SparkSession.builder.appName('spark').getOrCreate()

## Read files
to read files we can use read method of the session we created


In [44]:
df = sp.read.csv('iris.csv')
df.show()

+---+-------------+------------+-------------+------------+-----------+
|_c0|          _c1|         _c2|          _c3|         _c4|        _c5|
+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-

to know strcuture and types of the dataset , we can use print schema

In [45]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



### properly loading the file
plain read method loads, but headers wont be handled, as you can see headers are counted as entry points. And types of data points are String by default.
To handle header, we can defin argument <b>header</b> as true, to load with proper datatype we need  <b>inferSchema</b>

In [46]:
df = sp.read.csv('iris.csv',header=True,inferSchema=True)

In [47]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



### using custom schema

In [48]:
schema = StructType([
    StructField('Id',IntegerType()),
    StructField('SepalLengthCm',DoubleType()),
    StructField('SepalWidthCm',DoubleType()),
    StructField('PetalLengthCm',DoubleType()),
    StructField('PetalWidthCm',DoubleType()),
    StructField('Species',StringType())
])

In [49]:
df = sp.read.csv('iris.csv',header=True,schema=schema)

In [50]:
df.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [51]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



## we can do ML using pyspark
we can choose our independent variables (feautures) and or target using vector assembler.

In [52]:
from pyspark.ml.feature import VectorAssembler
inputCol = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']
vectorizer = VectorAssembler( inputCols=inputCol,
                                outputCol='features')
df = vectorizer.transform(df)


In [53]:
df.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|
+---+-------------+------------+-------------+------------+-----------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|  9|     

#### as you can see, my feautures are in the form of array like object. still, I need to encode labels, because they are still in string format.

### preprocessing

In [55]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='Species',outputCol='target')

In [56]:
df = indexer.fit(df).transform(df)

In [57]:
df.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|target|
+---+-------------+------------+-------------+------------+-----------+-----------------+------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|   0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|   0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|   0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|   0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|   0.0|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|   0.0|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|   0.0|
|  8|          5.0|         3.

### split into test and train
we can easily split into train and test using randomSplit method of dataframe object.

In [58]:
df_train,df_test = df.randomSplit([0.7,0.3],seed=1)

In [60]:
df_test.show()

+---+-------------+------------+-------------+------------+---------------+-----------------+------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|        Species|         features|target|
+---+-------------+------------+-------------+------------+---------------+-----------------+------+
|  5|          5.0|         3.6|          1.4|         0.2|    Iris-setosa|[5.0,3.6,1.4,0.2]|   0.0|
| 11|          5.4|         3.7|          1.5|         0.2|    Iris-setosa|[5.4,3.7,1.5,0.2]|   0.0|
| 17|          5.4|         3.9|          1.3|         0.4|    Iris-setosa|[5.4,3.9,1.3,0.4]|   0.0|
| 19|          5.7|         3.8|          1.7|         0.3|    Iris-setosa|[5.7,3.8,1.7,0.3]|   0.0|
| 21|          5.4|         3.4|          1.7|         0.2|    Iris-setosa|[5.4,3.4,1.7,0.2]|   0.0|
| 25|          4.8|         3.4|          1.9|         0.2|    Iris-setosa|[4.8,3.4,1.9,0.2]|   0.0|
| 29|          5.2|         3.4|          1.4|         0.2|    Iris-setosa|[5.2,3.4,1.4,0.2

## create a model - fit - predict - evaluate

In [63]:
from pyspark.ml.classification import RandomForestClassifier

In [66]:
rf_clf = RandomForestClassifier(featuresCol='features',labelCol='target')

In [68]:
rf_clf = rf_clf.fit(df_train)

In [69]:
df_test = rf_clf.transform(df_test)

In [70]:
df_test.show()

+---+-------------+------------+-------------+------------+---------------+-----------------+------+--------------------+--------------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|        Species|         features|target|       rawPrediction|         probability|prediction|
+---+-------------+------------+-------------+------------+---------------+-----------------+------+--------------------+--------------------+----------+
|  5|          5.0|         3.6|          1.4|         0.2|    Iris-setosa|[5.0,3.6,1.4,0.2]|   0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
| 11|          5.4|         3.7|          1.5|         0.2|    Iris-setosa|[5.4,3.7,1.5,0.2]|   0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
| 17|          5.4|         3.9|          1.3|         0.4|    Iris-setosa|[5.4,3.9,1.3,0.4]|   0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
| 19|          5.7|         3.8|          1.7|         0.3|    Iris-setosa|[

In [73]:
df_test.select('Species','features','target','probability','prediction').show()

+---------------+-----------------+------+--------------------+----------+
|        Species|         features|target|         probability|prediction|
+---------------+-----------------+------+--------------------+----------+
|    Iris-setosa|[5.0,3.6,1.4,0.2]|   0.0|       [1.0,0.0,0.0]|       0.0|
|    Iris-setosa|[5.4,3.7,1.5,0.2]|   0.0|       [1.0,0.0,0.0]|       0.0|
|    Iris-setosa|[5.4,3.9,1.3,0.4]|   0.0|       [1.0,0.0,0.0]|       0.0|
|    Iris-setosa|[5.7,3.8,1.7,0.3]|   0.0|     [0.85,0.15,0.0]|       0.0|
|    Iris-setosa|[5.4,3.4,1.7,0.2]|   0.0|     [0.95,0.05,0.0]|       0.0|
|    Iris-setosa|[4.8,3.4,1.9,0.2]|   0.0|     [0.8,0.15,0.05]|       0.0|
|    Iris-setosa|[5.2,3.4,1.4,0.2]|   0.0|       [1.0,0.0,0.0]|       0.0|
|    Iris-setosa|[5.2,4.1,1.5,0.1]|   0.0|       [1.0,0.0,0.0]|       0.0|
|    Iris-setosa|[5.5,3.5,1.3,0.2]|   0.0|       [1.0,0.0,0.0]|       0.0|
|    Iris-setosa|[4.4,3.0,1.3,0.2]|   0.0|       [1.0,0.0,0.0]|       0.0|
|    Iris-setosa|[5.1,3.4

### evaluate model preformance

In [75]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
criterion = MulticlassClassificationEvaluator(labelCol='target')
acc = criterion.evaluate(df_test)

In [76]:
acc

0.9559343434343436