In [1]:
import os

from pyspark.sql import SparkSession

# Location of the iris CSV. Set the IRIS_CSV_PATH environment variable to point
# at another copy of the file; when it is unset the original local path is used.
filepath = os.environ.get(
    'IRIS_CSV_PATH',
    '/Users/williamtun/Documents/Code/DataEngineer/pyspark/iris.csv',
)

# getOrCreate() returns the already-running session if there is one, so this
# cell can be re-run safely.
# NOTE(review): "spark.some.config.option" looks like a documentation
# placeholder rather than a real setting -- confirm and delete if unused.
spark = (
    SparkSession.builder
    .appName("Python Spark create RDD example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Use the built-in CSV reader: the first row supplies the column names and the
# column types are inferred from the data. The header option is given once.
df = spark.read.csv(filepath, header=True, inferSchema=True)

# Preview the first five rows.
df.show(5)

+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [5]:
# Replace the dotted CSV header names with underscore names; the feature
# transformers in the following cells refer to the underscore form.
column_renames = {
    "sepal.length": "sepal_length",
    "sepal.width": "sepal_width",
    "petal.length": "petal_length",
    "petal.width": "petal_width",
}
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

# Binarizer (binary thresholding)

In [10]:
from pyspark.ml.feature import Binarizer

In [15]:
# Configure a threshold transformer: reads `sepal_length`, writes
# `binarized_feature`, with the cut-off at 4.8.
binarizer = Binarizer(
    threshold=4.8,
    inputCol="sepal_length",
    outputCol="binarized_feature",
)

In [16]:
# Apply the configured Binarizer to the iris frame; the result carries the
# extra `binarized_feature` column.
binarizedDataFrame = binarizer.transform(dataset=df)

In [17]:
# Report the cut-off currently configured on the Binarizer.
print(
    "Binarizer output with Threshold = %f"
    % (binarizer.getThreshold(),)
)

Binarizer output with Threshold = 4.800000


In [18]:
# Preview the binarized frame (20 rows is show()'s default, spelled out here).
binarizedDataFrame.show(n=20)

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|binarized_feature|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|              1.0|
|         4.9|        3.0|         1.4|        0.2| Setosa|              1.0|
|         4.7|        3.2|         1.3|        0.2| Setosa|              0.0|
|         4.6|        3.1|         1.5|        0.2| Setosa|              0.0|
|         5.0|        3.6|         1.4|        0.2| Setosa|              1.0|
|         5.4|        3.9|         1.7|        0.4| Setosa|              1.0|
|         4.6|        3.4|         1.4|        0.3| Setosa|              0.0|
|         5.0|        3.4|         1.5|        0.2| Setosa|              1.0|
|         4.4|        2.9|         1.4|        0.2| Setosa|              0.0|
|         4.9|        3.1|         1.5|        0.1| Setosa|     

# Buckets - Auto QuantileDiscretizer

In [2]:
from pyspark.ml.feature import QuantileDiscretizer

In [6]:
# Estimator that assigns `sepal_length` to one of five buckets, written to the
# `buckets` column. The bucket boundaries are determined when it is fitted.
qds = QuantileDiscretizer(
    numBuckets=5,
    inputCol="sepal_length",
    outputCol="buckets",
    relativeError=0.01,
    handleInvalid="error",
)

In [7]:
# Fit the discretizer on the iris frame to obtain the fitted bucketing model.
bucketizer = qds.fit(dataset=df)

In [8]:
# Apply the fitted model and preview the rows with their `buckets` value.
quantile_bucketed = bucketizer.transform(df)
quantile_bucketed.show()

+------------+-----------+------------+-----------+-------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|buckets|
+------------+-----------+------------+-----------+-------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|    1.0|
|         4.9|        3.0|         1.4|        0.2| Setosa|    0.0|
|         4.7|        3.2|         1.3|        0.2| Setosa|    0.0|
|         4.6|        3.1|         1.5|        0.2| Setosa|    0.0|
|         5.0|        3.6|         1.4|        0.2| Setosa|    1.0|
|         5.4|        3.9|         1.7|        0.4| Setosa|    1.0|
|         4.6|        3.4|         1.4|        0.3| Setosa|    0.0|
|         5.0|        3.4|         1.5|        0.2| Setosa|    1.0|
|         4.4|        2.9|         1.4|        0.2| Setosa|    0.0|
|         4.9|        3.1|         1.5|        0.1| Setosa|    0.0|
|         5.4|        3.7|         1.5|        0.2| Setosa|    1.0|
|         4.8|        3.4|         1.6|        0

# Buckets - set thresholds manually

In [20]:
from pyspark.ml.feature import Bucketizer

In [21]:
# Hand-picked bucket boundaries for sepal_length; the infinite end points make
# the first and last buckets open-ended.
splits = [
    float("-inf"),
    4.5,
    4.8,
    5.1,
    float("inf"),
]

In [22]:
# Build the transformer from the manual boundaries, then apply it.
# Note: despite its name, `split_bucketizer` holds the transformed DataFrame
# (with the added `result` column), not the Bucketizer object.
manual_bucketizer = Bucketizer(
    splits=splits,
    inputCol="sepal_length",
    outputCol="result",
)
split_bucketizer = manual_bucketizer.transform(df)

In [23]:
# Preview the manually bucketed frame (20 rows is show()'s default).
split_bucketizer.show(n=20)

+------------+-----------+------------+-----------+-------+------+
|sepal_length|sepal_width|petal_length|petal_width|variety|result|
+------------+-----------+------------+-----------+-------+------+
|         5.1|        3.5|         1.4|        0.2| Setosa|   3.0|
|         4.9|        3.0|         1.4|        0.2| Setosa|   2.0|
|         4.7|        3.2|         1.3|        0.2| Setosa|   1.0|
|         4.6|        3.1|         1.5|        0.2| Setosa|   1.0|
|         5.0|        3.6|         1.4|        0.2| Setosa|   2.0|
|         5.4|        3.9|         1.7|        0.4| Setosa|   3.0|
|         4.6|        3.4|         1.4|        0.3| Setosa|   1.0|
|         5.0|        3.4|         1.5|        0.2| Setosa|   2.0|
|         4.4|        2.9|         1.4|        0.2| Setosa|   0.0|
|         4.9|        3.1|         1.5|        0.1| Setosa|   2.0|
|         5.4|        3.7|         1.5|        0.2| Setosa|   3.0|
|         4.8|        3.4|         1.6|        0.2| Setosa|   

# String indexer

In [27]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql import functions as F

In [32]:
# Encode the string label column `variety` as a numeric `varietyIndex` column:
# fit the indexer on the frame, then apply the fitted model to the same frame.
indexer = StringIndexer(
    inputCol="variety",
    outputCol="varietyIndex",
)
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)

In [38]:
# Number of `variety` values in each species group.
variety_counts = indexed.groupBy('variety').agg(
    F.count('variety').alias('variety_count')
)
variety_counts.show()

+----------+-------------+
|   variety|variety_count|
+----------+-------------+
| Virginica|           50|
|    Setosa|           50|
|Versicolor|           50|
+----------+-------------+



In [39]:
# Same tally on the encoded column, to check that each index value covers the
# same number of rows as one species label.
variety_index_counts = indexed.groupBy('varietyIndex').agg(
    F.count('varietyIndex').alias('varietyIndex_count')
)
variety_index_counts.show()

+------------+------------------+
|varietyIndex|varietyIndex_count|
+------------+------------------+
|         0.0|                50|
|         1.0|                50|
|         2.0|                50|
+------------+------------------+

