## Spark MLlib Pipeline Example
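This notebook demonstrates how to build a Spark MLlib `Pipeline` that indexes a categorical column, assembles a feature vector, and fits a decision tree classifier on a small in-memory dataset.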

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Obtain the Spark session for this notebook under the application name
# "MLPipeline"; getOrCreate() hands back an already-active session if there is one.
spark = (
    SparkSession.builder
    .appName("MLPipeline")
    .getOrCreate()
)

In [0]:
# Load sample data: five in-memory rows with an integer label, a string
# category, and a float value. In this toy set the category matches the label
# exactly ("a" -> 0, "b" -> 1).
data = spark.createDataFrame(
    [
        (0, "a", 1.0),
        (1, "b", 2.0),
        (0, "a", 3.0),
        (1, "b", 4.0),
        (0, "a", 5.0),
    ],
    ["label", "category", "value"],
)
data.show()

+-----+--------+-----+
|label|category|value|
+-----+--------+-----+
|    0|       a|  1.0|
|    1|       b|  2.0|
|    0|       a|  3.0|
|    1|       b|  4.0|
|    0|       a|  5.0|
+-----+--------+-----+



In [0]:
# Define stages (constructed here, wired together in the pipeline cell).

# Stage 1: map the string column "category" to a numeric "categoryIndex" column.
indexer = StringIndexer(
    inputCol="category",
    outputCol="categoryIndex",
)

# Stage 2: combine the indexed category and the raw value into a single
# "features" vector column.
assembler = VectorAssembler(
    inputCols=["categoryIndex", "value"],
    outputCol="features",
)

# Stage 3: decision tree that reads "features" and learns to predict "label".
classifier = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

In [0]:
# Build pipeline: the stages are applied in list order
# (index category -> assemble features -> classify).
pipeline = Pipeline(
    stages=[
        indexer,
        assembler,
        classifier,
    ]
)

# Fit every stage on the sample data.
model = pipeline.fit(data)

# Score the same rows the model was fitted on. This only illustrates the
# pipeline mechanics; it is not a held-out evaluation.
predictions = model.transform(data)
predictions.select(
    "label",
    "features",
    "prediction",
).show()

+-----+---------+----------+
|label| features|prediction|
+-----+---------+----------+
|    0|[0.0,1.0]|       0.0|
|    1|[1.0,2.0]|       1.0|
|    0|[0.0,3.0]|       0.0|
|    1|[1.0,4.0]|       1.0|
|    0|[0.0,5.0]|       0.0|
+-----+---------+----------+

