# Pipeline Example

In [2]:
# Start (or reuse) the PySpark session used by the rest of this notebook.
# findspark bootstrap, currently disabled:
# import findspark
# findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession

# Creating the session may take a while when running locally.
spark = (
    SparkSession.builder
    .appName("Review2")
    .getOrCreate()
)

# NOTE(review): this value is the number of keys in the JVM SparkContext's
# executor-memory-status map; it is reported below as "core(s)" -- confirm
# that this is the quantity you actually want to show.
cores = (
    spark._jsc.sc()
    .getExecutorMemoryStatus()
    .keySet()
    .size()
)
print("You are working with", cores, "core(s)")

# Keep `spark` as the final expression so the notebook displays the session.
# Click the hyperlinked "Spark UI" link to view details about your Spark session
spark

You are working with 1 core(s)


In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

## Read in Dataframe

#### Context

What makes us, humans, able to tell apart two songs of different genres? Maybe you have
been in the difficult situation of explaining to someone how the music style that you like
sounds. Then, could automatic genre classification be possible?

#### Content

Each row is an electronic music song. The dataset contains 100 songs for each of 23 electronic music genres; they were the top 100 songs of their genres in November 2016. The 71 columns are audio features extracted from a random two-minute sample of the audio file. These features were extracted using pyAudioAnalysis (https://github.com/tyiannak/pyAudioAnalysis).

**Source:** https://www.kaggle.com/caparrini/beatsdataset

In [7]:
# Small hand-written training set; each tuple is (id, text, label).
# The data itself is deliberately not the focus of this lecture.
training = spark.createDataFrame(
    [
        (0, "The dog barks loudly", 1.0),
        (1, "The cat is really grouchy", 0.0),
        (2, "The bear is so sleepy", 1.0),
        (3, "Edward is super excited", 0.0),
    ],
    schema=["id", "text", "label"],
)

# truncate=False prints the text column in full instead of cutting it off.
training.show(truncate=False)

+---+-------------------------+-----+
|id |text                     |label|
+---+-------------------------+-----+
|0  |The dog barks loudly     |1.0  |
|1  |The cat is really grouchy|0.0  |
|2  |The bear is so sleepy    |1.0  |
|3  |Edward is super excited  |0.0  |
+---+-------------------------+-----+



In [8]:
# Build the three stages of the ML pipeline: tokenizer -> hashingTF -> lr.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
hashingTF = HashingTF(
    # Read from whichever column the tokenizer writes to, so the two stay in sync.
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
lr = LogisticRegression(
    maxIter=10,
    regParam=0.001,
)

# Chain the stages, in order, into a single Pipeline estimator.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fitting the pipeline on the training documents yields the fitted model.
model = pipeline.fit(training)

In [9]:
# Unlabeled test documents; each tuple is (id, text) with no label column.
test = spark.createDataFrame(
    [
        (4, "The dog is really big"),
        (5, "The sheep is so soft"),
        (6, "Kyle is very tall"),
        (7, "Alex is very smart"),
    ],
    schema=["id", "text"],
)

In [10]:
# Score the test documents with the fitted pipeline and print the columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    # Unpack the predicted label into `pred` rather than `prediction`, so the
    # `prediction` DataFrame defined above is not overwritten by a float and
    # stays usable in later cells.
    rid, text, prob, pred = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), pred))

(4, The dog is really big) --> prob=[0.6112100540187116,0.3887899459812884], prediction=0.000000
(5, The sheep is so soft) --> prob=[0.11308869611534114,0.8869113038846589], prediction=1.000000
(6, Kyle is very tall) --> prob=[0.8330182569000051,0.16698174309999494], prediction=0.000000
(7, Alex is very smart) --> prob=[0.8330182569000051,0.16698174309999494], prediction=0.000000
