# **🚀 Machine Learning Tuning**

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Create the Spark session for this notebook, or reuse one that already exists.
# NOTE(review): the app name says "linear-regression" although this notebook tunes a
# LogisticRegression pipeline — confirm whether the name should be updated.
spark = (
    SparkSession.builder
    .appName("linear-regression-spark")
    .master("spark://spark-master:7077")  # cluster master URL
    .config("spark.executor.memory", "512m")  # memory setting for each executor
    .getOrCreate()
)

22/12/12 19:56:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Toy labelled corpus as (text, label) pairs: every document that mentions
# "spark" is labelled 1.0, every other document 0.0.
labeled_texts = [
    ("a b c d e spark", 1.0),
    ("b d", 0.0),
    ("spark f g h", 1.0),
    ("hadoop mapreduce", 0.0),
    ("b spark who", 1.0),
    ("g d a y", 0.0),
    ("spark fly", 1.0),
    ("was mapreduce", 0.0),
    ("e spark program", 1.0),
    ("a e c l", 0.0),
    ("spark compile", 1.0),
    ("hadoop software", 0.0),
]

# Row ids are the positions 0..11 of the documents in the list above.
training = spark.createDataFrame(
    data=[(doc_id, text, label) for doc_id, (text, label) in enumerate(labeled_texts)],
    schema=["id", "text", "label"],
)

In [11]:
# Peek at the first two training rows only.
training.show(n=2)

+---+---------------+-----+
| id|           text|label|
+---+---------------+-----+
|  0|a b c d e spark|  1.0|
|  1|            b d|  0.0|
+---+---------------+-----+
only showing top 2 rows



In [4]:
# Build the three-stage ML pipeline: tokenizer -> hashingTF -> lr.

# Stage 1: derive a "words" column from the raw "text" column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: hash the tokenizer's output column into a "features" column.
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

# Stage 3: logistic regression capped at 10 iterations.
lr = LogisticRegression(maxIter=10)

# The stages run in the order listed here.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [19]:
# Run the first two stages by hand (outside the pipeline) to inspect what they produce.
words = tokenizer.transform(training)
words.show(n=2)

# Apply hashingTF to the tokenized frame and preview the resulting "features" column.
hashingTF.transform(words).show(n=2)

+---+---------------+-----+--------------------+
| id|           text|label|               words|
+---+---------------+-----+--------------------+
|  0|a b c d e spark|  1.0|[a, b, c, d, e, s...|
|  1|            b d|  0.0|              [b, d]|
+---+---------------+-----+--------------------+
only showing top 2 rows

+---+---------------+-----+--------------------+--------------------+
| id|           text|label|               words|            features|
+---+---------------+-----+--------------------+--------------------+
|  0|a b c d e spark|  1.0|[a, b, c, d, e, s...|(262144,[74920,89...|
|  1|            b d|  0.0|              [b, d]|(262144,[89530,14...|
+---+---------------+-----+--------------------+--------------------+
only showing top 2 rows



In [5]:
# The whole Pipeline is treated as a single Estimator and wrapped in a
# CrossValidator, so parameters of all Pipeline stages are chosen jointly.
# CrossValidator needs three things: an Estimator, a set of Estimator
# ParamMaps, and an Evaluator. ParamGridBuilder produces the ParamMaps:
#   3 values of hashingTF.numFeatures x 2 values of lr.regParam
#   = 6 parameter settings for CrossValidator to choose from.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

In [6]:
# 3-fold cross-validation over every ParamMap in paramGrid. Each candidate is
# scored with a BinaryClassificationEvaluator built with its default settings.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
    # Pin the seed used for the random fold assignment so that re-running the
    # notebook selects the same model. With only 12 training rows, a different
    # split can change which parameter setting wins.
    seed=42,
)

# Fit on the training frame; the result is the cross-validated model used for
# predictions later in the notebook.
cvModel = crossval.fit(training)

22/12/12 12:37:27 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/12 12:37:27 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [7]:
# Unlabeled test documents: only "id" and "text", no "label" column.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "mapreduce spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

# Score the test documents. cvModel applies the best model found during the
# cross-validated parameter search.
prediction = cvModel.transform(test)

# Keep only the columns of interest and print one Row per test document.
selected = prediction.select(["id", "text", "probability", "prediction"])
for row in selected.collect():
    print(row)

Row(id=4, text='spark i j k', probability=DenseVector([0.3413, 0.6587]), prediction=1.0)
Row(id=5, text='l m n', probability=DenseVector([0.9438, 0.0562]), prediction=0.0)
Row(id=6, text='mapreduce spark', probability=DenseVector([0.3451, 0.6549]), prediction=1.0)
Row(id=7, text='apache hadoop', probability=DenseVector([0.9561, 0.0439]), prediction=0.0)


In [2]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()