In [1]:
// imports

import java.io.File
import com.esotericsoftware.kryo.io.Output
import com.truecar.mleap.runtime.estimator._
import com.truecar.mleap.serialization.ml.v1.MlJsonSerializer
import com.truecar.mleap.runtime.transformer.Transformer
import com.truecar.mleap.spark.MleapSparkSupport._
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import ml.bundle.fs.DirectoryBundle

In [2]:
// Step 1. Load our Airbnb dataset

// Local-filesystem URI of the Avro input. An absolute path under the `file:`
// scheme needs three slashes: with only two ("file://tmp/airbnb") the segment
// "tmp" is parsed as the URI authority (host) and the path becomes "/airbnb".
val inputFile = "file:///tmp/airbnb"
// Local path that Step 5 turns into the directory holding the saved pipeline.
val outputFile = "/tmp/transformer.ml"

// Declared as `var` because Step 2 reassigns it to the null-filtered
// projection of the columns used for training.
var dataset = sqlContext.read.
  format("com.databricks.spark.avro").
  load(inputFile)

In [3]:
// Step 2. Build the feature pipeline and fit it on the entire (null-filtered) dataset

// Columns that are assembled into one vector and fed to the standard scaler.
val continuousFeatures = Array(
  "bathrooms",
  "bedrooms",
  "security_deposit",
  "cleaning_fee",
  "extra_people",
  "number_of_reviews",
  "review_scores_rating")
// Columns that each get their own StringIndexerEstimator.
val categoricalFeatures = Array(
  "room_type",
  "host_is_superhost",
  "cancellation_policy",
  "instant_bookable")
val allFeatures = continuousFeatures ++ categoricalFeatures

// Project down to the feature columns plus the "price" label and keep only
// the rows in which every one of those columns is non-null.
val allCols = (allFeatures :+ "price").map(name => dataset.col(name))
val nullFilter = allCols.map(column => column.isNotNull).reduce((lhs, rhs) => lhs && rhs)
dataset = dataset.select(allCols: _*).filter(nullFilter).persist()
// Random split with weights 0.7 (training) and 0.3 (validation).
val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.7, 0.3))

// Continuous branch: assemble the raw columns, then scale the assembled vector.
val continuousFeatureAssembler = VectorAssemblerEstimator(
  inputCols = continuousFeatures,
  outputCol = "unscaled_continuous_features")
val continuousFeatureScaler = StandardScalerEstimator(
  inputCol = "unscaled_continuous_features",
  outputCol = "scaled_continuous_features")

// Categorical branch: one indexer per column, writing to "<column>_index".
val categoricalFeatureIndexers = categoricalFeatures.map { column =>
  StringIndexerEstimator(inputCol = column, outputCol = s"${column}_index")
}

// Final feature vector: the indexed categorical columns followed by the
// scaled continuous vector.
val featureCols = categoricalFeatureIndexers.map(indexer => indexer.outputCol) :+
  "scaled_continuous_features"
val featureAssembler = VectorAssemblerEstimator(
  inputCols = featureCols,
  outputCol = "features")
// Stage order: continuous assembler, scaler, the indexers, final assembler.
val estimators = (Seq(continuousFeatureAssembler, continuousFeatureScaler) ++
  categoricalFeatureIndexers) :+ featureAssembler
val featurePipeline = PipelineEstimator(estimators = estimators)
// Fitted on `dataset` (not only `trainingDataset`), as the step title says.
val sparkFeaturePipelineModel = featurePipeline.sparkEstimate(dataset)


In [4]:
// Step 3. Configure the random forest regressor: it reads the assembled
// "features" column, uses "price" as the label and writes its output to
// "price_prediction".
val randomForest = RandomForestRegressionEstimator(
  featuresCol = "features",
  labelCol = "price",
  predictionCol = "price_prediction")

In [5]:
// Step 4. Chain the fitted feature pipeline with the random forest, fit the
// result on the training split, and obtain the MLeap pipeline.
val sparkPipelineEstimator = new Pipeline().
  setStages(Array(sparkFeaturePipelineModel, randomForest))
val sparkPipeline = sparkPipelineEstimator.fit(trainingDataset)
// The `Transformer` type ascription is what triggers the implicit conversion
// from the fitted Spark pipeline to its MLeap counterpart.
val mleapPipeline: Transformer = sparkPipeline

In [6]:
// Step 5. Save our MLeap pipeline to a directory
val mleapFile = new File(outputFile)
// Make sure the target directory exists before anything is written into it,
// and fail with a clear message if it cannot be created. `mkdirs()` returns
// false both on failure and when the directory is already there, so an
// existing directory is accepted first.
if (!mleapFile.isDirectory && !mleapFile.mkdirs()) {
  throw new java.io.IOException(
    s"Could not create output directory: ${mleapFile.getAbsolutePath}")
}
// if you want to save to S3
// val bundleWriter = S3BundleWriter(s3Path)
val bundleWriter = DirectoryBundle(mleapFile)
val serializer = MlJsonSerializer
serializer.serializeWithClass(mleapPipeline, bundleWriter)