In [1]:
// imports

import java.io.File
import com.esotericsoftware.kryo.io.Output
import com.truecar.mleap.runtime.estimator._
import com.truecar.mleap.serialization.ml.v1.MlJsonSerializer
import com.truecar.mleap.runtime.transformer.Transformer
import com.truecar.mleap.spark.MleapSparkSupport._
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import ml.bundle.fs.DirectoryBundle

In [4]:
// Step 1. Read the Airbnb listings and decide where the trained pipelines go

// Input location, read below through the spark-avro data source.
val inputFile = "file:///tmp/airbnb"

// Output locations for the two serialized pipelines
// (Rf = random forest, Lr = linear regression).
val outputFileRf = "/tmp/transformer.rf.ml"
val outputFileLr = "/tmp/transformer.lr.ml"

// Kept as a var: the feature-pipeline step reassigns it to the filtered frame.
var dataset = sqlContext.read.
  format("com.databricks.spark.avro").
  load(inputFile)

In [5]:
// Step 2. Build the feature pipeline and fit it on the full (null-filtered) dataset

// Numeric columns: assembled into a single vector, then passed to the scaler.
val continuousFeatures = Array(
  "bathrooms", "bedrooms", "security_deposit", "cleaning_fee",
  "extra_people", "number_of_reviews", "review_scores_rating")
// String-valued columns: each one gets its own string indexer.
val categoricalFeatures = Array(
  "room_type", "host_is_superhost", "cancellation_policy", "instant_bookable")
val allFeatures = continuousFeatures ++ categoricalFeatures

// Keep only the model columns (all features plus the "price" label) and drop
// every row that has a null in any of them.
val allCols = (allFeatures ++ Seq("price")).map(name => dataset.col(name))
val nullFilter = allCols.map(column => column.isNotNull).reduce((lhs, rhs) => lhs && rhs)
dataset = dataset.select(allCols: _*).filter(nullFilter).persist()
// 70/30 split; the regressors in the later steps are fitted on the first part.
val Array(trainingDataset, validationDataset) = dataset.randomSplit(Array(0.7, 0.3))

val continuousFeatureAssembler = VectorAssemblerEstimator(
  inputCols = continuousFeatures,
  outputCol = "unscaled_continuous_features")
val continuousFeatureScaler = StandardScalerEstimator(
  inputCol = "unscaled_continuous_features",
  outputCol = "scaled_continuous_features")

// One indexer per categorical column, writing to "<column>_index".
val categoricalFeatureIndexers = for (feature <- categoricalFeatures) yield StringIndexerEstimator(
  inputCol = feature,
  outputCol = s"${feature}_index")

// Final feature vector: the indexed categoricals followed by the scaled numerics.
val featureCols = categoricalFeatureIndexers.map(indexer => indexer.outputCol) ++
  Seq("scaled_continuous_features")
val featureAssembler = VectorAssemblerEstimator(
  inputCols = featureCols,
  outputCol = "features")

// Stage order: assemble and scale the numerics, index the categoricals, then
// combine everything into "features".
val estimators = Seq(continuousFeatureAssembler, continuousFeatureScaler) ++
  categoricalFeatureIndexers ++
  Seq(featureAssembler)
val featurePipeline = PipelineEstimator(estimators = estimators)
// Fitted on the whole filtered dataset, not just the training split.
val sparkFeaturePipelineModel = featurePipeline.sparkEstimate(dataset)


In [6]:
// Step 3. Configure the random forest regressor
// Column wiring: "features" in, "price" as the label, "price_prediction" out.
val randomForest = RandomForestRegressionEstimator(
  featuresCol = "features",
  labelCol = "price",
  predictionCol = "price_prediction")

In [7]:
// Step 4. Create our linear regression model
// Same column wiring as the random forest estimator: "features" in, "price"
// as the label, "price_prediction" out.
val linearRegression = LinearRegressionEstimator(
  featuresCol = "features",
  labelCol = "price",
  predictionCol = "price_prediction")

In [8]:
// Step 5. Assemble the final pipeline (random forest) by implicit conversion to MLeap models
// Stages: the feature pipeline model produced by sparkEstimate, followed by the
// random forest estimator.
// NOTE(review): randomForest is an MLeap estimator, so putting it in a Spark
// stage array presumably relies on the implicits from MleapSparkSupport._ — confirm.
val sparkPipelineEstimatorRf = new Pipeline().setStages(Array(sparkFeaturePipelineModel, randomForest))
// Fit on the training part of the 70/30 split only.
val sparkPipelineRf = sparkPipelineEstimatorRf.fit(trainingDataset)
// The Transformer ascription is what requests the Spark -> MLeap conversion.
val mleapPipelineRf: Transformer = sparkPipelineRf

In [9]:
// Step 6. Assemble the final pipeline (linear regression) by implicit conversion to MLeap models
// Stages: the feature pipeline model produced by sparkEstimate, followed by the
// linear regression estimator.
// NOTE(review): linearRegression is an MLeap estimator, so putting it in a Spark
// stage array presumably relies on the implicits from MleapSparkSupport._ — confirm.
val sparkPipelineEstimatorLr = new Pipeline().setStages(Array(sparkFeaturePipelineModel, linearRegression))
// Fit on the training part of the 70/30 split only.
val sparkPipelineLr = sparkPipelineEstimatorLr.fit(trainingDataset)
// The Transformer ascription is what requests the Spark -> MLeap conversion.
val mleapPipelineLr: Transformer = sparkPipelineLr

In [10]:
// Step 7. Save our MLeap pipelines, one bundle directory per model
val mleapFileRf = new File(outputFileRf)
val mleapFileLr = new File(outputFileLr)

// if you want to save to S3
// val bundleWriter = S3BundleWriter(s3Path)
val bundleWriterRf = DirectoryBundle(mleapFileRf)
val bundleWriterLr = DirectoryBundle(mleapFileLr)

// File.mkdirs() signals failure only through its return value, and it also
// returns false when the directory already exists (e.g. when this cell is
// re-run). Accept either outcome; otherwise stop here with a clear message
// instead of carrying on to serialization with no output directory.
Seq(mleapFileRf, mleapFileLr).foreach { dir =>
  require(dir.mkdirs() || dir.isDirectory,
    s"Could not create output directory ${dir.getAbsolutePath}")
}

val serializer = MlJsonSerializer

// Write each converted pipeline into its own bundle directory.
serializer.serializeWithClass(mleapPipelineRf, bundleWriterRf)
serializer.serializeWithClass(mleapPipelineLr, bundleWriterLr)

In [None]:
// sbt "server/run /tmp/transformer.rf.ml 8080"
// sbt "server/run /tmp/transformer.lr.ml 8081"
// curl -v -XPOST \
//   -H "content-type: application/json" \
//   -d @/Users/hollinwilkins/Workspace/scratch/frame.json http://localhost:8080/transform
// curl -v -XPOST \
//   -H "content-type: application/json" \
//   -d @/Users/hollinwilkins/Workspace/scratch/frame.json http://localhost:8081/transform

In [None]:
/*
{
  "schema": {
    "fields": [{
      "name": "bathrooms",
      "dataType": "double"
    }, {
      "name": "bedrooms",
      "dataType": "double"
    }, {
      "name": "security_deposit",
      "dataType": "double"
    }, {
      "name": "cleaning_fee",
      "dataType": "double"
    }, {
      "name": "extra_people",
      "dataType": "double"
    }, {
      "name": "number_of_reviews",
      "dataType": "double"
    }, {
      "name": "review_scores_rating",
      "dataType": "double"
    }, {
      "name": "room_type",
      "dataType": "string"
    }, {
      "name": "host_is_superhost",
      "dataType": "string"
    }, {
      "name": "cancellation_policy",
      "dataType": "string"
    }, {
      "name": "instant_bookable",
      "dataType": "string"
    }]
  },
  "rows": [[2.0, 3.0, 50.0, 30.0, 2.0, 56.0, 90.0, "Entire home/apt", "1.0", "strict", "1.0"]]
}
*/