# Random Forest Regression for Likes

This notebook trains a vanilla Random Forest regression model to predict each video's number of `likes` from its `views`, `comment_count`, `dislikes` and `category_id`.

In [None]:
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import scala.util.matching
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._

import org.apache.spark.ml.regression.RandomForestRegressionModel
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.ml.feature.Normalizer

In [None]:
// Country prefix of the dataset to load; the file is expected at "new<country>videos.csv".
val country = "US"

// Read the CSV using its first row as the header. No schema is supplied or
// inferred, so every column is loaded as a string.
val usDF = {
  val csvPath = s"new${country}videos.csv"
  spark.read.option("header", "true").csv(csvPath)
}
usDF.printSchema()

In [None]:
// Keep only the label (`likes`) and the four predictor columns, dropping any
// row in which one of them is missing.
val usDF1 = usDF
  .select($"category_id", $"comment_count", $"dislikes", $"views", $"likes")
  .na.drop()

usDF1.printSchema()

// The CSV was read without a schema, so these columns are still strings.
// cast() turns any value that does not parse as a number into null, so nulls
// are dropped a second time after casting; otherwise VectorAssembler would
// fail further down when it meets a null input.
val usDF2 = usDF1
  .withColumn("category_id", col("category_id").cast(DoubleType))
  .withColumn("comment_count", col("comment_count").cast(IntegerType))
  .withColumn("dislikes", col("dislikes").cast(IntegerType))
  .withColumn("views", col("views").cast(IntegerType))
  .withColumn("likes", col("likes").cast(IntegerType))
  .na.drop()
usDF2.show(5)

In [None]:
// Number of rows removed by na.drop() when building usDF1, i.e. rows with a
// missing value in at least one of the five selected columns.
val numNan = usDF.count() - usDF1.count()

In [None]:
// Pack the four predictor columns into a single vector column named "features".
// Note: despite its name, `assembler` holds the transformed DataFrame, not the
// VectorAssembler itself.
val assembler = new VectorAssembler()
  .setInputCols(Array("comment_count", "dislikes", "views", "category_id"))
  .setOutputCol("features")
  .transform(usDF2)

// Print the assembled schema: it lists every usDF2 column plus the new
// "features" vector, so the result of the assembly is actually visible.
assembler.printSchema()

In [None]:
// Preview the label next to its assembled feature vector.
assembler.select("likes", "features").show(5)

In [None]:
// Scale each row's feature vector to unit L2 norm (p = 2.0) and store it in
// "normfeatures". `normalizer` holds the resulting DataFrame, not the transformer.
val normalizer = {
  val unitNormScaler = new Normalizer()
    .setInputCol("features")
    .setOutputCol("normfeatures")
    .setP(2.0)
  unitNormScaler.transform(assembler)
}
normalizer.show(5)
normalizer.printSchema()

In [None]:
// Identify categorical entries of the feature vector (those with at most 4
// distinct values) and index them. VectorIndexer needs a *vector* input column
// and an output column that does not exist yet, so it reads "normfeatures" and
// writes "indexedFeatures". It is fitted on the full normalised dataset so that
// every category value is known before the train/test split.
val featureIndexer = new VectorIndexer()
  .setInputCol("normfeatures")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(normalizer)

In [None]:
// 70% of the rows for training, 30% held out for testing.
val Array(trainingData, testData) = normalizer.randomSplit(Array(0.7, 0.3))

In [None]:
// Random forest that predicts "likes" from the indexed feature vector
// produced by featureIndexer.
val rf = new RandomForestRegressor()
  .setLabelCol("likes")
  .setFeaturesCol("indexedFeatures")

// Chain indexer and forest in a Pipeline.
val pipeline = new Pipeline()
  .setStages(Array(featureIndexer, rf))

In [None]:
// Train the model on the training split. featureIndexer is already a fitted
// model, so fitting the pipeline only applies it as a transformer and then
// trains the random forest stage.
val rfModel = pipeline.fit(trainingData)

In [None]:
// Score the held-out rows and keep only the actual and predicted like counts.
val resultDF = rfModel
  .transform(testData)
  .select(col("likes"), col("prediction"))
resultDF.show(20)

In [None]:
// A fitted pipeline / random forest exposes no training `summary` (that API,
// with totalIterations, objectiveHistory and residuals, belongs to linear
// regression). The model is therefore evaluated on the held-out predictions
// in resultDF instead.

// Computes one regression metric ("rmse", "r2", ...) of prediction vs. likes.
def testMetric(metricName: String): Double =
  new RegressionEvaluator()
    .setLabelCol("likes")
    .setPredictionCol("prediction")
    .setMetricName(metricName)
    .evaluate(resultDF)

// Describe the fitted forest. The stage is matched by type rather than by
// position, so no unchecked cast is needed.
rfModel.stages
  .collect { case forest: RandomForestRegressionModel => forest }
  .foreach { forest =>
    println(s"numTrees: ${forest.getNumTrees}")
    println(s"featureImportances: ${forest.featureImportances}")
  }

// Per-row residuals (actual minus predicted) on the test split.
resultDF.withColumn("residuals", col("likes") - col("prediction")).show()
println(s"RMSE: ${testMetric("rmse")}")
println(s"r2: ${testMetric("r2")}")
resultDF.describe().show()