In [None]:
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import scala.util.matching
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.ml.feature.Normalizer

In [None]:
// Prefix of the CSV file name under data/ that selects which dataset is loaded.
val country = "US"

// Column roles for the regression: one label column and four predictor columns.
val labelCol = "likes"
val featCol1 = "comment_count"
val featCol2 = "category_id"
val featCol3 = "dislikes"
val featCol4 = "views"

// Load the CSV, treating its first row as the header. No schema is supplied
// and schema inference is not enabled on this reader.
val usDF = spark.read
  .format("csv")
  .option("header", "true")
  .load(s"data/${country}videos_new.csv")

In [None]:
// Print the column names and types of the freshly loaded DataFrame.
usDF.printSchema()

In [None]:
// Keep only the four predictor columns and the label, discarding rows in which
// any of them is null in the raw data.
val usDF1 = usDF.select($"category_id",$"comment_count",
                        $"dislikes",$"views",$"likes")
                        .na.drop()


usDF1.printSchema()

// Convert the selected columns to numeric types. A value that cannot be parsed
// as a number becomes null after the cast, so nulls are dropped a second time
// here: the na.drop() above ran before the casts and cannot catch those rows,
// and VectorAssembler by default raises an error on null inputs.
val usDF2 = usDF1.withColumn("category_id",col("category_id").cast(DoubleType))
    .withColumn("comment_count",col("comment_count").cast(IntegerType))
    .withColumn("dislikes",col("dislikes").cast(IntegerType))
    .withColumn("views",col("views").cast(IntegerType))
    .withColumn("likes",col("likes").cast(IntegerType))
    .na.drop()
usDF2.show(5)

In [None]:
// Number of rows removed by the null filter: raw row count minus the row count
// of the null-filtered selection.
val numNan = {
  val rawRows = usDF.count()
  val keptRows = usDF1.count()
  rawRows - keptRows
}

In [None]:
// Pack the four predictor columns into a single vector column named "features".
// NOTE: despite its name, `assembler` holds the transformed DataFrame (usDF2
// plus the "features" column), not the VectorAssembler itself; the name is kept
// because later cells refer to it.
val assembler = new VectorAssembler()
                .setInputCols(Array(featCol1, featCol2,
                                    featCol3, featCol4))
                .setOutputCol("features")
                .transform(usDF2)
// Print the schema of the assembled frame so the new "features" column is
// visible; transform returns a new DataFrame and leaves usDF2 unchanged, so
// printing usDF2's schema would not show it.
assembler.printSchema()

In [None]:
// Preview the label next to its assembled feature vector.
// `$labelCol` would be parsed as an identifier literally named "$labelCol"
// (the `$"..."` column syntax only applies to string literals), so the column
// whose name is stored in labelCol is looked up with col(...) instead.
assembler.select(col(labelCol), col("features")).show(5)

In [None]:
// Rescale every row's "features" vector to unit L2 norm (p = 2.0) and store the
// result in "normfeatures". As with `assembler`, the value bound here is the
// transformed DataFrame rather than the Normalizer object.
// NOTE(review): Normalizer works per row, not per column, so the relative scale
// between rows is discarded -- confirm this is the intended preprocessing.
val normalizer = {
  val unitNormScaler = new Normalizer()
    .setP(2.0)
    .setInputCol("features")
    .setOutputCol("normfeatures")
  unitNormScaler.transform(assembler)
}

// Inspect a few normalized rows, then the resulting schema.
normalizer.show(5)
normalizer.printSchema()

In [None]:
// Split the rows 70% / 30% into training and test sets. A fixed seed is passed
// so the same seed is used on every run; without it the split, and every metric
// computed from it, changes each time the notebook is executed.
val Array(trainingData, testData) = normalizer.randomSplit(Array(0.7, 0.3), seed = 42L)

In [None]:
// Linear regression estimator: predicts `labelCol` from the "normfeatures"
// vector column.
val lr = new LinearRegression()
  .setFeaturesCol("normfeatures")
  .setLabelCol(labelCol)
  // Elastic-net regularization: overall strength 0.3, mixing parameter 0.8.
  .setElasticNetParam(0.8)
  .setRegParam(0.3)
  // Upper bound on the number of optimizer iterations.
  .setMaxIter(100)

In [None]:
// Fit the configured linear regression on the training split.
val lrModel = lr.fit(trainingData)

In [None]:
// Score the test split with the fitted model and keep only the true label and
// the model's "prediction" column for side-by-side comparison.
val resultDF = {
  val scoredTestRows = lrModel.transform(testData)
  scoredTestRows.select(labelCol, "prediction")
}

// Show the first 20 label / prediction pairs.
resultDF.show(20)

In [None]:
// Diagnostics from fitting: these figures are computed on the training data.
val trainingSummary = lrModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

// The metrics above are in-sample only. Score the held-out test split as well
// so generalization error is reported alongside the training fit.
val testSummary = lrModel.evaluate(testData)
println(s"test RMSE: ${testSummary.rootMeanSquaredError}")
println(s"test r2: ${testSummary.r2}")

// Summary statistics (count, mean, stddev, min, max) of the test labels and
// their predictions.
resultDF.describe().show()