In [14]:
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import scala.util.matching
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.regression.GeneralizedLinearRegression

In [15]:
val country = "US"
val usDF = spark.read.format("csv").option("header", "true")
                .load("data/" + country + "videos_new.csv")

country = US
usDF = [video_id: string, trending_date: string ... 14 more fields]


[video_id: string, trending_date: string ... 14 more fields]

In [16]:
usDF.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [17]:
val usDF1 = usDF.select($"category_id"
                        ,$"comment_count",$"dislikes",$"views",$"likes")
                        .na.drop()


usDF1.printSchema()

val usDF2 = usDF1.withColumn("category_id",col("category_id").cast(DoubleType))
    .withColumn("comment_count",col("comment_count").cast(IntegerType))
    .withColumn("dislikes",col("dislikes").cast(IntegerType))
    .withColumn("views",col("views").cast(IntegerType))
    .withColumn("likes",col("likes").cast(IntegerType))
usDF2.show(5)

root
 |-- category_id: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)

+-----------+-------------+--------+-------+------+
|category_id|comment_count|dislikes|  views| likes|
+-----------+-------------+--------+-------+------+
|       22.0|        15954|    2966| 748374| 57527|
|       24.0|        12703|    6146|2418783| 97185|
|       23.0|         8181|    5339|3191434|146033|
|       24.0|         2146|     666| 343168| 10172|
|       24.0|        17518|    1989|2095731|132235|
+-----------+-------------+--------+-------+------+
only showing top 5 rows



usDF1 = [category_id: string, comment_count: string ... 3 more fields]
usDF2 = [category_id: double, comment_count: int ... 3 more fields]


[category_id: double, comment_count: int ... 3 more fields]

In [18]:
val numNan = usDF.count - usDF1.count

numNan = 7188


7188

In [19]:
val assembler = new VectorAssembler()
                .setInputCols(Array("comment_count",
                                    "dislikes","views",
                                    "category_id"))
                .setOutputCol("features")
                .transform(usDF2)
usDF2.printSchema()

root
 |-- category_id: double (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)



assembler = [category_id: double, comment_count: int ... 4 more fields]


[category_id: double, comment_count: int ... 4 more fields]

In [20]:
assembler.select($"likes",$"features").show(5)

+------+--------------------+
| likes|            features|
+------+--------------------+
| 57527|[15954.0,2966.0,7...|
| 97185|[12703.0,6146.0,2...|
|146033|[8181.0,5339.0,31...|
| 10172|[2146.0,666.0,343...|
|132235|[17518.0,1989.0,2...|
+------+--------------------+
only showing top 5 rows



In [21]:
val normalizer= new Normalizer()
                .setInputCol("features")
                .setOutputCol("normfeatures")
                .setP(2.0)
                .transform(assembler)
normalizer.show(5)
normalizer.printSchema()

+-----------+-------------+--------+-------+------+--------------------+--------------------+
|category_id|comment_count|dislikes|  views| likes|            features|        normfeatures|
+-----------+-------------+--------+-------+------+--------------------+--------------------+
|       22.0|        15954|    2966| 748374| 57527|[15954.0,2966.0,7...|[0.02131320801961...|
|       24.0|        12703|    6146|2418783| 97185|[12703.0,6146.0,2...|[0.00525172527371...|
|       23.0|         8181|    5339|3191434|146033|[8181.0,5339.0,31...|[0.00256341245751...|
|       24.0|         2146|     666| 343168| 10172|[2146.0,666.0,343...|[0.00625336276642...|
|       24.0|        17518|    1989|2095731|132235|[17518.0,1989.0,2...|[0.00835860143342...|
+-----------+-------------+--------+-------+------+--------------------+--------------------+
only showing top 5 rows

root
 |-- category_id: double (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- dislikes: integer (nullable = 

normalizer = [category_id: double, comment_count: int ... 5 more fields]


[category_id: double, comment_count: int ... 5 more fields]

In [22]:
val Array(trainingData,testData)= normalizer.randomSplit(Array(0.7,0.3))

trainingData = [category_id: double, comment_count: int ... 5 more fields]
testData = [category_id: double, comment_count: int ... 5 more fields]


[category_id: double, comment_count: int ... 5 more fields]

In [23]:
val glr = new GeneralizedLinearRegression()
  .setLabelCol("likes")
  .setFeaturesCol("normfeatures")
  .setFamily("gaussian")
  .setLink("identity")
  .setMaxIter(10)
  .setRegParam(0.3)

glr = glm_560884eead9d


glm_560884eead9d

In [24]:
val glrmodel = glr.fit(trainingData)

glrmodel = glm_560884eead9d


glm_560884eead9d

In [28]:
val resultDF = glrmodel.transform(testData).select("likes", "prediction")
resultDF.show(20)

+-----+------------------+
|likes|        prediction|
+-----+------------------+
|    1| 51209.90821599215|
|    0| 59908.34628306329|
|    0|60282.643747895956|
|    0| 60288.02198352665|
|    0| 60436.16456446797|
|    0| 60438.50723823905|
|   29| 60268.67409835011|
|   29|60269.284718610346|
|  140| 60132.20699682832|
|95002| 60810.18053223193|
| 6801| 62676.37337902933|
| 6794| 62673.56715861708|
| 6876| 62696.35061057657|
|   29| 60211.37312927097|
|   30| 60294.54682149738|
|   39|60228.867586776614|
|    9| 60356.42896413803|
|   43| 60543.79755541682|
|  144| 60712.37480414659|
|   26| 61173.60980498046|
+-----+------------------+
only showing top 20 rows



resultDF = [likes: int, prediction: double]


[likes: int, prediction: double]

In [36]:
val trainingSummary = glrmodel.summary
println(s"Coefficient Standard Errors: ${trainingSummary.coefficientStandardErrors.mkString(",")}")
println(s"T Values: ${trainingSummary.tValues.mkString(",")}")
println(s"P Values: ${trainingSummary.pValues.mkString(",")}")
println(s"Dispersion: ${trainingSummary.dispersion}")

trainingSummary.residuals.show()
resultDF.describe().show

Coefficient Standard Errors: 306292.5183806047,519668.787599523,8969921.891596122,1045526.5411107829,8970892.539084295
T Values: 12.763149564738242,1.1389183588298526,4.116947642421404,-9.160939555576952,-4.109753195878782
P Values: 0.0,0.2547467562649113,3.8500099705851554E-5,0.0,3.971867321062028E-5
Dispersion: 5.506316425530897E10
+-------------------+
|  devianceResiduals|
+-------------------+
|-50365.791978009045|
| -60264.48481563479|
| -60279.44072916359|
| -60285.13972388208|
|-60286.897588029504|
| -60429.06959973276|
| -60432.91057141125|
| -59982.73899926245|
|-61043.037735275924|
| -60507.27250473946|
| -60468.49128390849|
| -60452.75995839387|
| 27074.853958405554|
| 30791.281584747136|
| -56043.44305549562|
|-55943.637052856386|
| -55882.49511617422|
| -55876.63550978899|
| -55843.21473566443|
| -55822.09730801731|
+-------------------+
only showing top 20 rows

+-------+-----------------+-------------------+
|summary|            likes|         prediction|
+-------+-----

trainingSummary = 


Coefficients:
       Feature       Estimate    Std Error T Value P Value
   (Intercept) -36868154.2824 8970892.5391 -4.1098  0.0000
normfeatures_0   3909257.2227  306292.5184 12.7631  0.0000
normfeatures_1    591860.3227  519668.7876  1.1389  0.2547
normfeatures_2  36928698.7843 8969921.8916  4.1169  0.0000
normfeatures_3  -9578005.4469 1045526.5411 -9.1609  0.0000
(Dispersion parameter for gaussian family taken to be 55063164255.3090)
    Null deviance: 1587746360152314.5000 on 28572 degrees of freedom
Residual deviance: 1573264729102687.8000 on 28572 degrees of freedom
AIC: 787864.1409
