In [9]:
import org.apache.spark.sql.{SparkSession, functions => F}
import org.apache.spark.rdd.RDD

// Build the notebook's SparkSession: application name "UserRatingHistoryPartitioning",
// master "local[*]". `getOrCreate()` hands back an already-active session when one
// exists instead of constructing a second one.
val spark = SparkSession
  .builder()
  .master("local[*]")
  .appName("UserRatingHistoryPartitioning")
  .getOrCreate()

// Load the raw ratings CSV, using its first line as the header. No schema is supplied,
// so the columns arrive as strings (see `rating` being cast below).
val ratingsDF = spark.read.option("header", "true").csv("gs://gcs_bucket_rupal/ratings.csv")

// Keep only rows with a user, a movie and a usable numeric rating.
// The cast happens BEFORE the null filter on purpose: with Spark's default (non-ANSI)
// cast semantics a non-numeric rating string becomes null, and `row.getAs[Double]`
// further down would silently unbox that null to 0.0. Filtering after the cast drops
// such rows instead of inventing a 0.0 rating for them.
val validRatingsDF = ratingsDF
  .withColumn("rating", F.col("rating").cast("double"))
  .filter(
    F.col("userId").isNotNull &&
    F.col("movieId").isNotNull &&
    F.col("rating").isNotNull
  )

// Re-shape each row into (userId, (movieId, rating)) so the ratings can be grouped by user.
val ratingsRDD = validRatingsDF.rdd.map { row =>
  val userId = row.getAs[String]("userId")
  val movieId = row.getAs[String]("movieId")
  val rating = row.getAs[Double]("rating") // non-null: guaranteed by the filter above
  (userId, (movieId, rating))
}

// One entry per user: (userId, List((movieId, rating), ...)).
val ratingsGroupedByUserRDD = ratingsRDD.groupByKey().mapValues(_.toList)

spark = org.apache.spark.sql.SparkSession@3766e287
ratingsDF = [userId: string, movieId: string ... 2 more fields]
validRatingsDF = [userId: string, movieId: string ... 2 more fields]
ratingsRDD = MapPartitionsRDD[16] at map at <console>:65
ratingsGroupedByUserRDD = MapPartitionsRDD[18] at mapValues at <console>:72


import org.apache.spark.sql.{SparkSession, functions=>F}


MapPartitionsRDD[18] at mapValues at <console>:72

In [10]:
import org.apache.hadoop.fs.{FileSystem, Path}
import java.io.{BufferedWriter, OutputStreamWriter}

// Base HDFS directory; each exported user gets <outputPath>/<userId>/ratings.csv.
val outputPath = "hdfs:///user/rupal_gupta/user-data/Q2"

// Bring the first 10 (userId, ratings) groups to the driver; only these are written out.
val first10Users = ratingsGroupedByUserRDD.take(10)

// The FileSystem handle does not depend on the user being written, so resolve it once
// instead of once per iteration.
val fs = FileSystem.get(new java.net.URI("hdfs:///"), new org.apache.hadoop.conf.Configuration())

first10Users.foreach { case (userId, ratingsList) =>
  val path = new Path(s"${outputPath}/${userId}/ratings.csv")

  // mkdirs creates any missing parents and is not an error when the directory already
  // exists, so no separate exists() round trip is needed.
  fs.mkdirs(path.getParent)

  // One "movieId, rating" line per rating, newline-separated (no trailing newline).
  val ratingsText = ratingsList.map { case (movieId, rating) =>
    s"${movieId}, ${rating}"
  }.mkString("\n")

  // Pin the encoding rather than relying on the JVM's platform-default charset.
  val writer = new BufferedWriter(
    new OutputStreamWriter(fs.create(path), java.nio.charset.StandardCharsets.UTF_8)
  )
  // Closing the writer flushes it and closes the underlying HDFS stream; doing it in
  // `finally` releases the stream even when the write fails.
  try writer.write(ratingsText)
  finally writer.close()
}

// Shut the session down once the export has finished.
spark.stop()

outputPath = hdfs:///user/rupal_gupta/user-data/Q2
first10Users = Array((140868,List((5,3.0), (6,4.0), (7,4.0), (11,4.0), (17,3.0), (18,3.0), (21,4.0), (25,4.0), (32,3.0), (45,3.0), (50,5.0), (52,4.0), (57,4.0), (62,3.0), (64,3.0), (65,2.0), (70,3.0), (74,3.0), (75,3.0), (82,5.0), (85,4.0), (90,4.0), (93,3.0), (95,3.0), (96,5.0), (100,4.0), (102,3.0), (112,3.0), (125,4.0), (135,3.0), (141,4.0), (163,3.0), (171,5.0), (174,3.0), (176,5.0), (186,3.0), (187,3.0), (189,4.0), (194,4.0), (195,3.0), (203,3.0), (206,4.0), (215,3.0), (223,4.0), (224,4.0), (231,1.0), (232,5.0), (234,4.0), (235,5.0), (236,3.0), (237,3.0), (248,3.0), (252,4.0), (255,3.0)...


Array((140868,List((5,3.0), (6,4.0), (7,4.0), (11,4.0), (17,3.0), (18,3.0), (21,4.0), (25,4.0), (32,3.0), (45,3.0), (50,5.0), (52,4.0), (57,4.0), (62,3.0), (64,3.0), (65,2.0), (70,3.0), (74,3.0), (75,3.0), (82,5.0), (85,4.0), (90,4.0), (93,3.0), (95,3.0), (96,5.0), (100,4.0), (102,3.0), (112,3.0), (125,4.0), (135,3.0), (141,4.0), (163,3.0), (171,5.0), (174,3.0), (176,5.0), (186,3.0), (187,3.0), (189,4.0), (194,4.0), (195,3.0), (203,3.0), (206,4.0), (215,3.0), (223,4.0), (224,4.0), (231,1.0), (232,5.0), (234,4.0), (235,5.0), (236,3.0), (237,3.0), (248,3.0), (252,4.0), (255,3.0)...