In [2]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._

## Case Study 2: User Rating History Partitioning

Objective: Partition the MovieLens dataset by user for faster query processing.

Scenario: MovieLens user ratings data (CSV format) needs to be partitioned into a separate folder per user in HDFS.
Steps:

Ingestion: Load the ratings.csv file as a DataFrame from GCP Cloud Storage.

Transformation:

Use a DataFrame to filter out invalid or incomplete records.
Convert the DataFrame into an RDD to dynamically create key-value pairs of userId and their corresponding ratings.
Partitioning:

Use RDD transformations like groupByKey to partition ratings data by userId.
Write each user's data to a separate folder in HDFS using the saveAsTextFile method.
Verification:

Validate that the HDFS structure follows the format /user-data/{userId}/ratings.csv.


In [3]:
// Job configuration: submit to YARN under a descriptive application name.
val conf = new SparkConf().setMaster("yarn").setAppName("User Rating History Partitioning")

// Entry point for the RDD API, built from the configuration above.
val sc = new SparkContext(conf)

conf = org.apache.spark.SparkConf@777ffafe
sc = org.apache.spark.SparkContext@1fe8160


org.apache.spark.SparkContext@1fe8160

In [4]:
// Cloud Storage location of the ratings file.
val bucketName = "scala_assgn_bucket"
val ratingsPath = s"gs://$bucketName/ml-32m/ratings.csv"

// Read the CSV using its first line as column names. No schema is supplied,
// so every column is loaded as a string.
val ratingsDF = {
  val csvReader = spark.read.option("header", "true")
  csvReader.csv(ratingsPath)
}

// Preview the first rows to confirm the file was parsed as expected.
ratingsDF.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     17|   4.0|944249077|
|     1|     25|   1.0|944250228|
|     1|     29|   2.0|943230976|
|     1|     30|   5.0|944249077|
|     1|     32|   5.0|943228858|
|     1|     34|   2.0|943228491|
|     1|     36|   1.0|944249008|
|     1|     80|   5.0|944248943|
|     1|    110|   3.0|943231119|
|     1|    111|   5.0|944249008|
|     1|    161|   1.0|943231162|
|     1|    166|   5.0|943228442|
|     1|    176|   4.0|944079496|
|     1|    223|   3.0|944082810|
|     1|    232|   5.0|943228442|
|     1|    260|   5.0|943228696|
|     1|    302|   4.0|944253272|
|     1|    306|   5.0|944248888|
|     1|    307|   5.0|944253207|
|     1|    322|   4.0|944053801|
+------+-------+------+---------+
only showing top 20 rows



bucketName = scala_assgn_bucket
ratingsPath = gs://scala_assgn_bucket/ml-32m/ratings.csv
ratingsDF = [userId: string, movieId: string ... 2 more fields]


[userId: string, movieId: string ... 2 more fields]

In [5]:
// Keep only complete records: userId, movieId and rating must all be present.
// The CSV is read without a schema, so every column is a nullable string; a row
// with a missing movieId would otherwise flow through and be emitted as "null".
val validRatingsDF = ratingsDF.filter(
  col("userId").isNotNull && col("movieId").isNotNull && col("rating").isNotNull
)

// Cap the working set at 100,000 rows before dropping down to the RDD API.
val trimmedDF = validRatingsDF.limit(100000)

// Convert the DataFrame to a pair RDD keyed by userId: (userId, (movieId, rating)).
// All columns arrive as strings; the rating is parsed to Double here.
val ratingsRDD = trimmedDF.rdd.map { row =>
  val userId = row.getAs[String]("userId")
  val movieId = row.getAs[String]("movieId")
  val rating = row.getAs[String]("rating").toDouble
  (userId, (movieId, rating))
}

ratingsRDD.take(5).foreach(println)  // Inspect a few records

(31448,(48516,4.5))
(31448,(48780,4.0))
(31448,(49272,4.5))
(31448,(54286,4.5))
(31448,(55820,5.0))


validRatingsDF = [userId: string, movieId: string ... 2 more fields]
trimmedDF = [userId: string, movieId: string ... 2 more fields]
ratingsRDD = MapPartitionsRDD[22] at map at <console>:47


MapPartitionsRDD[22] at map at <console>:47

In [6]:
// Gather every (movieId, rating) pair under its userId and materialise each
// user's ratings as a List.
val groupedByUserRDD = ratingsRDD
  .groupByKey()
  .mapValues(ratingsForUser => ratingsForUser.toList)

groupedByUserRDD = MapPartitionsRDD[24] at mapValues at <console>:40


MapPartitionsRDD[24] at mapValues at <console>:40

In [8]:
import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.{FileSystem, Path}
import java.io.{BufferedWriter, OutputStreamWriter}

// Root HDFS directory; each user's file is written to <outputPath>/<userId>/ratings.csv.
val outputPath = "hdfs:///user/shraman_jana/user-data/Q2"

// Only a sample of 10 users is collected to the driver and written from there.
val first10Users = groupedByUserRDD.take(10)

// The FileSystem handle does not depend on the user, so resolve it once rather
// than on every iteration. It is intentionally not closed here.
// NOTE(review): FileSystem.get may return a shared cached instance - confirm before adding fs.close().
val fs = FileSystem.get(new java.net.URI("hdfs:///"), new org.apache.hadoop.conf.Configuration())

first10Users.foreach { case (userId, ratingsList) =>
  val path = new Path(s"${outputPath}/${userId}/ratings.csv")

  // Make sure the per-user directory exists before creating the file in it.
  if (!fs.exists(path.getParent)) {
    fs.mkdirs(path.getParent)
  }

  // One "movieId, rating" line per rating.
  val ratingsText = ratingsList.map { case (movieId, rating) =>
    s"${movieId}, ${rating}"
  }.mkString("\n")

  // Explicit UTF-8 so the output does not depend on the driver's default charset.
  val writer = new BufferedWriter(
    new OutputStreamWriter(fs.create(path), java.nio.charset.StandardCharsets.UTF_8)
  )

  // Closing the writer also flushes and closes the underlying HDFS stream; the
  // finally block guarantees that happens even when the write fails.
  try {
    writer.write(ratingsText)
  } finally {
    writer.close()
  }
}

outputPath = hdfs:///user/shraman_jana/user-data/Q2
first10Users = Array((273,List((1,4.0), (60,3.5), (260,4.5), (364,4.5), (519,1.5), (588,4.0), (595,4.5), (653,4.5), (780,3.5), (1015,3.5), (1196,3.5), (1197,4.5), (1200,1.5), (1210,4.0), (1566,2.5), (1580,3.5), (2006,4.5), (2571,4.0), (2628,4.0), (3114,4.0), (3793,4.0), (4016,5.0), (4306,4.0), (4886,4.0), (4896,4.0), (4973,4.5), (4993,5.0), (5218,4.5), (5349,3.5), (5433,4.0), (5444,4.5), (5459,3.5), (5618,3.5), (5952,4.5), (6333,4.0), (6365,2.5), (6373,3.5), (6377,3.5), (6539,4.5), (6754,2.0), (7153,4.5), (7373,3.0), (8360,4.5), (8368,4.0), (8464,2.5), (8636,...


Array((273,List((1,4.0), (60,3.5), (260,4.5), (364,4.5), (519,1.5), (588,4.0), (595,4.5), (653,4.5), (780,3.5), (1015,3.5), (1196,3.5), (1197,4.5), (1200,1.5), (1210,4.0), (1566,2.5), (1580,3.5), (2006,4.5), (2571,4.0), (2628,4.0), (3114,4.0), (3793,4.0), (4016,5.0), (4306,4.0), (4886,4.0), (4896,4.0), (4973,4.5), (4993,5.0), (5218,4.5), (5349,3.5), (5433,4.0), (5444,4.5), (5459,3.5), (5618,3.5), (5952,4.5), (6333,4.0), (6365,2.5), (6373,3.5), (6377,3.5), (6539,4.5), (6754,2.0), (7153,4.5), (7373,3.0), (8360,4.5), (8368,4.0), (8464,2.5), (8636,...

In [9]:
// Stop the SparkContext created earlier in this notebook.
sc.stop()