In [43]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.rdd.RDD
import scala.util.parsing.json.JSON
import org.apache.spark.sql.DataFrame

## Case Study 3: Handling Incomplete Metadata

Objective: Enrich incomplete movie metadata using additional JSON files.

Scenario: Movielens metadata (e.g., movies.csv) is missing releaseYear for some movies. Supplementary metadata in JSON format is available for enrichment.
Steps:

Ingestion:

Load movies.csv from GCP Cloud Storage as a DataFrame.
Load metadata.json from GCP Cloud Storage into an RDD for custom parsing.
Transformation:

Use RDD operations to parse the JSON file and extract movieId and releaseYear.
Perform an RDD join with the movies DataFrame to fill in missing releaseYear.
Validation:

Convert the enriched RDD back into a DataFrame.
Validate that all movies have a releaseYear field.
Storage:

Save the enriched DataFrame in Parquet format in HDFS.

In [42]:
val conf = new SparkConf()
      .setAppName("Partitioning Impact on Performance")
      .setMaster("yarn")

val sc = new SparkContext(conf)

conf = org.apache.spark.SparkConf@3a0c030
sc = org.apache.spark.SparkContext@61790a83


org.apache.spark.SparkContext@61790a83

In [44]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.util.Random

// Step 1: Initialize SparkSession
val spark = SparkSession.builder()
  .appName("Generate Metadata JSON")
  .getOrCreate()

// Step 2: Load movies.csv as a DataFrame
val bucketName = "scala_assgn_bucket"
val moviesPath = s"gs://$bucketName/ml-32m/movies.csv"
val moviesDF = spark.read.option("header", "true").csv(moviesPath)

// Step 3: Extract releaseYear from title or assign a random year
val extractYear = udf((title: String) => {
  val yearPattern = "\\((\\d{4})\\)".r
  yearPattern.findFirstMatchIn(title).map(_.group(1)).getOrElse {
    (1980 + Random.nextInt(2024 - 1980 + 1)).toString
  }
})

// Generate metadata DataFrame
val metadataDF = moviesDF
  .select("movieId", "title")
  .withColumn("releaseYear", extractYear(col("title")))

// Step 4: Write the DataFrame as a single JSON file to GCS
val outputPath = s"gs://$bucketName/ml-32m/metadata.json"

metadataDF.coalesce(1) // Ensures a single output file
  .write
  .mode("overwrite")
  .json(outputPath)

println(s"Metadata written successfully to $outputPath!")

Metadata written successfully to gs://scala_assgn_bucket/ml-32m/metadata.json!


spark = org.apache.spark.sql.SparkSession@ec536f0
bucketName = scala_assgn_bucket
moviesPath = gs://scala_assgn_bucket/ml-32m/movies.csv
moviesDF = [movieId: string, title: string ... 1 more field]
extractYear = SparkUserDefinedFunction($Lambda$6569/0x0000000802385840@4fd4588f,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
metadataDF = [movieId: string, title: string ... 1 more field]
outputPath = gs://scala_assgn_bucket/ml-32m/metadata.json


gs://scala_assgn_bucket/ml-32m/metadata.json

In [22]:
// Step 1: Load `movies.csv` as DataFrame
val moviesDF = spark.read
  .option("header", "true") // CSV has header
  .option("inferSchema", "true") // Infer data types
  .csv(moviesPath)

// Step 2: Load `metadata.json` into RDD
val metadataPath = s"gs://$bucketName/ml-32m/metadata.json"
val metadataRDD: RDD[String] = spark.sparkContext.textFile(metadataPath)

// Parse JSON to extract `movieId` and `releaseYear`
val parsedMetadataRDD: RDD[(Int, Int)] = metadataRDD.map { line =>
  JSON.parseFull(line) match {
    case Some(json: Map[String, Any]) =>
      val movieId = json.get("movieId").map(_.toString.toInt)
      val releaseYear = json.get("releaseYear").map(_.toString.toInt)
      (movieId.get, releaseYear.get)
  }
}

// Convert metadata RDD to DataFrame
val metadataFromJsonDF = parsedMetadataRDD.toDF("movieId", "releaseYear")

moviesDF = [movieId: int, title: string ... 1 more field]
metadataPath = gs://scala_assgn_bucket/ml-32m/metadata.json
metadataRDD = gs://scala_assgn_bucket/ml-32m/metadata.json MapPartitionsRDD[121] at textFile at <console>:61
parsedMetadataRDD = MapPartitionsRDD[122] at map at <console>:64
metadataFromJsonDF = [movieId: int, releaseYear: int]


           case Some(json: Map[String, Any]) =>
                           ^
It would fail on the following inputs: None, Some((x: Any forSome x not in scala.collection.immutable.Map[?,?]))
         JSON.parseFull(line) match {
                       ^


[movieId: int, releaseYear: int]

In [23]:
// Step 3: Convert movies DataFrame to RDD for join
val moviesRDD: RDD[(Int, (String, String))] = moviesDF.rdd.map(row => {
  val movieId = row.getAs[Int]("movieId")
  val title = row.getAs[String]("title")
  val genres = row.getAs[String]("genres")
  (movieId, (title, genres))
})

moviesRDD = MapPartitionsRDD[128] at map at <console>:49


MapPartitionsRDD[128] at map at <console>:49

In [37]:
// Perform RDD join to enrich `releaseYear` where missing
val enrichedRDD: RDD[(Int, (String, String))] = moviesRDD.leftOuterJoin(parsedMetadataRDD).mapValues { 
    case ((title, genres), releaseYear) =>
        var enrichedTitled = title
        if (!title.matches(".*\\(\\d{4}\\)$")) {
            enrichedTitled = s"$title (${releaseYear.get})"
        }
        (enrichedTitled, genres)
}

enrichedRDD = MapPartitionsRDD[150] at mapValues at <console>:61


MapPartitionsRDD[150] at mapValues at <console>:61

In [39]:
// Step 4: Convert enriched RDD back to DataFrame
val enrichedMoviesDF: DataFrame = enrichedRDD.map {
  case (movieId, (title, genres)) =>
    (movieId, title, genres)
}.toDF("movieId", "title", "genres")

enrichedMoviesDF = [movieId: int, title: string ... 1 more field]


[movieId: int, title: string ... 1 more field]

In [40]:
// Step 5: Validate all movies have `releaseYear`
val missingYearsCount = enrichedMoviesDF.filter(!col("title").rlike("\\(\\d{4}\\)$")).count()
if (missingYearsCount > 0) {
  println(s"Warning: $missingYearsCount movies still missing releaseYear.")
} else {
  println("All movies have a releaseYear.")
}

missingYearsCount = 0


All movies have a releaseYear.


0

In [45]:
// Step 6: Save the enriched DataFrame as Parquet in HDFS
val outputParquetPath = "hdfs:///user/shraman_jana/enriched-movies.parquet"
enrichedMoviesDF.write.mode("overwrite").parquet(outputParquetPath)

println(s"Enriched movies data saved to $outputParquetPath")

// Stop Spark Session
spark.stop()

org.apache.spark.SparkException: Job aborted.

In [46]:
spark.stop()

lastException = null


null