In [10]:
import org.apache.spark.sql.{SparkSession, functions => F}
import org.apache.spark.rdd.RDD

val spark = SparkSession.builder()
      .appName("AggregateMovieRatingsByGenre")
      .master("local[*]")
      .getOrCreate()

// Load CSV files from GCS
val moviesDF = spark.read.option("header", "true").csv("gs://gcs_bucket_rupal/movies.csv")
val ratingsDF = spark.read.option("header", "true").csv("gs://gcs_bucket_rupal/ratings.csv")
    
val movieRatingsDF = ratingsDF.join(moviesDF, "movieId")
// Split and explode the genres column
val explodedMoviesDF = movieRatingsDF.withColumn("genres", F.explode(F.split(F.col("genres"), "\\|")))

val explodedMoviesRDD: RDD[(String, String, String, String, String, String)] = explodedMoviesDF
      .rdd
      .map(row => (row.getString(0), row.getString(1), row.getString(2), 
                   row.getString(3), row.getString(4), row.getString(5)))
           
val genreMapping: Map[String, String] = Map(
      "Sci-Fi" -> "Science Fiction",
      "Film-Noir" -> "Film Noir"
    )

val rdd: RDD[(String, String, String, String, String, String)] = explodedMoviesRDD
      .map { case (movieId, userId , rating, timestamp, title, genres) =>
        val correctedGenre = genreMapping.getOrElse(genres, genres)
        (movieId, userId , rating, timestamp, title, correctedGenre)
      }
val genreRatingRDD: RDD[(String, (Double, Int))] = rdd.map { case (movieId, userId, rating, timestamp, title, genres) =>
      (genres, (rating.toDouble, 1))
    }

    // reduceByKey to accumulate the sum of ratings and count of ratings for each genre
val ratingSumAndCountRDD: RDD[(String, (Double, Int))] = genreRatingRDD
      .reduceByKey { case ((rating1, count1), (rating2, count2)) =>
        (rating1 + rating2, count1 + count2)  // Sum ratings and counts
      }

    // Calculate the average rating for each genre by dividing sum by count
val averageRatingRDD: RDD[(String, Double)] = ratingSumAndCountRDD
      .mapValues { case (sum, count) => sum / count }

averageRatingRDD.collect().foreach { case (genre, avgRating) =>
  println(s"Genre: $genre, Average Rating: $avgRating")
}

val resultDF = spark.createDataFrame(genreAverageRatingRDD).toDF("genres", "average_rating")
val outputPath = s"hdfs:///user/rupal_gupta/Q1/average_ratings.parquet"
resultDF.write.mode("overwrite").parquet(outputPath)

spark.stop()


Genre: IMAX, Average Rating: 3.593312447839248
Genre:  We're Comin' To Get Ya!"" (2014)", Average Rating: 2.0
Genre: Crime, Average Rating: 3.6917711184948736
Genre: Fantasy, Average Rating: 3.512174705402107
Genre: Western, Average Rating: 3.6001753109842554
Genre: Science Fiction, Average Rating: 3.4916991949223912
Genre: Animation, Average Rating: 3.6153322869262636
Genre: Thriller, Average Rating: 3.5317020152396505
Genre: Horror, Average Rating: 3.3071549944529486
Genre: Adventure, Average Rating: 3.5234385724723545
Genre: Romance, Average Rating: 3.5450028644529983
Genre: Documentary, Average Rating: 3.6911815290871948
Genre: War, Average Rating: 3.7916994435766664
Genre: Comedy, Average Rating: 3.4323858239436777
Genre: (no genres listed), Average Rating: 3.3681574110778767
Genre: Action, Average Rating: 3.476407141777424
Genre: Film Noir, Average Rating: 3.915774014636868
Genre: Musical, Average Rating: 3.554276956937205
Genre: Children, Average Rating: 3.4392409733948646
Genre

spark = org.apache.spark.sql.SparkSession@41aac32b
moviesDF = [movieId: string, title: string ... 1 more field]
ratingsDF = [userId: string, movieId: string ... 2 more fields]
movieRatingsDF = [movieId: string, userId: string ... 4 more fields]
explodedMoviesDF = [movieId: string, userId: string ... 4 more fields]
explodedMoviesRDD = MapPartitionsRDD[30] at map at <console>:55
genreMapping = Map(Sci-Fi -> Science Fiction, Film-Noir -> Film Noir)...


import org.apache.spark.sql.{SparkSession, functions=>F}


Map(Sci-Fi -> Science Fiction, Film-Noir -> Film Noir)...