In [None]:
import org.apache.spark.sql.{SparkSession, SaveMode, functions => F}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import java.time.Instant
import java.time.ZoneId
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

// Session used by every step below. `local[*]` runs Spark in-process on all
// available cores; `getOrCreate` reuses an already-active session if there is one.
val spark = {
  val sessionBuilder = SparkSession
    .builder()
    .appName("TimeBasedDataPartitioning")
    .master("local[*]")
  sessionBuilder.getOrCreate()
}

// Raw ratings file. Only the header option is set, so no schema is inferred and
// the columns arrive as strings until they are cast below.
val ratingsDF = spark.read
  .option("header", "true")
  .csv("gs://gcs_bucket_rupal/ratings.csv")

// Columns to (re)define, in application order: typed copies of the id and rating
// columns, plus a calendar `year` derived from the epoch-seconds `timestamp`.
val derivedColumns = Seq(
  "userId"  -> F.col("userId").cast(StringType),
  "movieId" -> F.col("movieId").cast(StringType),
  "rating"  -> F.col("rating").cast(DoubleType),
  "year"    -> F.year(F.from_unixtime(F.col("timestamp").cast("double")))
)

// Apply each definition on top of the previous result, exactly like a chain of
// `withColumn` calls; all other columns (e.g. `timestamp`) are kept as-is.
val ratingsWithYearDF = derivedColumns.foldLeft(ratingsDF) {
  case (df, (columnName, expression)) => df.withColumn(columnName, expression)
}

// Show the resulting column names and types.
ratingsWithYearDF.printSchema()

// Re-shape every rating row into a pair keyed by its year (as a string), with the
// (userId, movieId, rating) triple as the value.
// NOTE(review): a null `year` or `rating` would be unboxed to 0 / 0.0 here rather
// than failing — confirm the input has no missing or malformed values.
val ratingsRDD: RDD[(String, (String, String, Double))] =
  ratingsWithYearDF.rdd.map { r =>
    val yearKey = r.getAs[Int]("year").toString
    val record = (
      r.getAs[String]("userId"),
      r.getAs[String]("movieId"),
      r.getAs[Double]("rating")
    )
    yearKey -> record
  }

// One entry per year, holding all of that year's records as an in-memory List.
val groupedByYearRDD = ratingsRDD
  .groupByKey()
  .mapValues(records => records.toList)

// Layout of the per-year output: the three rating fields, all nullable.
// The `year` itself is not a column; it only appears in the output path.
val schema = new StructType()
  .add("userId", StringType, nullable = true)
  .add("movieId", StringType, nullable = true)
  .add("rating", DoubleType, nullable = true)

// Write one Parquet dataset per year, for at most ten year groups, then shut the
// session down.
//
// Only the year *keys* are brought back to the driver. The previous version
// pulled every record of the selected groups into driver memory with `take(10)`
// and then shipped them back out with `parallelize`, which makes the driver the
// bottleneck (and an out-of-memory risk) for a large ratings file. Here each
// year's rows are produced on the executors by filtering `ratingsRDD` directly.
//
// NOTE(review): as before, only ten groups are written, and which ten depends on
// how the grouped RDD happens to be partitioned — raise or remove the cap if
// every year is required.
try {
  // `ratingsRDD` is scanned once to pick the years and once per written year;
  // caching avoids re-reading and re-parsing the CSV for each of those passes.
  ratingsRDD.cache()

  val selectedYears: Array[String] = groupedByYearRDD.keys.take(10)

  selectedYears.foreach { year =>
    val rowsRDD: RDD[Row] = ratingsRDD
      .filter { case (recordYear, _) => recordYear == year }
      .map { case (_, (userId, movieId, rating)) => Row(userId, movieId, rating) }

    val yearDF = spark.createDataFrame(rowsRDD, schema)
    yearDF.write
      .mode(SaveMode.Overwrite)
      .parquet(s"hdfs:///user/rupal_gupta/user-data/Q5/$year/ratings.parquet")
  }
} finally {
  // Always release the session (and with it the cached RDD), even when a write
  // fails part-way through.
  spark.stop()
}


root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- year: integer (nullable = true)

