In [None]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame

// Build (or reuse, via getOrCreate) the Spark session used by every read and
// write in this script. The master is pinned to local mode.
val spark = {
  val sessionBuilder = SparkSession.builder()
  sessionBuilder
    .appName("DuplicateRecordRemoval")
    .master("local[*]")
    .getOrCreate()
}

// Load the source movies CSV from the GCS bucket; the first line of the file
// is treated as the header row.
val moviesGCS = spark.read
  .option("header", "true")
  .csv("gs://gcs_bucket_rupal/movies.csv")

// Stage a copy on HDFS (header included), replacing any earlier export at the
// same path.
val hdfsStagingWriter = moviesGCS.write
  .mode("overwrite")
  .option("header", "true")
hdfsStagingWriter.csv("hdfs:///user/rupal_gupta/Q4/movies.csv")


// Read the staged copy back from HDFS (first line is the header).
val moviesDF = spark.read
  .option("header", "true")
  .csv("hdfs:///user/rupal_gupta/Q4/movies.csv")

// Manufacture duplicates by appending up to 500 of the dataset's own rows to
// itself. No ordering is applied before limit(), so which rows are picked is
// not specified here.
// NOTE(review): if the input has fewer than 500 rows, fewer duplicates are
// added than the message below reports.
val sampleMoviesDF = moviesDF.limit(500)
val duplicateMoviesDF = moviesDF.union(sampleMoviesDF)

// Persist the duplicated dataset to HDFS (header included), replacing any
// previous output at the same path.
val duplicatedMoviesWriter = duplicateMoviesDF.write
  .mode("overwrite")
  .option("header", "true")
duplicatedMoviesWriter.csv("hdfs:///user/rupal_gupta/Q4/duplicated_movies.csv")

println("500 duplicates inserted and file updated successfully!")


// RDD is named in the type annotation below but is not among the file's
// top-level imports; bring it into scope here so the block compiles.
import org.apache.spark.rdd.RDD

// Baseline: number of rows before any de-duplication.
val initialCount = duplicateMoviesDF.count()
println(s"Initial movie record count: $initialCount")

// Pass 1 (DataFrame API): keep a single row per (movieId, title) pair.
val cleanedDF = duplicateMoviesDF.dropDuplicates(Seq("movieId", "title"))

// Pass 2 (RDD API): project each row to a (movieId, title, genres) tuple and
// drop exact repeats of that tuple.
// NOTE(review): after pass 1 no two rows share (movieId, title), so this
// distinct() should not remove anything further; it is kept to preserve the
// RDD-based step and the existing val names. Confirm whether it is required.
val cleanedRDD: RDD[(String, String, String)] = cleanedDF.rdd.map { row =>
  (
    row.getAs[String]("movieId"),
    row.getAs[String]("title"),
    row.getAs[String]("genres")
  )
}

val distinctRDD = cleanedRDD.distinct()

// Back to a DataFrame, restoring the column names lost in the tuple form.
val distinctDF = spark.createDataFrame(distinctRDD)
  .toDF("movieId", "title", "genres")

// Report how many rows the de-duplication removed and how many remain.
val finalCount = distinctDF.count()
val duplicatesRemoved = initialCount - finalCount
println(s"Duplicates removed: $duplicatesRemoved")
println(s"Final record count: $finalCount")

// Export the de-duplicated movies as Avro to GCS, then shut the session down.
//
// - mode("overwrite") matches the other writes in this script; without it the
//   writer uses its default error-if-exists mode and any rerun fails once the
//   output path is present.
// - try/finally guarantees the Spark session is stopped even when the write
//   throws.
// - The "avro" format is provided by the external spark-avro module, which
//   must be on the classpath.
// NOTE(review): the output bucket (gcs_rupal_gupta) differs from the input
// bucket (gcs_bucket_rupal) used earlier; confirm this is intentional.
try {
  distinctDF.write
    .format("avro")
    .mode("overwrite")
    .save("gs://gcs_rupal_gupta/cleaned_movies.avro")
} finally {
  spark.stop()
}