In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import explode
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, TimestampType


In [0]:
# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Notebook parameter: the ingest date comes from a text widget whose default is empty.
dbutils.widgets.text("ingest_date", "", "Ingest Date")
ingest_date = dbutils.widgets.get("ingest_date")

# Glob that matches every movie JSON file to be loaded.
path_pattern = "dbfs:/FileStore/tables/movie_*.json"

# Read all matching files with the JSON reader's "multiline" option enabled.
movies_raw_df = (
    spark.read
    .option("multiline", "true")
    .json(path_pattern)
)

# Bronze (raw): persist the combined input unchanged as a Delta table.
(
    movies_raw_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze_movies_raw")
)

# Give each element of the `movie` column its own row, then promote the
# fields of that element to top-level columns.
movies_df = (
    movies_raw_df
    .select(explode(movies_raw_df["movie"]).alias("movie_column"))
    .selectExpr("movie_column.*")
)

# Bronze (flattened): one row per movie, stored as a Delta table.
(
    movies_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze_movies")
)


Mark as Quarantined

In [0]:
# Stamp every record with the ingest date supplied through the notebook widget.
movies_df = movies_df.withColumn("ingest_date", lit(ingest_date))

# Split on the sign of RunTime: negative values are tagged as quarantined,
# zero and positive values are kept as clean records.
# `F` is the pyspark.sql.functions module; a bare `col` is not imported by this notebook.
# NOTE(review): rows whose RunTime is null satisfy neither filter, so they end up
# in neither frame - confirm that this is intended.
quarantined_df = (
    movies_df
    .filter(F.col("RunTime") < 0)
    .withColumn("status", lit("quarantined"))
)
clean_movies_df = movies_df.filter(F.col("RunTime") >= 0)

# Persist the quarantined records in their own Delta table.
quarantined_df.write.format("delta").mode("overwrite").saveAsTable("quarantined_movies")

Data Cleaning for Negative Runtime

In [0]:
# Start the cleaning stage from the flattened Bronze table.
movies_bronze = spark.table("bronze_movies")

# Replace RunTime with its absolute value so negative runtimes become positive.
# F.abs is the Spark column function; the bare name `abs` would resolve to
# Python's builtin because this notebook never imports Spark's `abs`.
movies_cleaned = movies_bronze.withColumn("RunTime", F.abs(movies_bronze["RunTime"]))

Enforce a Minimum Budget of 1 Million

In [0]:
# Floor Budget at one million: values below the threshold are replaced by it,
# every other value (including null, which fails the comparison) is kept as is.
# `F` is the pyspark.sql.functions module; bare `when` / `col` are not imported by this notebook.
movies_cleaned = movies_cleaned.withColumn(
    "Budget",
    F.when(F.col("Budget") < 1000000, 1000000).otherwise(F.col("Budget")),
)


Genre Lookup Table -- Silver Table

In [0]:
# Unique genres
unique_genres = movies_cleaned.select('genres').distinct().withColumnRenamed('genres', 'genre')

# Save as Silver table
unique_genres.show()
unique_genres.write.format("delta").mode("overwrite").saveAsTable("silver_genres")

+--------------------+
|               genre|
+--------------------+
|[{1, Adventure}, ...|
|[{2, Fantasy}, {7...|
|[{1, Adventure}, ...|
|[{1, Adventure}, ...|
|[{4, Drama}, {5, ...|
|[{1, Adventure}, ...|
|[{6, Action}, {7,...|
|[{5, Horror}, {6,...|
|[{5, Horror}, {6,...|
|[{1, Adventure}, ...|
|[{1, Adventure}, ...|
|[{1, Adventure}, ...|
|[{4, Drama}, {7, ...|
|[{1, Adventure}, ...|
|[{1, Adventure}, ...|
|[{4, Drama}, {8, ...|
|[{4, Drama}, {8, ...|
|[{1, Adventure}, ...|
|[{1, Adventure}, ...|
|[{4, Drama}, {9, ...|
+--------------------+
only showing top 20 rows



In [0]:
# Remove duplicates
unique_movies_df = movies_bronze_df.dropDuplicates(['Id'])

# Add a new column
updated_movies_df = unique_movies_df.withColumn('status', lit('new'))


Final Silver Table

In [0]:
# Updated Silver table
updated_movies_df.write.format("delta").mode("overwrite").saveAsTable("silver_movie_table")

silver_movies_df = spark.table("silver_movie_table")
display(silver_movies_df)


BackdropUrl,Budget,CreatedBy,CreatedDate,Id,ImdbUrl,OriginalLanguage,Overview,PosterUrl,Price,ReleaseDate,Revenue,RunTime,Tagline,Title,TmdbUrl,UpdatedBy,UpdatedDate,genres,status
https://image.tmdb.org/t/p/original//lXhgCODAbBXL5buk9yEmTpOoOgR.jpg,94000000.0,,2021-04-03T16:51:30.1733333,26,https://www.imdb.com/title/tt0167260,en,"Aragorn is revealed as the heir to the ancient kings as he, Gandalf and the other members of the broken fellowship struggle to save Gondor from Sauron's forces. Meanwhile, Frodo and Sam take the ring closer to the heart of Mordor, the dark lord's realm.",https://image.tmdb.org/t/p/w342//rCzpDGLbOoPwLjy3OAm5NUPOTrC.jpg,9.9,2003-12-01T00:00:00,1118888979.0,201,The eye of the enemy is moving.,The Lord of the Rings: The Return of the King,https://www.themoviedb.org/movie/122,,,"List(List(1, Adventure), List(2, Fantasy), List(6, Action))",new
https://image.tmdb.org/t/p/original//orjiB3oUIsyz60hoEqkiGpy5CeO.jpg,356000000.0,,2021-04-03T16:51:30.1733333,29,https://www.imdb.com/title/tt4154796,en,"After the devastating events of Avengers: Infinity War, the universe is in ruins due to the efforts of the Mad Titan, Thanos. With the help of remaining allies, the Avengers must assemble once more in order to undo Thanos' actions and restore order to the universe once and for all, no matter what consequences may be in store.",https://image.tmdb.org/t/p/w342//ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,9.9,2019-04-24T00:00:00,2797800564.0,181,Part of the journey is the end.,Avengers: Endgame,https://www.themoviedb.org/movie/299534,,,"List(List(1, Adventure), List(6, Action), List(13, Science Fiction))",new
https://image.tmdb.org/t/p/original//jlh5bNiSPcbQ79Nt31kE2GxIR3h.jpg,170000000.0,,2021-04-03T16:51:30.2433333,474,https://www.imdb.com/title/tt0970179,en,"Orphaned and alone except for an uncle, Hugo Cabret lives in the walls of a train station in 1930s Paris. Hugo's job is to oil and maintain the station's clocks, but to him, his more important task is to protect a broken automaton and notebook left to him by his late father. Accompanied by the goddaughter of an embittered toy merchant, Hugo embarks on a quest to solve the mystery of the automaton and find a place he can call home.",https://image.tmdb.org/t/p/w342//vDAhBTIHvIoNOlgCfmLirVgUK2p.jpg,9.9,2011-11-22T00:00:00,185770160.0,126,One of the most legendary directors of our time takes you on an extraordinary adventure.,Hugo,https://www.themoviedb.org/movie/44826,,,"List(List(1, Adventure), List(4, Drama), List(17, Family))",new
https://image.tmdb.org/t/p/original//x0VXCWSTny5JRvpgDnw5ptwQyhA.jpg,88000000.0,,2021-04-03T16:51:30.3733333,964,https://www.imdb.com/title/tt3829266,en,"When a kid accidentally triggers the universe's most lethal hunters' return to Earth, only a ragtag crew of ex-soldiers and a disgruntled female scientist can prevent the end of the human race.",https://image.tmdb.org/t/p/w342//wMq9kQXTeQCHUZOG4fAe5cAxyUA.jpg,9.9,2018-09-05T00:00:00,160542134.0,107,The hunt has evolved,The Predator,https://www.themoviedb.org/movie/346910,,,"List(List(1, Adventure), List(5, Horror), List(6, Action), List(7, Comedy), List(10, Thriller), List(13, Science Fiction))",new
https://image.tmdb.org/t/p/original//l7gkk5K0NnVSqZ0Qr4zkNskH9AC.jpg,8000000.0,,2021-04-03T16:51:30.5266667,1677,https://www.imdb.com/title/tt0804497,en,A clinically depressed teenager gets a new start after he checks himself into an adult psychiatric ward.,https://image.tmdb.org/t/p/w342//jAIoL0275PFrx9jrmneWAqmBvtD.jpg,9.9,2010-10-08T00:00:00,6491240.0,102,Sometimes what's in your head isn't as crazy as you think.,It's Kind of a Funny Story,https://www.themoviedb.org/movie/43923,,,"List(List(4, Drama), List(7, Comedy))",new
https://image.tmdb.org/t/p/original//u4d4vymNPo1UuJ6ihtyy9OQgco2.jpg,140000000.0,,2021-04-03T16:51:30.5300000,1697,https://www.imdb.com/title/tt0122151,en,"In the combustible action franchise's final installment, maverick detectives Martin Riggs and Roger Murtaugh square off against Asian mobster Wah Sing Ku, who's up to his neck in slave trading and counterfeit currency. With help from gumshoe Leo Getz and smart-aleck rookie cop Lee Butters, Riggs and Murtaugh aim to take down Ku and his gang.",https://image.tmdb.org/t/p/w342//qc0GVMhZk44icQdynIin0xkE9YZ.jpg,9.9,1998-07-10T00:00:00,285444603.0,127,The faces you love. The action you expect.,Lethal Weapon 4,https://www.themoviedb.org/movie/944,,,"List(List(1, Adventure), List(6, Action), List(7, Comedy), List(10, Thriller), List(11, Crime))",new
https://image.tmdb.org/t/p/original//per8xKUsoXn15nPNFwkqUdTk7SY.jpg,30000000.0,,2021-04-03T16:51:30.5466667,1806,https://www.imdb.com/title/tt0329101,en,"In an attempt to free himself from a state of forgotten limbo, evil dream-demon Freddy Krueger (Robert Englund) devises a plan to manipulate un-dead mass murderer Jason Voorhees (Ken Kirzenger) into slicing-and-dicing his way through the teenage population of Springwood. But when the master of dreams loses control of his monster, a brutal fight to the death is the only way out in this long anticipated crossover between two of modern horror's most notorious killers!",https://image.tmdb.org/t/p/w342//gJuWIl3xQ0QAxwIWnxsoT56bcfH.jpg,9.9,2003-08-15T00:00:00,114908830.0,97,Evil Will Battle Evil,Freddy vs. Jason,https://www.themoviedb.org/movie/6466,,,"List(List(5, Horror))",new
https://image.tmdb.org/t/p/original//5bIHG1KqyIPJphtKazUXrR7voQc.jpg,60000000.0,,2021-04-03T16:51:30.5700000,1950,https://www.imdb.com/title/tt0313737,en,"Dedicated environmental lawyer Lucy Kelson goes to work for billionaire George Wade as part of a deal to preserve a community center. Indecisive and weak-willed George grows dependent on Lucy's guidance on everything from legal matters to clothing. Exasperated, Lucy gives notice and picks Harvard graduate June Carter as her replacement. As Lucy's time at the firm nears an end, she grows jealous of June and has second thoughts about leaving George.",https://image.tmdb.org/t/p/w342//a5HMBDM66DGX8atTR77Kf8M2pb4.jpg,9.9,2002-12-19T00:00:00,93354918.0,101,Over. Done. Finished. A comedy about love at last glance.,Two Weeks Notice,https://www.themoviedb.org/movie/2642,,,"List(List(7, Comedy), List(16, Romance))",new
https://image.tmdb.org/t/p/original//lLWz9wBMwYy1YGzpa5tq9bHvCpZ.jpg,6000000.0,,2021-04-03T16:51:30.5833333,2040,https://www.imdb.com/title/tt0455590,en,"Young Scottish doctor, Nicholas Garrigan decides it's time for an adventure after he finishes his formal education, so he decides to try his luck in Uganda, and arrives during the downfall of President Obote. General Idi Amin comes to power and asks Garrigan to become his personal doctor.",https://image.tmdb.org/t/p/w342//quckyadj0bYCzIXMDIi4lrwJfth.jpg,9.9,2006-01-12T00:00:00,48027970.0,123,Charming. Magnetic. Murderous.,The Last King of Scotland,https://www.themoviedb.org/movie/1523,,,"List(List(4, Drama))",new
https://image.tmdb.org/t/p/original//2NslMU17WZqHF7AnjIBpu9VuKh0.jpg,68000000.0,,2021-04-03T16:51:30.6100000,2214,https://www.imdb.com/title/tt0285531,en,"Four boyhood pals perform a heroic act and are changed by the powers they gain in return. Years later, on a hunting trip in the Maine woods, they're overtaken by a vicious blizzard that harbors an ominous presence. Challenged to stop an alien force, the friends must first prevent the slaughter of innocent civilians by a military vigilante ... and then overcome a threat to the bond that unites the four of them.",https://image.tmdb.org/t/p/w342//vEA305YK7QCjK02NP5OFqYsDEVQ.jpg,9.9,2003-03-21T00:00:00,75700000.0,136,A circle of friendship. A web of mystery. A pattern of fear.,Dreamcatcher,https://www.themoviedb.org/movie/6171,,,"List(List(4, Drama), List(5, Horror), List(10, Thriller), List(13, Science Fiction))",new
