In [0]:
from pyspark.sql import SparkSession

# Root of the S3 bucket (s3a scheme); the other cells append file names to it.
s3_bucket = "s3a://imdb-mvp/"

# Build (or reuse) the Spark session. The S3A access/secret keys are fetched
# from the "imdb-mvp" secret scope, so no key material appears in the notebook.
spark = (
    SparkSession.builder
    .appName("Imdb-MVP")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config(
        "spark.hadoop.fs.s3a.access.key",
        dbutils.secrets.get(scope="imdb-mvp", key="AWS_ACCESS_KEY"),
    )
    .config(
        "spark.hadoop.fs.s3a.secret.key",
        dbutils.secrets.get(scope="imdb-mvp", key="AWS_SECRET_ACCESS_KEY"),
    )
    .getOrCreate()
)

In [0]:
# Read name.basics.tsv from the bucket: tab-delimited, first row is the header,
# column types inferred by Spark instead of being declared up front.
name_df = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "name.basics.tsv")
)

# Preview the first 10 rows.
name_df.show(10)

In [0]:
# Display every row whose primaryName is exactly "Christopher Nolan".
name_df.where(name_df["primaryName"] == "Christopher Nolan").show()

In [0]:
# Read title.basics.tsv from the bucket: tab-delimited, first row is the header,
# column types inferred by Spark instead of being declared up front.
title_df = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "title.basics.tsv")
)

# Preview the first 10 rows.
title_df.show(10)

In [0]:
# tconst ids of every title of type "movie" whose originalTitle starts with "Tron:".
tronIds = [
    row["tconst"]
    for row in (
        title_df
        .where(
            title_df["originalTitle"].like("Tron:%")
            & (title_df["titleType"] == "movie")
        )
        .select("tconst")
        .collect()
    )
]

In [0]:
# Read title.ratings.tsv from the bucket: tab-delimited, first row is the header,
# column types inferred by Spark instead of being declared up front.
rating_df = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "title.ratings.tsv")
)

# Preview the first 10 rows.
rating_df.show(10)

In [0]:
from pyspark.sql import functions as F

# Only the join key and the title text are needed from the titles table.
title_filtered = title_df.select("tconst", "originalTitle")

# Ratings restricted to the ids previously collected in tronIds.
ratingFiltered = rating_df.where(rating_df["tconst"].isin(tronIds))

# Left join: every filtered rating row is kept and gets its originalTitle
# wherever a title with the same tconst exists.
titleWithRatings = ratingFiltered.join(title_filtered, "tconst", "left")

titleWithRatings.show()



In [0]:
# Look up the person named "Christopher Nolan" whose primaryProfession mentions
# "director" and whose birthYear is not the "\N" placeholder.
# The id and the knownForTitles value are fetched with ONE collect so that both
# come from the same row(s) and the filter is evaluated only once.
cnRows = (
    name_df
    .filter(name_df.primaryName == "Christopher Nolan")
    .filter(name_df.primaryProfession.like("%director%"))
    .filter(name_df.birthYear != "\\N")
    .select("nconst", "knownForTitles")
    .collect()
)

# Fail with an explicit message instead of a bare IndexError on an empty result.
if not cnRows:
    raise ValueError(
        "No 'Christopher Nolan' director row with a known birthYear was found in name_df"
    )

# nconst of the first matching row.
idCn = cnRows[0]["nconst"]

print(idCn)

# knownForTitles of every matching row (each value is one comma-separated string).
idCnTitles = [row["knownForTitles"] for row in cnRows]

# Individual title ids listed for the first matching row.
splittedIds = idCnTitles[0].split(",")

print(splittedIds)

In [0]:
# Read title.principals.tsv from the bucket: tab-delimited, first row is the
# header, column types inferred by Spark instead of being declared up front.
principals_df = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "title.principals.tsv")
)

# Preview the first 10 rows.
principals_df.show(10)

In [0]:
# tconst of every principals row where the person is idCn and the category is "director".
cnDirectedTitlesId = [
    row["tconst"]
    for row in (
        principals_df
        .where(
            (principals_df["nconst"] == idCn)
            & (principals_df["category"] == "director")
        )
        .select("tconst")
        .collect()
    )
]

print(cnDirectedTitlesId)

In [0]:
# Title details (id, original title, start year, genres) for the ids in cnDirectedTitlesId.
cnTitles_df = (
    title_df
    .where(title_df["tconst"].isin(cnDirectedTitlesId))
    .select("tconst", "originalTitle", "startYear", "genres")
)

In [0]:
# Ratings (id + average only) for the ids in cnDirectedTitlesId.
filteredCnTitlesRatings = rating_df.where(
    rating_df["tconst"].isin(cnDirectedTitlesId)
).select("tconst", "averageRating")

# Attach the rating to each title; titles without a rating get a null averageRating.
cnTitlesRatings_df = cnTitles_df.join(filteredCnTitlesRatings, "tconst", "left")

# Drop the unrated titles and order the rest from highest to lowest rating.
cnTitlesRatings_df = (
    cnTitlesRatings_df
    .where(cnTitlesRatings_df["averageRating"].isNotNull())
    .orderBy(cnTitlesRatings_df["averageRating"].desc())
)

cnTitlesRatings_df.show()


In [0]:
# Top row of the rating-sorted frame (None if the frame is empty).
cnTitlesRatings_df.head()

In [0]:
# Read title.basics.tsv (tab-delimited, header row, inferred column types) ...
titleBasicsBronze = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "title.basics.tsv")
)

# ... and persist it as a Delta table, replacing whatever is already at the target path.
(
    titleBasicsBronze.write
    .format("delta")
    .mode("overwrite")
    .save(s3_bucket + "imdb_bronze/title/basics")
)


In [0]:
# Read title.principals.tsv (tab-delimited, header row, inferred column types) ...
titlePrincipalsBronze = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "title.principals.tsv")
)

# ... and persist it as a Delta table, replacing whatever is already at the target path.
(
    titlePrincipalsBronze.write
    .format("delta")
    .mode("overwrite")
    .save(s3_bucket + "imdb_bronze/title/principals")
)

In [0]:
# Read title.ratings.tsv (tab-delimited, header row, inferred column types) ...
titleRatingsBronze = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "title.ratings.tsv")
)

# ... and persist it as a Delta table, replacing whatever is already at the target path.
(
    titleRatingsBronze.write
    .format("delta")
    .mode("overwrite")
    .save(s3_bucket + "imdb_bronze/title/ratings")
)

In [0]:
# Read name.basics.tsv (tab-delimited, header row, inferred column types) ...
nameBasicsBronze = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv(s3_bucket + "name.basics.tsv")
)

# ... and persist it as a Delta table, replacing whatever is already at the target path.
(
    nameBasicsBronze.write
    .format("delta")
    .mode("overwrite")
    .save(s3_bucket + "imdb_bronze/name/basics")
)

In [0]:
# Load the Delta table stored under imdb_bronze/title/basics back into a DataFrame.
bronzeTitlesBasics = spark.read.load(
    s3_bucket + "imdb_bronze/title/basics",
    format="delta",
)

In [0]:
from pyspark.sql import functions as F

# Row count of the bronze titles table; denominator for the percentages below.
total = bronzeTitlesBasics.count()

# One aggregate per column. when() without otherwise() yields null for rows that
# do not satisfy the condition and count() skips nulls, so each expression counts
# the rows in which that column IS null.
# NOTE(review): the TSV reads in this notebook set no nullValue option, so "\N"
# placeholders are presumably stored as literal strings and are NOT counted
# here -- confirm whether they should be treated as nulls.
exprs = [
    F.count(F.when(F.col(c).isNull(), c)).alias(c + "_nulls")
    for c in bronzeTitlesBasics.columns
]

result = bronzeTitlesBasics.select(exprs)

# Execute the aggregation ONCE and reuse the resulting row; collecting inside the
# loop would re-run the whole aggregation for every column.
null_counts = result.collect()[0]

# also show the percentage
for col in result.columns:
    nulls = null_counts[col]
    # Guard the empty-table case so the report does not die on a division by zero.
    pct = nulls / total * 100 if total else 0.0
    print(f"{col}: {nulls} ({pct:.2f}%)")

In [0]:
# Print the column names and data types of the bronze titles table as read back from Delta.
bronzeTitlesBasics.printSchema()

In [0]:
# Keep only the bronze title rows whose titleType is "movie".
silverMovies = bronzeTitlesBasics.where(bronzeTitlesBasics["titleType"] == "movie")