In [0]:
# Make imdb.bronze the session default so unqualified table names resolve there.
# (The profiling calls later in this notebook pass fully-qualified names anyway.)
spark.sql("USE CATALOG imdb")
spark.sql("USE SCHEMA bronze")   # bronze is the DLT target schema


DataFrame[]

In [0]:
from pyspark.sql import functions as F

def profile_table(table_name: str, top_n: int = 10) -> None:
    """Print a quick data-quality profile of a Spark table.

    The report contains, in order: the row count, the schema, the number of
    nulls per column, min/max/avg for every numeric column, and the most
    frequent values of key-like / categorical columns.

    Args:
        table_name: Name of the table to profile; resolved with ``spark.table``.
        top_n: How many of the most frequent values to show for each
            key-like / categorical column.

    Returns:
        None. Everything is written to the cell output.
    """
    # Function-scope import so the cell does not depend on another import line.
    from pyspark.sql.types import NumericType

    print(f"\n===== Profiling {table_name} =====\n")
    df = spark.table(table_name)

    # Row count
    row_count = df.count()
    print(f"Row count: {row_count}\n")

    # Show schema
    df.printSchema()

    # Null counts per column, computed in a single aggregation pass.
    null_counts = df.select([
        F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
        for c in df.columns
    ])
    print("\nNull counts by column:")
    null_counts.show(truncate=False)

    # Numeric columns: min, max, avg.
    # Test the type class rather than its string form: decimals render as
    # "decimal(p,s)", so a fixed list of strings misses every precision/scale
    # that is not spelled out, as well as tinyint/smallint.
    numeric_cols = [
        field.name for field in df.schema.fields
        if isinstance(field.dataType, NumericType)
    ]
    if numeric_cols:
        print("\nNumeric columns summary:")
        df.select(
            *[F.min(c).alias(f"{c}_min") for c in numeric_cols],
            *[F.max(c).alias(f"{c}_max") for c in numeric_cols],
            *[F.avg(c).alias(f"{c}_avg") for c in numeric_cols],
        ).show(truncate=False)

    # Key / categorical columns: most frequent values.
    # A column qualifies by suffix ("const", "Id") or by being one of the
    # listed categorical names.
    categorical_names = ("titleType", "category", "region", "language")
    key_candidates = [
        c for c in df.columns
        if c.endswith(("const", "Id")) or c in categorical_names
    ]
    for c in key_candidates:
        print(f"\nTop values for {c}:")
        df.groupBy(c).count().orderBy(F.desc("count")).show(top_n, truncate=False)


In [0]:
from ydata_profiling import ProfileReport
import pandas as pd

def generate_html(table_name, file_name):
    """Profile a table with ydata-profiling and save the HTML report under DBFS FileStore.

    At most 100000 rows of the table are pulled into pandas and profiled.
    The report is written to ``/dbfs/FileStore/imdb_profiles/<file_name>``.

    Note: a later cell in this notebook defines ``generate_html`` again
    (writing to ``/Volumes/...`` instead); whichever cell ran last is the
    definition in effect.

    Args:
        table_name: Table to profile; resolved with ``spark.table``.
        file_name: File name of the HTML report inside the output folder.
    """
    print(f"Generating profile for: {table_name} ...")

    # Cap the number of rows before collecting them to the driver as pandas.
    sample_pdf = spark.table(table_name).limit(100000).toPandas()

    report = ProfileReport(
        sample_pdf,
        title=f"Profile for {table_name}",
        explorative=True,
    )

    # The reports live in DBFS FileStore, not in the bronze schema.
    dbutils.fs.mkdirs("dbfs:/FileStore/imdb_profiles")

    # Same folder as above, addressed through the local /dbfs mount.
    target_file = f"/dbfs/FileStore/imdb_profiles/{file_name}"
    report.to_file(target_file)

    print(f"Saved HTML file to: {target_file}")
    print("You can download it from: FileStore → imdb_profiles")


In [0]:
%sql
-- Volume that holds the generated HTML profile reports; it is addressed as
-- /Volumes/imdb/bronze/imdb_profiles by the report-writing cell below.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE VOLUME IF NOT EXISTS imdb.bronze.imdb_profiles;

In [0]:
from ydata_profiling import ProfileReport

def generate_html(
    table_name,
    file_name,
    max_rows=150_000,
    output_dir="/Volumes/imdb/bronze/imdb_profiles",
):
    """Profile a table with ydata-profiling and save the HTML report to a volume.

    Args:
        table_name: Table to profile; resolved with ``spark.table``.
        file_name: File name of the HTML report inside ``output_dir``.
        max_rows: Upper bound on the rows converted to pandas. ``limit()``
            takes the first rows Spark happens to return, not a random
            sample, so the report describes those rows only.
        output_dir: Directory the report is written to.

    Raises:
        ValueError: If ``max_rows`` is not a positive number.
    """
    if max_rows <= 0:
        raise ValueError(f"max_rows must be positive, got {max_rows}")

    # Cap the rows before toPandas(), which collects everything to the driver.
    df_spark = spark.table(table_name).limit(max_rows)
    pdf = df_spark.toPandas()

    profile = ProfileReport(
        pdf,
        title=f"Profile for {table_name}",
        explorative=True
    )

    # Tolerate a trailing slash on output_dir so the path never contains "//".
    output_path = f"{output_dir.rstrip('/')}/{file_name}"
    profile.to_file(output_path)
    display(f"Saved: {output_path}")

# Write one HTML profile per bronze table. Table and file names are both
# derived from the dataset name, so adding a dataset is a one-line change.
for dataset in (
    "name_basics",
    "title_akas",
    "title_basics",
    "title_crew",
    "title_episode",
    "title_principals",
    "title_ratings",
):
    generate_html(f"imdb.bronze.bronze_{dataset}", f"{dataset}_profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [00:03<00:24,  3.44s/it][A
 25%|██▌       | 2/8 [00:03<00:09,  1.60s/it][A
 38%|███▊      | 3/8 [00:04<00:06,  1.21s/it][A
 75%|███████▌  | 6/8 [00:05<00:01,  1.73it/s][A100%|██████████| 8/8 [00:05<00:00,  1.52it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Saved: /Volumes/imdb/bronze/imdb_profiles/name_basics_profile.html'

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:01<00:14,  1.58s/it][A
 20%|██        | 2/10 [00:02<00:07,  1.11it/s][A
 30%|███       | 3/10 [00:04<00:09,  1.42s/it][A
100%|██████████| 10/10 [00:04<00:00,  3.73it/s][A100%|██████████| 10/10 [00:04<00:00,  2.41it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  plt.savefig(


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Saved: /Volumes/imdb/bronze/imdb_profiles/title_akas_profile.html'

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:04<00:45,  4.52s/it][A
 18%|█▊        | 2/11 [00:04<00:17,  2.00s/it][A
 36%|███▋      | 4/11 [00:04<00:05,  1.27it/s][A
 45%|████▌     | 5/11 [00:05<00:03,  1.70it/s][A
 73%|███████▎  | 8/11 [00:05<00:00,  3.43it/s][A
 91%|█████████ | 10/11 [00:05<00:00,  4.03it/s][A100%|██████████| 11/11 [00:05<00:00,  1.97it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Saved: /Volumes/imdb/bronze/imdb_profiles/title_basics_profile.html'

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:12,  3.11s/it][A100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Saved: /Volumes/imdb/bronze/imdb_profiles/title_crew_profile.html'

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:09,  2.00s/it][A100%|██████████| 6/6 [00:02<00:00,  3.00it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Saved: /Volumes/imdb/bronze/imdb_profiles/title_episode_profile.html'

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [00:01<00:07,  1.12s/it][A
 25%|██▌       | 2/8 [00:01<00:03,  1.85it/s][A
 38%|███▊      | 3/8 [00:01<00:03,  1.62it/s][A
 50%|█████     | 4/8 [00:02<00:01,  2.37it/s][A
 62%|██████▎   | 5/8 [00:02<00:00,  3.20it/s][A
 75%|███████▌  | 6/8 [00:02<00:00,  2.78it/s][A100%|██████████| 8/8 [00:02<00:00,  3.01it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Saved: /Volumes/imdb/bronze/imdb_profiles/title_principals_profile.html'

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:07,  1.90s/it][A100%|██████████| 5/5 [00:01<00:00,  2.62it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Saved: /Volumes/imdb/bronze/imdb_profiles/title_ratings_profile.html'

In [0]:
# Print the Spark-side profile for every bronze table, in the original order.
for dataset in (
    "name_basics",
    "title_basics",
    "title_akas",
    "title_crew",
    "title_episode",
    "title_principals",
    "title_ratings",
):
    profile_table(f"imdb.bronze.bronze_{dataset}")
