In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, to_date, desc, date_format

In [0]:
# Build (or attach to) the Spark session used by every cell below.
# The application name is kept in its own variable so later cells can reuse it.
app_name = "Gold_JobMarketplace_KPIs"
session_builder = SparkSession.builder.appName(app_name)
spark = session_builder.getOrCreate()

In [0]:
# Echo the application name requested for this notebook's session.
# Note: this prints the local `app_name` variable, not a value read back from Spark.
print(f"App Name : {app_name}")

In [0]:
# List every catalog visible to this session.
catalogs_df = spark.sql("SHOW CATALOGS")
display(catalogs_df)

In [0]:
# List the schemas available in the catalog that is currently selected.
schemas_df = spark.sql("SHOW SCHEMAS")
display(schemas_df)

In [0]:
# Make sure the job_marketplace catalog exists, then switch the session to it.
# The statements run in order: the catalog must exist before it can be selected.
catalog_statements = (
    " CREATE CATALOG IF NOT EXISTS job_marketplace ",
    " USE CATALOG job_marketplace ",
)
for statement in catalog_statements:
    spark.sql(statement)

In [0]:
# Show the schemas that currently exist in the job_marketplace catalog.
schemas_df = spark.sql("SHOW SCHEMAS")
display(schemas_df)

In [0]:
# Create the gold_layer schema in the job_marketplace catalog if it is missing.
create_gold_schema_sql = "CREATE SCHEMA IF NOT EXISTS job_marketplace.gold_layer"
spark.sql(create_gold_schema_sql)

In [0]:
# List the schemas again to confirm that gold_layer is now present.
schemas_df = spark.sql("SHOW SCHEMAS")
display(schemas_df)

In [0]:
# Load the cleaned job postings from the silver layer; every KPI below is
# derived from this DataFrame.
silver_table = "job_marketplace.silver_layer.clean_jobs"
silver_reader = spark.read.format("delta")
df = silver_reader.table(silver_table)
print("✅ Silver Layer data loaded.")
display(df)

In [0]:
# -------------------------
# KPI 1: Top Hiring Companies
# -------------------------
# Count postings per company, then rank companies from most to fewest postings.
postings_per_company = df.groupBy("company_name").agg(
    count("*").alias("job_postings")
)
top_companies = postings_per_company.sort(col("job_postings").desc())

# Expose the result to the %sql cell that follows.
top_companies.createOrReplaceTempView("kpi_top_hiring_companies")

In [0]:
%sql
-- First ten rows of the top-hiring-companies KPI view.
SELECT *
FROM kpi_top_hiring_companies
LIMIT 10

Databricks visualization. Run in Databricks to view.

In [0]:
# -------------------------
# KPI 2: Jobs by Location
# -------------------------
# Count postings for each (city, state, country) combination, busiest first.
location_keys = ["city", "state", "country"]
postings_per_location = df.groupBy(*location_keys).agg(
    count("*").alias("job_postings")
)
jobs_by_location = postings_per_location.sort(col("job_postings").desc())

# Expose the result to the %sql cell that follows.
jobs_by_location.createOrReplaceTempView("kpi_jobs_by_location")

In [0]:
%sql
-- All rows of the jobs-by-location KPI view.
SELECT *
FROM kpi_jobs_by_location

Databricks visualization. Run in Databricks to view.

In [0]:
# -------------------------
# KPI 3: Daily Job Posting Trend
# -------------------------
# Reduce the posting timestamp to a calendar date, count postings per date,
# and return the series in chronological order.
jobs_with_date = df.withColumn("date_posted", to_date(col("posted_at")))
postings_per_day = jobs_with_date.groupBy("date_posted").agg(
    count("*").alias("daily_job_postings")
)
job_trend = postings_per_day.sort("date_posted")

# Expose the result to the %sql cell that follows.
job_trend.createOrReplaceTempView("kpi_daily_job_trend")

In [0]:
%sql
-- All rows of the daily job-posting trend KPI view.
SELECT *
FROM kpi_daily_job_trend

Databricks visualization. Run in Databricks to view.

In [0]:
# -------------------------
# KPI 4: Remote vs Onsite Jobs
# -------------------------
from pyspark.sql.functions import when

# This cell previously hard-coded the column name "loacation", which looks like
# a misspelling of "location". Resolve the name against the DataFrame's actual
# columns instead: prefer the correctly spelled column, but still accept the
# misspelled one in case the silver table really carries it.
# Matching is case-insensitive, in line with Spark's default column resolution.
columns_by_lower = {name.lower(): name for name in df.columns}
location_col = next(
    (
        columns_by_lower[candidate]
        for candidate in ("location", "loacation")
        if candidate in columns_by_lower
    ),
    None,
)
if location_col is None:
    raise ValueError(
        "Cannot classify remote vs onsite jobs: the silver table has no "
        f"'location' column. Available columns: {df.columns}"
    )

# Label each posting "Remote" when its location text mentions "remote" or
# "anywhere" (case-insensitive); every other posting is labelled "Onsite".
# Rows with a NULL location do not satisfy the condition and therefore fall
# into "Onsite" as well.
remote_jobs = (
    df.withColumn(
        "job_type",
        when(col(location_col).rlike("(?i)remote|anywhere"), "Remote")
        .otherwise("Onsite")
    )
    .groupBy("job_type")
    .agg(count("*").alias("count"))
    .orderBy("job_type")
)

# Expose the result to the %sql cell that follows.
remote_jobs.createOrReplaceTempView("kpi_remote_vs_onsite")

In [0]:
%sql
-- All rows of the remote-vs-onsite KPI view.
SELECT *
FROM kpi_remote_vs_onsite

Databricks visualization. Run in Databricks to view.

In [0]:
# -------------------------
# Write Gold Layer Tables
# -------------------------

# Each KPI DataFrame is paired with the gold-layer Delta table it is saved to.
# Existing tables are overwritten, and the writes run in the order listed.
gold_outputs = [
    (top_companies, "job_marketplace.gold_layer.kpi_top_companies"),
    (jobs_by_location, "job_marketplace.gold_layer.kpi_jobs_by_location"),
    (job_trend, "job_marketplace.gold_layer.kpi_job_posting_trend"),
    (remote_jobs, "job_marketplace.gold_layer.kpi_remote_vs_onsite"),
]

for kpi_df, target_table in gold_outputs:
    kpi_df.write.mode("overwrite").format("delta").saveAsTable(target_table)

print("✅ Gold Layer KPIs generated and saved.")