In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Core column and expression functions
from pyspark.sql.functions import col, lit, when, lag, datediff

# Date-related functions
from pyspark.sql.functions import year, month

# Aggregation functions
from pyspark.sql.functions import sum as _sum, avg, count, rank

In [0]:
# Read the outpatient claims dataset stored as Parquet under the silver path.
outpatient_silver = (
    spark.read
    .parquet("/mnt/silver/outpatient/outpatient_silver")
)

In [0]:
# Per-claim visit duration: datediff(end, start) gives the number of days from
# clm_from_dt to clm_thru_dt, so a claim whose two dates are equal yields 0.
# Uses the `F` alias from the notebook's import cell rather than re-importing
# individual functions inside this cell.
outpatient_duration = outpatient_silver.select(
    "desynpuf_id",
    "clm_id",
    "clm_from_dt",
    "clm_thru_dt",
    F.datediff(F.col("clm_thru_dt"), F.col("clm_from_dt")).alias("visit_duration_days"),
)

# Persist to the gold path; "overwrite" replaces any output already there.
outpatient_duration.write.mode("overwrite").parquet("/mnt/gold/outpatient/visit_duration")


In [0]:
# Preview the result; the arguments are show()'s defaults, written out explicitly.
outpatient_duration.show(n=20, truncate=True)

Most common outpatient procedures (claim counts per `hcpcs_cd_1`)

In [0]:
# Number of claims (non-null clm_id values) per hcpcs_cd_1, largest count first.
# Uses the `F` alias from the notebook's import cell rather than re-importing
# `count` inside this cell.
# NOTE(review): rows whose hcpcs_cd_1 is null are grouped together and ranked
# like any other value — confirm whether they should be filtered out first.
top_outpatient_procedures = (
    outpatient_silver
    .groupBy("hcpcs_cd_1")
    .agg(F.count("clm_id").alias("procedure_count"))
    .orderBy("procedure_count", ascending=False)
)

# Persist to the gold path; "overwrite" replaces any output already there.
top_outpatient_procedures.write.mode("overwrite").parquet("/mnt/gold/outpatient/top_procedures")


In [0]:
# Preview the result; the arguments are show()'s defaults, written out explicitly.
top_outpatient_procedures.show(n=20, truncate=True)

Outpatient claims over time (daily claim counts by `clm_from_dt`)

In [0]:
# Number of claims (non-null clm_id values) for each clm_from_dt, sorted
# ascending by clm_from_dt.
claims_by_date = (
    outpatient_silver
    .groupBy("clm_from_dt")
    .agg(F.count("clm_id").alias("daily_claims"))
    .orderBy("clm_from_dt")
)

# Persist to the gold path; "overwrite" replaces any output already there.
claims_by_date.write.mode("overwrite").parquet("/mnt/gold/outpatient/claims_by_date")


In [0]:
# Preview the result; the arguments are show()'s defaults, written out explicitly.
claims_by_date.show(n=20, truncate=True)