In [0]:
from pyspark.sql import SparkSession

# Build the Spark session for the transactions silver transform.
# getOrCreate() hands back an already-active session when one exists.
spark = (
    SparkSession.builder
    .appName("TRANSACTIONS_SILVER_TRF")
    # Shuffle / resource sizing.
    .config("spark.sql.shuffle.partitions", 150)
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", 4)
    .config("spark.executor.instances", 2)
    .config("spark.driver.memory", "2g")
    # Arrow-based columnar transfer between the JVM and Python.
    # "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0; the
    # replacement key is "spark.sql.execution.arrow.pyspark.enabled".
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", 10000)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Adaptive query execution: coalesce small shuffle partitions and split
    # skewed join partitions at runtime.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "128m")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    .getOrCreate()
)

In [0]:
from pyspark.sql.functions import *
from delta.tables import *
from pyspark.sql.types import *
from pyspark.sql.window import Window



In [0]:
# Read the bronze transactions CSV: the first row is the header and column
# types are inferred from the data.
transactions_reader = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .option("header", True)
)
df_trans = transactions_reader.load("/Volumes/dev/bronze/transactions/transactions_data.csv")
display(df_trans)

In [0]:
# Days between service and payment (PaidDate - ServiceDate), computed once and
# shared by both checks below.
trans_with_datediff = df_trans.withColumn(
    "datediff", datediff(col("PaidDate"), col("ServiceDate"))
)

# Rows paid after the service date. Keep the DataFrame in the variable and
# display it separately: .display() returns None, so chaining it onto the
# assignment would leave the variable empty.
diff_ser_paiddt = trans_with_datediff.filter(col("datediff") > 0)
diff_ser_paiddt.display()

# Rows paid before the service date (treated as invalid).
diff_ser_paiddt_invalid = trans_with_datediff.filter(col("datediff") < 0)
diff_ser_paiddt_invalid.display()


**Remove non-alphanumeric characters**

In [0]:
# Show the column names so they can be copied into the cleanup list.
# DataFrame.columns is already a list, so it prints the same way.
print(df_trans.columns)

In [0]:
# Columns of the transactions frame that the cleanup cells iterate over.
df_trans_col = [
    'TransactionID',
    'EncounterID',
    'PatientID',
    'ProviderID',
    'DeptID',
    'VisitDate',
    'ServiceDate',
    'PaidDate',
    'VisitType',
    'Amount',
    'AmountType',
    'PaidAmount',
    'ClaimID',
    'PayorID',
    'ProcedureCode',
    'ICDCode',
    'LineOfBusiness',
    'MedicaidID',
    'MedicareID',
    'InsertDate',
    'ModifiedDate',
]

In [0]:
# Strip every character that is not an ASCII letter or digit, trim, and
# title-case the values -- but only for columns whose current type is string.
# regexp_replace on a non-string column casts it to string first, so a numeric
# or date/timestamp value would lose its '.', '-' and ':' characters and the
# column would silently become text.
# NOTE(review): date columns that were read as plain strings are still
# stripped here -- confirm the inferred schema before relying on them.
column_types = dict(df_trans.dtypes)
for col_name in df_trans_col:
    # A name missing from the frame is treated as string so that withColumn
    # still raises for it instead of being skipped silently.
    if column_types.get(col_name, "string") != "string":
        continue
    alnum_only = regexp_replace(col(col_name), r"[^a-zA-Z0-9]", "")
    df_trans = df_trans.withColumn(col_name, initcap(trim(alnum_only)))

In [0]:
# Inspect the frame after the character cleanup.
display(df_trans)

In [0]:
def non_alpha(df_trans):
  """Return ``df_trans`` as received.

  For each name in the notebook-level ``df_trans_col`` list the loop builds a
  DataFrame whose column is trimmed and upper-cased, but that DataFrame is
  never kept, so the input is returned without any change.
  """
  for i in df_trans_col:
    # NOTE(review): withColumn returns a new DataFrame and the result is
    # discarded here, so this loop has no effect. Assigning it back would
    # upper-case every listed column; the title-case comparisons made on
    # AmountType later in the notebook would then need updating as well.
    df_trans.withColumn(i, upper(trim(col(i))))
  return df_trans
# Rebinds df_trans to the value returned above (currently the same frame).
df_trans = non_alpha(df_trans)
df_trans.display()

In [0]:
def upper_col_names(df_trans):
    """Return a DataFrame with the same columns, each renamed to upper case.

    Column order and values are unchanged; only the names are altered.
    """
    renamed = []
    for name in df_trans.columns:
        renamed.append(col(name).alias(name.upper()))
    return df_trans.select(renamed)


df_trans = upper_col_names(df_trans)
df_trans.display()

In [0]:
# Show the frame with its upper-cased column names.
df_trans.display()

In [0]:
# Compare AmountType on a normalised key (letters/digits only, upper case).
# The cleanup earlier in the notebook strips non-alphanumeric characters, so a
# literal "Co-pay" can never be present in the column; matching on the
# normalised key keeps all four categories reachable regardless of case or
# punctuation.
amount_type_key = upper(regexp_replace(col("AmountType"), r"[^a-zA-Z0-9]", ""))

# Map each amount type to its short code; anything unrecognised (including
# null) falls through to "NEW".
df_trans_type = df_trans.withColumn(
    "Paid_type",
    when(amount_type_key == "MEDICARE", "MEDC")
    .when(amount_type_key == "MEDICAID", "MEDA")
    .when(amount_type_key == "INSURANCE", "INS")
    .when(amount_type_key == "COPAY", "COPAY")
    .otherwise("NEW"),
)
# Display separately: .display() returns None, so chaining it onto the
# assignment would leave df_trans_type empty and break the groupBy below.
df_trans_type.display()

# Row count per paid type.
df_trans_grp = df_trans_type.groupBy("Paid_type").agg(count("*").alias("count"))
df_trans_grp.display()

In [0]:
from pyspark.sql.functions import col, when, count

# Compare AmountType on a normalised key (letters/digits only, upper case).
# The cleanup earlier in the notebook strips non-alphanumeric characters, so a
# literal "Co-pay" can never be present in the column; matching on the
# normalised key keeps the COPAY branch reachable.
paid_type_key = upper(regexp_replace(col("AmountType"), r"[^a-zA-Z0-9]", ""))

# Map each amount type to its short code; anything unrecognised (including
# null) falls through to "NEW".
df_trans_type = df_trans.withColumn(
    "Paid_type",
    when(paid_type_key == "MEDICARE", "MEDC")
    .when(paid_type_key == "MEDICAID", "MEDA")
    .when(paid_type_key == "INSURANCE", "INS")
    .when(paid_type_key == "COPAY", "COPAY")
    .otherwise("NEW"),
)

display(df_trans_type)

# Row count per paid type.
df_trans_grp = df_trans_type.groupBy("Paid_type").agg(count("*").alias("count"))

display(df_trans_grp)