In [0]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Register the storage-account access key on the Spark session so abfss:// paths
# on the ksrdatadlsa account can be read and written. The key is fetched from a
# Databricks secret scope at call time and is never bound to a notebook variable.
account_key_conf = "fs.azure.account.key.ksrdatadlsa.dfs.core.windows.net"
spark.conf.set(
    account_key_conf,
    dbutils.secrets.get(scope="optumscope1", key="optumkeysstore"),
)

In [0]:
# Load the raw claims extract: the first row is the header, column types are
# inferred from the data, and a double quote escapes quotes inside quoted fields.
raw_claims_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Claims.csv"
Claims = spark.read.csv(
    raw_claims_path,
    header=True,
    inferSchema=True,
    escape='"',
)

In [0]:
# Remove rows that are exact duplicates across every column
Claims = Claims.drop_duplicates()

In [0]:
# Verify the row count and the column count
row_count = Claims.count()
column_count = len(Claims.columns)
(row_count, column_count)

(70, 8)

In [0]:
# Count SQL NULLs per column; the result is a single row with one count per column.
# Only true NULLs are detected here (isNull) - placeholder strings are not.
from pyspark.sql.functions import count, col, isnan, when

null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in Claims.columns
]
Claims.select(null_count_exprs).show()


+--------+----------+------------+------+-----------------+----------+------------+----------+
|claim_id|patient_id|disease_name|SUB_ID|Claim_Or_Rejected|claim_type|claim_amount|claim_date|
+--------+----------+------------+------+-----------------+----------+------------+----------+
|       0|         0|           0|     0|                0|         0|           0|         0|
+--------+----------+------------+------+-----------------+----------+------------+----------+



In [0]:
from pyspark.sql.functions import col, asc

def analyze_columns(df):
    """Print a simple data-quality profile for every column of a Spark DataFrame.

    For each column the report lists:
      * the number of distinct values (NULL counts as one distinct value),
      * the number of NULL values,
      * how many values start with a digit, with an ASCII letter, or with
        anything else (an empty string falls into the "anything else" bucket),
      * how many values contain at least one ASCII letter, digit, or
        non-alphanumeric character.

    NULL values are only reflected in the NULL count; they never match any of
    the pattern-based counters.

    The seven NULL/pattern counters for all columns are computed in a single
    aggregation job; only the distinct count still needs one job per column.

    Args:
        df: Spark DataFrame to profile.

    Returns:
        None. The report is written to stdout.
    """
    # Imported locally under a namespace so the function does not depend on
    # whichever names other notebook cells happened to import.
    from pyspark.sql import functions as F

    columns = df.columns
    if not columns:
        # DataFrame.agg() needs at least one expression; nothing to report.
        return

    def quoted(name):
        # Backtick-quote the name so columns containing dots or spaces resolve
        # as a single top-level column instead of a nested field path.
        return "`{}`".format(name.replace("`", "``"))

    def count_if(condition):
        # count() skips NULLs, so only rows where the condition is true are
        # counted - the same rows DataFrame.filter(condition) would keep.
        return F.count(F.when(condition, 1))

    metrics_per_column = 7
    aggregations = []
    for column in columns:
        raw = F.col(quoted(column))
        # Apply the regexes to the textual form of the value explicitly rather
        # than relying on implicit type coercion for non-string columns.
        text = raw.cast("string")
        aggregations += [
            count_if(raw.isNull()),
            count_if(text.rlike("^[0-9]")),
            count_if(text.rlike("^[A-Za-z]")),
            count_if(~text.rlike("^[A-Za-z0-9]")),
            count_if(text.rlike("[A-Za-z]")),
            count_if(text.rlike("[0-9]")),
            count_if(text.rlike("[^A-Za-z0-9]")),
        ]
    aggregations = [
        expression.alias("m{}".format(position))
        for position, expression in enumerate(aggregations)
    ]

    # A global aggregation always yields exactly one row, even for an empty frame.
    stats = df.agg(*aggregations).first()

    for position, column in enumerate(columns):
        offset = position * metrics_per_column
        (null_count, numeric_count, text_count, special_char_count,
         contains_string, contains_number, contains_spl_char) = stats[offset:offset + metrics_per_column]
        # distinct() keeps NULL as a value of its own, so this stays a separate job.
        distinct_count = df.select(quoted(column)).distinct().count()

        print("Distinct Count of {} column: {}".format(column, distinct_count))
        print("Null values Count of {} column: {}\n".format(column, null_count))
        print("{} Column Start With Number: {}".format(column, numeric_count))
        print("{} Column Start With String: {}".format(column, text_count))
        print("{} Column Start With Spl.Cha: {}".format(column, special_char_count))
        print("{} Column contains string values: {}".format(column, contains_string))
        print("{} Column contains number values: {}".format(column, contains_number))
        print("{} Column contains Spl.Char values: {}\n".format(column, contains_spl_char))
        
# Profile every column of the Claims DataFrame; the report is printed to the cell output.
analyze_columns(Claims)


Distinct Count of claim_id column: 70
Null values Count of claim_id column: 0

claim_id Column Start With Number: 70
claim_id Column Start With String: 0
claim_id Column Start With Spl.Cha: 0
claim_id Column contains string values: 0
claim_id Column contains number values: 70
claim_id Column contains Spl.Char values: 0

Distinct Count of patient_id column: 70
Null values Count of patient_id column: 0

patient_id Column Start With Number: 70
patient_id Column Start With String: 0
patient_id Column Start With Spl.Cha: 0
patient_id Column contains string values: 0
patient_id Column contains number values: 70
patient_id Column contains Spl.Char values: 0

Distinct Count of disease_name column: 41
Null values Count of disease_name column: 0

disease_name Column Start With Number: 0
disease_name Column Start With String: 70
disease_name Column Start With Spl.Cha: 0
disease_name Column contains string values: 70
disease_name Column contains number values: 0
disease_name Column contains Spl.Ch

In [0]:
# List the distinct values present in the Claim_Or_Rejected column
claim_status_values = Claims.select("Claim_Or_Rejected").distinct()
claim_status_values.show()

+-----------------+
|Claim_Or_Rejected|
+-----------------+
|                Y|
|                N|
|              NaN|
+-----------------+



In [0]:
# Recode the literal string "NaN" in Claim_Or_Rejected as "N"; other columns are untouched.
# NOTE(review): this maps an unknown status onto "N" - confirm that is the intended rule.
Claims = Claims.na.replace(
    to_replace=["NaN"],
    value="N",
    subset=["Claim_Or_Rejected"],
)

In [0]:
# Show which distinct values Claim_Or_Rejected currently holds
claim_status_values = Claims.select("Claim_Or_Rejected").distinct()
claim_status_values.show()

+-----------------+
|Claim_Or_Rejected|
+-----------------+
|                Y|
|                N|
+-----------------+



In [0]:
# List the distinct values present in the claim_type column
claim_type_values = Claims.select("claim_type").distinct()
claim_type_values.show()

+----------------+
|      claim_type|
+----------------+
| claims of value|
|  claims of fact|
|claims of policy|
+----------------+



In [0]:
# Define the output container path (the final CSV is placed directly in this folder)
output_container_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/StatgingData"

# Spark writes a directory of part files, not a single named file. Write into a
# scratch sub-folder instead of the staging folder itself: mode("overwrite")
# deletes everything under its target path, which would otherwise wipe any other
# file already staged in the folder.
staging_tmp_path = "%s/_stg_Claims_tmp" % output_container_path
target_file_path = "%s/stg_Claims.csv" % output_container_path

# Write the DataFrame as a single part file (coalesce(1)) with a header row
Claims.coalesce(1).write.mode("overwrite").option("header", "true").format("com.databricks.spark.csv").save(staging_tmp_path)

# List files in the scratch folder
files = dbutils.fs.ls(staging_tmp_path)

# Filter out the output file (Spark also writes commit-marker files next to it)
output_file = [x for x in files if x.name.startswith("part-")]
if not output_file:
    # Fail with a clear message instead of an IndexError on output_file[0]
    raise FileNotFoundError("No part- file was written under %s" % staging_tmp_path)

# Drop the export from a previous run, if any, so the move lands on a free path
dbutils.fs.rm(target_file_path)

# Move the output file to its final, stable name
moved = dbutils.fs.mv(output_file[0].path, target_file_path)

# Remove the scratch folder together with the leftover marker files
dbutils.fs.rm(staging_tmp_path, True)

# Keep the move result as the cell output
moved


True