In [0]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Set the Spark conf entry that holds the access key for storage account
# "ksrdatadlsa". The key itself is fetched from a Databricks secret scope rather
# than being written into the notebook.
account_key_conf = "fs.azure.account.key.ksrdatadlsa.dfs.core.windows.net"
spark.conf.set(account_key_conf, dbutils.secrets.get(scope="optumscope1", key="optumkeysstore"))

In [0]:
# Load the raw patient CSV: the first row is the header, column types are
# inferred, and the double quote is used as the escape character inside quoted
# fields.
raw_patient_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Patient_records.csv"
Patient_Records = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("escape", '"')
    .csv(raw_patient_path)
)

In [0]:
# Remove rows that are exact duplicates across every column
Patient_Records = Patient_Records.distinct()

In [0]:
# Sanity check after de-duplication: (row count, column count)
row_count = Patient_Records.count()
column_count = len(Patient_Records.columns)
row_count, column_count

(70, 8)

In [0]:
# Count the null entries in every column (one output row, one figure per column)
from pyspark.sql.functions import col, count, isnan, when

# when() yields a non-null value only for null cells, so count() tallies the nulls.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in Patient_Records.columns
]
Patient_Records.select(null_count_exprs).show()



+----------+------------+--------------+------------------+-------------+------------+----+-----------+
|Patient_id|Patient_name|patient_gender|patient_birth_date|patient_phone|disease_name|city|hospital_id|
+----------+------------+--------------+------------------+-------------+------------+----+-----------+
|         0|          17|             0|                 0|            2|           0|   0|          0|
+----------+------------+--------------+------------------+-------------+------------+----+-----------+



In [0]:
# Patients with no recorded name are labelled "Guest"
Patient_Records = Patient_Records.na.fill("Guest", subset=["Patient_name"])

In [0]:
# Replace missing phone numbers with "NA" (the column is kept, not dropped).
# fillna() with a string value silently skips non-string columns, and
# inferSchema may have typed a digits-only phone column as numeric, so the
# column is cast to string first to make sure the fill actually applies.
# NOTE(review): the cast is a no-op if the column was already read as string.
Patient_Records = (
    Patient_Records
    .withColumn("patient_phone", Patient_Records["patient_phone"].cast("string"))
    .fillna("NA", subset=["patient_phone"])
)


In [0]:
from pyspark.sql.functions import current_date, floor, months_between, to_date, year

# Convert the 'patient_birth_date' column to a DateType
Patient_Records = Patient_Records.withColumn("patient_birth_date", to_date("patient_birth_date"))

# Age in completed years. Subtracting calendar years alone overstates the age by
# one for anyone whose birthday has not yet occurred this year, so the age is
# derived from the whole months elapsed since the birth date instead.
# The cast keeps "age" an integer column, as the year subtraction produced.
Patient_Records = Patient_Records.withColumn(
    "age",
    floor(months_between(current_date(), "patient_birth_date") / 12).cast("int"),
)

In [0]:
# The birth date is no longer needed once "age" has been derived from it
Patient_Records = Patient_Records.drop("patient_birth_date")

In [0]:
from pyspark.sql.functions import asc, col, count, when


def analyze_columns(df):
    """Print a character-level profile of every column in ``df``.

    For each column the report lists the number of distinct values, the number
    of nulls, and how many values start with / contain an ASCII digit, an ASCII
    letter, or any other character.

    All null and pattern counts for a column are computed in a single
    aggregation, so each column costs two Spark jobs (distinct + aggregate)
    instead of one job per metric.

    Args:
        df: Spark DataFrame to profile. ``rlike`` is applied to every column
            regardless of type — presumably relying on Spark's implicit cast
            to string for non-string columns.

    Returns:
        None. The report is written to stdout.
    """

    def count_where(condition):
        # when() yields null unless the condition is true and count() skips
        # nulls, so this matches df.filter(condition).count().
        return count(when(condition, 1))

    for column in df.columns:
        value = col(column)

        # Kept as distinct().count() so that null still counts as one distinct value.
        distinct_count = df.select(column).distinct().count()

        # One pass over the data for all remaining metrics of this column.
        (
            null_count,
            numeric_count,
            text_count,
            special_char_count,
            contains_string,
            contains_number,
            contains_spl_char,
        ) = df.agg(
            count_where(value.isNull()).alias("null_count"),
            count_where(value.rlike("^[0-9]")).alias("numeric_count"),
            count_where(value.rlike("^[A-Za-z]")).alias("text_count"),
            count_where(~value.rlike("^[A-Za-z0-9]")).alias("special_char_count"),
            count_where(value.rlike("[A-Za-z]")).alias("contains_string"),
            count_where(value.rlike("[0-9]")).alias("contains_number"),
            count_where(value.rlike("[^A-Za-z0-9]")).alias("contains_spl_char"),
        ).first()

        print("Distinct Count of {} column: {}".format(column, distinct_count))
        print("Null values Count of {} column: {}\n".format(column, null_count))
        print("{} Column Start With Number: {}".format(column, numeric_count))
        print("{} Column Start With String: {}".format(column, text_count))
        print("{} Column Start With Spl.Cha: {}".format(column, special_char_count))
        print("{} Column contains string values: {}".format(column, contains_string))
        print("{} Column contains number values: {}".format(column, contains_number))
        print("{} Column contains Spl.Char values: {}\n".format(column, contains_spl_char))


# Example usage:
analyze_columns(Patient_Records)


Distinct Count of Patient_id column: 70
Null values Count of Patient_id column: 0

Patient_id Column Start With Number: 70
Patient_id Column Start With String: 0
Patient_id Column Start With Spl.Cha: 0
Patient_id Column contains string values: 0
Patient_id Column contains number values: 70
Patient_id Column contains Spl.Char values: 0

Distinct Count of Patient_name column: 53
Null values Count of Patient_name column: 0

Patient_name Column Start With Number: 0
Patient_name Column Start With String: 70
Patient_name Column Start With Spl.Cha: 0
Patient_name Column contains string values: 70
Patient_name Column contains number values: 0
Patient_name Column contains Spl.Char values: 0

Distinct Count of patient_gender column: 2
Null values Count of patient_gender column: 0

patient_gender Column Start With Number: 0
patient_gender Column Start With String: 70
patient_gender Column Start With Spl.Cha: 0
patient_gender Column contains string values: 70
patient_gender Column contains number 

In [0]:
# Define the output container path
# NOTE(review): "StatgingData" looks like a misspelling of "StagingData"; it is
# left unchanged because renaming it would move the output location.
output_container_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/StatgingData"

# Final single-file destination, and a scratch folder for Spark's own output.
# Writing with mode("overwrite") straight into the shared container path would
# delete every other file already staged there, so Spark writes into a
# dedicated scratch folder instead.
output_file_path = "%s/stg_Patient_Records.csv" % output_container_path
temp_output_path = "%s/_tmp_stg_Patient_Records" % output_container_path

# Write the DataFrame as a single CSV part file (with header) to the scratch folder
(
    Patient_Records.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .format("csv")
    .save(temp_output_path)
)

# List files in the scratch folder
files = dbutils.fs.ls(temp_output_path)

# Filter out the output file (Spark also writes marker files next to it)
output_file = [x for x in files if x.name.startswith("part-")]
if not output_file:
    raise FileNotFoundError("No part- file was written under %s" % temp_output_path)

# Replace any file left by a previous run, then move the part file to its final name
dbutils.fs.rm(output_file_path)
dbutils.fs.mv(output_file[0].path, output_file_path)

# Remove the scratch folder together with Spark's marker files
dbutils.fs.rm(temp_output_path, True)
