In [0]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Exploratory check: list the secret scopes that this workspace can see.
dbutils.secrets.listScopes()

[SecretScope(name='optumscope'), SecretScope(name='optumscope1')]

In [0]:
# Exploratory check: list the secret keys (metadata only) stored in the "optumscope1" scope.
dbutils.secrets.list(scope = "optumscope1")

[SecretMetadata(key='optumkeysstore')]

In [0]:
# Register the access key of the "ksrdatadlsa" storage account with the Spark session so that
# abfss:// paths on that account can be accessed. The key is fetched from the "optumscope1"
# secret scope at run time rather than being written into the notebook.
spark.conf.set(
    "fs.azure.account.key.ksrdatadlsa.dfs.core.windows.net",
    dbutils.secrets.get(scope="optumscope1", key="optumkeysstore"))

In [0]:
# List the raw input files available in the RawData folder of the "optumdata" container.
display(dbutils.fs.ls("abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData"))
                      

path,name,size,modificationTime
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Claims.csv,Claims.csv,5766,1715061116000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Hospital.csv,Hospital.csv,1528,1715061121000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Patient_records.csv,Patient_records.csv,5110,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/disease.csv,disease.csv,1489,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/group.csv,group.csv,4390,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/subgroup.csv,subgroup.csv,561,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/subscriber.csv,subscriber.csv,12061,1715018546000


In [0]:
# Load the raw hospital file: the first line is the header, column types are inferred,
# and a double quote inside a quoted field is escaped by another double quote.
Hospital = spark.read.csv(
    path="abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Hospital.csv",
    header=True,
    inferSchema=True,
    escape='"',
)

In [0]:
# Preview the first five rows without truncating long values
Hospital.show(n=5, truncate=False)

+-----------+-----------------------------------------------------------------+----------+----------+-------+
|Hospital_id|Hospital_name                                                    |city      |state     |country|
+-----------+-----------------------------------------------------------------+----------+----------+-------+
|H1000      |All India Institute of Medical Sciences                          |New Delhi |NaN       |India  |
|H1001      |Medanta The Medicity                                             |Gurgaon   |Haryana   |India  |
|H1002      |The Christian Medical College                                    |Vellore   |Tamil Nadu|India  |
|H1003      |PGIMER - Postgraduate Institute of Medical Education and Research|Chandigarh|Haryana   |India  |
|H1004      |Apollo Hospital - Chennai                                        |Chennai   |Tamil Nadu|India  |
+-----------+-----------------------------------------------------------------+----------+----------+-------+
only showi

In [0]:
# Inspect the column names, data types and nullability of the Hospital table
Hospital.printSchema()

root
 |-- Hospital_id: string (nullable = true)
 |-- Hospital_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)



In [0]:
# Row count and column count of the table as loaded
(Hospital.count(), len(Hospital.columns))

(20, 5)

In [0]:
# Drop rows that are exact duplicates (no subset is given, so all columns are compared)
Hospital = Hospital.dropDuplicates()

In [0]:
# Re-check row count and column count after removing duplicates
(Hospital.count(), len(Hospital.columns))

(20, 5)

In [0]:
# Count, per column, the values that are NULL or evaluate as NaN
Hospital.select(
    [
        count(when(col(name).isNull() | isnan(name), name)).alias(name)
        for name in Hospital.columns
    ]
).show()

+-----------+-------------+----+-----+-------+
|Hospital_id|Hospital_name|city|state|country|
+-----------+-------------+----+-----+-------+
|          0|            0|   0|    4|      0|
+-----------+-------------+----+-----+-------+



In [0]:
# Show the distinct rows of the table without truncating long values
Hospital.distinct().show(truncate=False)

+-----------+-----------------------------------------------------------------+----------+-----------+-------+
|Hospital_id|Hospital_name                                                    |city      |state      |country|
+-----------+-----------------------------------------------------------------+----------+-----------+-------+
|H1010      |Lilavati Hospital And Research Centre                            |Mumbai    |Maharashtra|India  |
|H1017      |Manipal Hospitals                                                |Bengaluru |Karnataka  |India  |
|H1006      |Breach Candy Hospital                                            |Mumbai    |Maharashtra|India  |
|H1016      |Jaslok Hospital and Research Centre                              |Mumbai    |Maharashtra|India  |
|H1012      |Bombay Hospital & Medical Research Centre                        |Mumbai    |Maharashtra|India  |
|H1018      |Yashoda Hospital Secunderabad                                    |Hyderabad |Telangana  |India  |
|

In [0]:
# Normalise the city value "New Delhi" to "Delhi", and replace the literal "NaN"
# entries of the state column with the placeholder "Unitary_Territory".
Hospital = (
    Hospital
    .replace(to_replace=["New Delhi"], value="Delhi", subset="city")
    .na.replace(to_replace=["NaN"], value="Unitary_Territory", subset=["state"])
)

In [0]:
# Re-check the per-column count of NULL / NaN values after the replacements
Hospital.select(
    [
        count(when(col(name).isNull() | isnan(name), name)).alias(name)
        for name in Hospital.columns
    ]
).show()

+-----------+-------------+----+-----+-------+
|Hospital_id|Hospital_name|city|state|country|
+-----------+-------------+----+-----+-------+
|          0|            0|   0|    0|      0|
+-----------+-------------+----+-----+-------+



In [0]:
# Preview the cleaned table with the default show() settings (long values are truncated)
Hospital.show()

+-----------+--------------------+----------+-----------------+-------+
|Hospital_id|       Hospital_name|      city|            state|country|
+-----------+--------------------+----------+-----------------+-------+
|      H1010|Lilavati Hospital...|    Mumbai|      Maharashtra|  India|
|      H1017|   Manipal Hospitals| Bengaluru|        Karnataka|  India|
|      H1006|Breach Candy Hosp...|    Mumbai|      Maharashtra|  India|
|      H1016|Jaslok Hospital a...|    Mumbai|      Maharashtra|  India|
|      H1012|Bombay Hospital &...|    Mumbai|      Maharashtra|  India|
|      H1018|Yashoda Hospital ...| Hyderabad|        Telangana|  India|
|      H1015|Fortis Hospital M...|    Mumbai|      Maharashtra|  India|
|      H1009|Indraprastha Apol...|     Delhi|Unitary_Territory|  India|
|      H1014|Fortis Hiranandan...|    Mumbai|      Maharashtra|  India|
|      H1004|Apollo Hospital -...|   Chennai|       Tamil Nadu|  India|
|      H1002|The Christian Med...|   Vellore|       Tamil Nadu| 

In [0]:
from pyspark.sql.functions import col, asc

def analyze_columns(df):
    """Print a simple profile for every column of a Spark DataFrame.

    For each column the following counts are printed:

    * distinct values (a NULL counts as one distinct value),
    * NULL values,
    * values starting with a digit / with an ASCII letter / with neither
      (the last bucket also catches empty strings),
    * values containing at least one ASCII letter / digit / other character.

    NULL values never match a pattern, so they are only reflected in the
    NULL and distinct counts.

    All NULL and pattern counts are computed in a single aggregation pass
    over ``df`` instead of one Spark job per metric per column; only the
    distinct count still needs one job per column.

    Args:
        df: Spark DataFrame to profile.

    Returns:
        None. The profile is written to stdout.
    """
    # Namespaced local import so the function does not rely on star-imported names.
    from pyspark.sql import functions as F

    columns = df.columns
    if not columns:
        # agg() requires at least one expression; there is nothing to report.
        return

    # (metric key, builder of the boolean condition for one column)
    checks = (
        ("null", lambda c: c.isNull()),
        ("starts_number", lambda c: c.rlike("^[0-9]")),
        ("starts_text", lambda c: c.rlike("^[A-Za-z]")),
        ("starts_special", lambda c: ~c.rlike("^[A-Za-z0-9]")),
        ("has_text", lambda c: c.rlike("[A-Za-z]")),
        ("has_number", lambda c: c.rlike("[0-9]")),
        ("has_special", lambda c: c.rlike("[^A-Za-z0-9]")),
    )

    # count(when(cond, 1)) counts exactly the rows where cond is true, which is
    # what filter(cond).count() returned. Aliases are index-based so unusual
    # column names cannot clash with or break the metric names.
    aggregations = [
        F.count(F.when(build(F.col(column)), 1)).alias("{}_{}".format(position, key))
        for position, column in enumerate(columns)
        for key, build in checks
    ]
    stats = df.agg(*aggregations).first()

    for position, column in enumerate(columns):
        distinct_count = df.select(column).distinct().count()

        def metric(key):
            return stats["{}_{}".format(position, key)]

        print("Distinct Count of {} column: {}".format(column, distinct_count))
        print("Null values Count of {} column: {}\n".format(column, metric("null")))
        print("{} Column Start With Number: {}".format(column, metric("starts_number")))
        print("{} Column Start With String: {}".format(column, metric("starts_text")))
        print("{} Column Start With Spl.Cha: {}".format(column, metric("starts_special")))
        print("{} Column contains string values: {}".format(column, metric("has_text")))
        print("{} Column contains number values: {}".format(column, metric("has_number")))
        print("{} Column contains Spl.Char values: {}\n".format(column, metric("has_special")))
        
# Profile every column of the cleaned Hospital table
analyze_columns(Hospital)


Distinct Count of Hospital_id column: 20
Null values Count of Hospital_id column: 0

Hospital_id Column Start With Number: 0
Hospital_id Column Start With String: 20
Hospital_id Column Start With Spl.Cha: 0
Hospital_id Column contains string values: 20
Hospital_id Column contains number values: 20
Hospital_id Column contains Spl.Char values: 0

Distinct Count of Hospital_name column: 20
Null values Count of Hospital_name column: 0

Hospital_name Column Start With Number: 0
Hospital_name Column Start With String: 20
Hospital_name Column Start With Spl.Cha: 0
Hospital_name Column contains string values: 20
Hospital_name Column contains number values: 0
Hospital_name Column contains Spl.Char values: 20

Distinct Count of city column: 8
Null values Count of city column: 0

city Column Start With Number: 0
city Column Start With String: 20
city Column Start With Spl.Cha: 0
city Column contains string values: 20
city Column contains number values: 0
city Column contains Spl.Char values: 0

D

In [0]:
# Staging folder that receives the cleaned table as a single CSV file.
# NOTE(review): "StatgingData" looks like a typo for "StagingData"; the path is kept
# unchanged because other jobs may already read from it - confirm before renaming.
output_container_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/StatgingData"

# Spark writes a directory of part files, and mode("overwrite") first deletes whatever
# already exists at the target path. Writing straight into the staging folder would
# therefore wipe every other file kept there, so write into a scratch sub-folder instead.
staging_tmp_path = "%s/_tmp_stg_Hospital" % output_container_path
final_file_path = "%s/stg_Hospital.csv" % output_container_path

# coalesce(1) forces a single part file so the export ends up as one CSV.
Hospital.coalesce(1).write.mode("overwrite").option("header", "true").format("com.databricks.spark.csv").save(staging_tmp_path)

# Locate the single data file among Spark's marker files in the scratch folder.
files = dbutils.fs.ls(staging_tmp_path)
output_file = [x for x in files if x.name.startswith("part-")]
if not output_file:
    raise RuntimeError("No part- file was written to %s" % staging_tmp_path)

# Remove the export of a previous run so the move cannot collide with it,
# publish the new file under its final name, then drop the scratch folder.
dbutils.fs.rm(final_file_path)
moved = dbutils.fs.mv(output_file[0].path, final_file_path)
dbutils.fs.rm(staging_tmp_path, True)

# Keep the result of the move as the cell output.
moved


True