In [0]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# List the secret scopes visible to this workspace (scope names only, no secret values).
dbutils.secrets.listScopes()

[SecretScope(name='optumscope'), SecretScope(name='optumscope1')]

In [0]:
# Show the key metadata stored in the "optumscope1" scope (values are not revealed).
dbutils.secrets.list("optumscope1")

[SecretMetadata(key='optumkeysstore')]

In [0]:
# Register the storage-account access key with Spark. The key is pulled from the secret
# scope and handed straight to the config call so it is never bound to a notebook variable.
adls_key_setting = "fs.azure.account.key.ksrdatadlsa.dfs.core.windows.net"
spark.conf.set(adls_key_setting, dbutils.secrets.get(scope="optumscope1", key="optumkeysstore"))

In [0]:
# Inspect the raw landing folder to confirm which source files are available.
raw_data_dir = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData"
display(dbutils.fs.ls(raw_data_dir))
                      

path,name,size,modificationTime
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Claims.csv,Claims.csv,5766,1715061116000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Hospital.csv,Hospital.csv,1528,1715061121000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/Patient_records.csv,Patient_records.csv,5110,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/disease.csv,disease.csv,1489,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/group.csv,group.csv,4390,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/subgroup.csv,subgroup.csv,561,1715018546000
abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/subscriber.csv,subscriber.csv,12061,1715018546000


In [0]:
# Load the disease file: first row is the header, column types are inferred,
# and a double quote is used as the escape character inside quoted values.
disease_csv_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/disease.csv"
Disease = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("escape", '"')
    .csv(disease_csv_path)
)

In [0]:
# Preview the first five rows without truncating long values.
Disease.show(n=5, truncate=False)

+---------+----------+------------+
|subgrp_id|disease_id|disease_name|
+---------+----------+------------+
|S101     |110001    |Beriberi    |
|S101     |110002    |Scurvy      |
|S101     |110003    |Goitre      |
|S101     |110004    |Osteoporosis|
|S101     |110005    |Rickets     |
+---------+----------+------------+
only showing top 5 rows



In [0]:
# Print each column's name, inferred data type and nullability.
Disease.printSchema()

root
 |-- subgrp_id: string (nullable = true)
 |-- disease_id: integer (nullable = true)
 |-- disease_name: string (nullable = true)



In [0]:
# Shape of the DataFrame as (row count, column count).
(Disease.count(), len(Disease.columns))

(60, 3)

In [0]:
# Remove exact duplicate rows from Disease, the only DataFrame this notebook loads.
# (Referencing any other DataFrame name here fails with NameError on a fresh kernel.)
# The call is idempotent, so re-running the cell is safe.
Disease = Disease.dropDuplicates()

In [0]:
# Re-check the row and column counts of Disease after the de-duplication step;
# compare with the counts taken right after loading to see how many rows were dropped.
Disease.count(), len(Disease.columns)

(60, 3)

In [0]:
# Count missing values per column.
# NaN only exists for float/double data, so isnan() is applied to those columns alone;
# every other column type is checked for NULL only. Calling isnan() on every column
# relies on implicit casts and fails analysis for types such as date/timestamp.
# The second argument of when() is just a non-null marker, so count() tallies matching rows.
Disease.select([
    count(
        when(
            (isnan(col(field.name)) | col(field.name).isNull())
            if isinstance(field.dataType, (FloatType, DoubleType))
            else col(field.name).isNull(),
            field.name,
        )
    ).alias(field.name)
    for field in Disease.schema.fields
]).show()

+---------+----------+------------+
|subgrp_id|disease_id|disease_name|
+---------+----------+------------+
|        0|         0|           0|
+---------+----------+------------+



In [0]:
# Display the distinct rows in full (no column truncation).
Disease.distinct().show(truncate=False)

+---------+----------+-----------------+
|subgrp_id|disease_id|disease_name     |
+---------+----------+-----------------+
|S102     |110012    |Food Poisoning   |
|S108     |110044    |Dengue           |
|S108     |110045    |Smallpox         |
|S104     |110020    |Lymphedema       |
|S105     |110030    |Food allergy     |
|S101     |110005    |Rickets          |
|S102     |110007    |Fractures        |
|S108     |110046    |Anthrax          |
|S102     |110010    |Choking          |
|S101     |110006    |Anaemia          |
|S104     |110019    |Vertigo          |
|S110     |110059    |Flu              |
|S110     |110058    |Shingles         |
|S107     |110038    |Colorectal cancer|
|S108     |110043    |Measles          |
|S103     |110014    |Glaucoma         |
|S107     |110037    |Kidney cancer    |
|S110     |110057    |Pneumonia        |
|S104     |110021    |Concussion       |
|S101     |110001    |Beriberi         |
+---------+----------+-----------------+
only showing top

In [0]:
from pyspark.sql.functions import col, asc, count, countDistinct, when


def analyze_columns(df):
    """Print a simple content profile for every column of ``df``.

    For each column the report shows the number of distinct values, the number of
    nulls, how many values start with a digit / a letter / anything else, and how
    many values contain a letter / a digit / a non-alphanumeric character anywhere.

    All figures for a column come from a single aggregation, so the data is scanned
    once per column instead of once per metric.

    Args:
        df: Spark DataFrame to profile.

    Returns:
        None. The report is written to stdout.
    """
    def matching(text, pattern):
        # Number of rows whose text matches `pattern`; null text never matches.
        return count(when(text.rlike(pattern), True))

    for column in df.columns:
        raw = col(column)
        # The regex checks work on text; cast explicitly so non-string columns
        # (e.g. integer ids) are matched on their string form.
        text = raw.cast("string")

        stats = df.agg(
            countDistinct(raw).alias("distinct_non_null"),
            count(when(raw.isNull(), True)).alias("nulls"),
            matching(text, "^[0-9]").alias("starts_digit"),
            matching(text, "^[A-Za-z]").alias("starts_letter"),
            count(when(~text.rlike("^[A-Za-z0-9]"), True)).alias("starts_other"),
            matching(text, "[A-Za-z]").alias("has_letter"),
            matching(text, "[0-9]").alias("has_digit"),
            matching(text, "[^A-Za-z0-9]").alias("has_other"),
        ).first()

        null_count = stats["nulls"]
        # countDistinct() ignores nulls, whereas select().distinct() counts null as
        # one more value; add it back so the figure matches distinct().count().
        distinct_count = stats["distinct_non_null"] + (1 if null_count else 0)
        numeric_count = stats["starts_digit"]
        text_count = stats["starts_letter"]
        special_char_count = stats["starts_other"]
        contains_string = stats["has_letter"]
        contains_number = stats["has_digit"]
        contains_spl_char = stats["has_other"]

        print("Distinct Count of {} column: {}".format(column, distinct_count))
        print("Null values Count of {} column: {}\n".format(column, null_count))
        print("{} Column Start With Number: {}".format(column, numeric_count))
        print("{} Column Start With String: {}".format(column, text_count))
        print("{} Column Start With Spl.Cha: {}".format(column, special_char_count))
        print("{} Column contains string values: {}".format(column, contains_string))
        print("{} Column contains number values: {}".format(column, contains_number))
        print("{} Column contains Spl.Char values: {}\n".format(column, contains_spl_char))


# Example usage:
analyze_columns(Disease)


Distinct Count of subgrp_id column: 10
Null values Count of subgrp_id column: 0

subgrp_id Column Start With Number: 0
subgrp_id Column Start With String: 60
subgrp_id Column Start With Spl.Cha: 0
subgrp_id Column contains string values: 60
subgrp_id Column contains number values: 60
subgrp_id Column contains Spl.Char values: 0

Distinct Count of disease_id column: 60
Null values Count of disease_id column: 0

disease_id Column Start With Number: 60
disease_id Column Start With String: 0
disease_id Column Start With Spl.Cha: 0
disease_id Column contains string values: 0
disease_id Column contains number values: 60
disease_id Column contains Spl.Char values: 0

Distinct Count of disease_name column: 60
Null values Count of disease_name column: 0

disease_name Column Start With Number: 0
disease_name Column Start With String: 60
disease_name Column Start With Spl.Cha: 0
disease_name Column contains string values: 60
disease_name Column contains number values: 0
disease_name Column contai

In [0]:
# Define the output container path.
# NOTE(review): "StatgingData" looks like a typo for "StagingData"; it is left unchanged
# because other jobs may already read from this location - confirm before renaming.
output_container_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/StatgingData"
staged_file_path = "%s/stg_Disease.csv" % output_container_path

# Spark writes a directory of part files, and mode("overwrite") replaces everything under
# the target path. Writing straight into the shared staging folder would therefore delete
# any other files already staged there, so the write goes to a private scratch folder.
scratch_path = "%s/stg_Disease_tmp" % output_container_path
Disease.coalesce(1).write.mode("overwrite").option("header", "true").format("com.databricks.spark.csv").save(scratch_path)

# List files in the scratch folder and pick out the single data file
# (the rest are Spark commit markers).
files = dbutils.fs.ls(scratch_path)
output_file = [x for x in files if x.name.startswith("part-")]
if len(output_file) != 1:
    raise RuntimeError(
        "Expected exactly one part file in %s, found %d" % (scratch_path, len(output_file))
    )

# Replace any previously staged copy, move the new file into place, then drop the scratch folder.
dbutils.fs.rm(staged_file_path)
moved = dbutils.fs.mv(output_file[0].path, staged_file_path)
dbutils.fs.rm(scratch_path, True)

# Cell result: True when the move succeeded.
moved


True