In [0]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# List the Databricks secret scopes that are available to this notebook.
dbutils.secrets.listScopes()

In [0]:
# Show the secret keys (metadata only) stored in the "optumscope1" scope.
dbutils.secrets.list(scope="optumscope1")

In [0]:
# Register the storage-account access key with the Spark session.  The key is
# fetched from the secret scope at run time and passed straight through, so it
# is never written into the notebook or bound to a variable.
storage_key_setting = "fs.azure.account.key.ksrdatadlsa.dfs.core.windows.net"
spark.conf.set(storage_key_setting, dbutils.secrets.get(scope="optumscope1", key="optumkeysstore"))

In [0]:
# Browse the RawData folder to confirm that the storage connection works.
raw_data_listing = dbutils.fs.ls("abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData")
display(raw_data_listing)

In [0]:
# Load the raw subgroup extract: the first line is a header, column types are
# inferred from the data, and '"' is the escape character inside quoted fields.
SubGroup = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("escape", '"')
    .csv("abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/RawData/subgroup.csv")
)

In [0]:
# Peek at the first five rows without truncating long cell values.
SubGroup.show(n=5, truncate=False)

In [0]:
# Print the column names and data types of SubGroup.
SubGroup.printSchema()

In [0]:
# (row count, column count) before duplicate rows are removed.
(SubGroup.count(), len(SubGroup.columns))

In [0]:
# Keep a single copy of every fully identical row.
SubGroup = SubGroup.distinct()

In [0]:
# (row count, column count) after duplicate rows were removed.
(SubGroup.count(), len(SubGroup.columns))

In [0]:
# Count the null entries in every column and show them as a one-row table.
from pyspark.sql.functions import count, col, isnan, when

null_count_exprs = []
for column_name in SubGroup.columns:
    # when() without otherwise() yields null for non-matching rows, and count()
    # skips nulls, so only rows where the column is null are counted.
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))
SubGroup.select(null_count_exprs).show()

In [0]:
from pyspark.sql.functions import col, isnan, asc

def analyze_columns(df):
    """Print a character-class profile for every column of ``df``.

    For each column the report shows the distinct-value count, followed by how
    many rows start with / contain a digit, an ASCII letter, or any other
    character.  Rows whose value is null match none of the patterns and are
    therefore not included in any of the six pattern counts.

    The six pattern counts for all columns are computed together in a single
    aggregation, rather than with one full-table filter scan per metric per
    column; only the distinct count still runs once per column.

    Args:
        df: Spark DataFrame to profile.

    Returns:
        None.  The report is written to stdout.
    """
    # Local import: keeps the function independent of which names the
    # notebook's star imports happen to expose (they also shadow builtins).
    from pyspark.sql import functions as F

    columns = df.columns
    if not columns:
        # agg() needs at least one expression, and there is nothing to report.
        return

    # (metric key, regular expression, negate the match?)
    patterns = [
        ("starts_num", "^[0-9]", False),
        ("starts_txt", "^[A-Za-z]", False),
        ("starts_spl", "^[A-Za-z0-9]", True),
        ("has_num", "[0-9]", False),
        ("has_txt", "[A-Za-z]", False),
        ("has_spl", "[^A-Za-z0-9]", False),
    ]

    aggregates = []
    for idx, column in enumerate(columns):
        # Explicit cast so the regex checks also work on columns that schema
        # inference typed as numeric/boolean/date, without relying on implicit
        # type coercion.
        as_text = F.col(column).cast("string")
        for key, regex, negate in patterns:
            matched = as_text.rlike(regex)
            if negate:
                matched = ~matched
            # when() without otherwise() is null unless the condition is true,
            # and count() ignores nulls -- same rows a filter().count() keeps.
            # Aliases are index-based so unusual column names cannot collide.
            aggregates.append(
                F.count(F.when(matched, 1)).alias("c{}_{}".format(idx, key))
            )

    stats = df.agg(*aggregates).first()

    for idx, column in enumerate(columns):
        distinct_count = df.select(column).distinct().count()
        numeric_count = stats["c{}_starts_num".format(idx)]
        text_count = stats["c{}_starts_txt".format(idx)]
        special_char_count = stats["c{}_starts_spl".format(idx)]
        contains_number = stats["c{}_has_num".format(idx)]
        contains_string = stats["c{}_has_txt".format(idx)]
        contains_spl_char = stats["c{}_has_spl".format(idx)]

        print("Distinct Count of {} column: {}".format(column, distinct_count))
        print("{} Column Start With Number: {}".format(column, numeric_count))
        print("{} Column Start With String: {}".format(column, text_count))
        print("{} Column Start With Spl.Cha: {}".format(column, special_char_count))
        print("{} Column contains number values: {}".format(column, contains_number))
        print("{} Column contains string values: {}".format(column, contains_string))
        print("{} Column contains Spl.Char values: {}\n".format(column, contains_spl_char))
        
# Profile every column of the de-duplicated SubGroup data.
analyze_columns(df=SubGroup)

In [0]:
from pyspark.sql.functions import explode, explode_outer, split

# A single "subgrp_id" cell may hold several ids separated by ",".  Split the
# value into an array and emit one row per id.
#
# explode_outer() is used instead of explode(): explode() produces no output
# row when the array is null, which silently discards every record whose
# subgrp_id is missing.  explode_outer() keeps such records with a null
# subgrp_id so they can be handled explicitly downstream.
SubGroup = SubGroup.withColumn("subgrp_id", split(SubGroup["subgrp_id"], ","))
SubGroup = SubGroup.withColumn("subgrp_id", explode_outer(SubGroup["subgrp_id"]))

In [0]:
# Preview the first 20 rows after the split: every row now carries a single
# "subgrp_id" value; long cell values are shown in full.
SubGroup.show(n=20, truncate=False)

In [0]:
# Staging folder that receives the final file.
# NOTE(review): "StatgingData" looks like a typo for "StagingData"; the path is
# left unchanged because other jobs may already read from it -- confirm.
output_container_path = "abfss://optumdata@ksrdatadlsa.dfs.core.windows.net/StatgingData"
final_file_path = "%s/stg_SubGroup.csv" % output_container_path

# Spark's "overwrite" mode removes everything under the target directory.
# Writing into a scratch sub-folder keeps any other files that live in the
# staging folder from being deleted by this cell.
temp_output_path = "%s/stg_SubGroup_tmp" % output_container_path

# coalesce(1) makes Spark emit a single part file.
(
    SubGroup.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .format("csv")
    .save(temp_output_path)
)

# Locate the part file Spark produced inside the scratch folder.
files = dbutils.fs.ls(temp_output_path)
output_file = [x for x in files if x.name.startswith("part-")]
if not output_file:
    raise FileNotFoundError("No part- file was written to %s" % temp_output_path)

# Replace the file from any previous run, then publish the new one under its
# stable name.
dbutils.fs.rm(final_file_path)
dbutils.fs.mv(output_file[0].path, final_file_path)

# Remove the scratch folder together with Spark's marker files.
dbutils.fs.rm(temp_output_path, recurse=True)