In [0]:
import os
from itertools import chain

from pyspark.sql import functions as F
from pyspark.sql.functions import to_date, date_format, coalesce, col,count, when, create_map, lit

Creating the Spark session

In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell below (getOrCreate reuses an active one)
spark = (
    SparkSession.builder
    .appName("DeltaTableCheckAndAppend")
    .getOrCreate()
)

# Customer

In [0]:

from pyspark.sql.functions import col, count, when
# Silver-layer location of the Customer Delta table
delta_path = "abfss://silver@scmdataset2025.dfs.core.windows.net/Customer"


def delta_table_exists(path):
    """Report whether the directory at `path` contains a Delta log.

    The directory is listed with `dbutils.fs.ls` and searched for an entry
    whose name is exactly `_delta_log/`.

    Returns:
        True when such an entry is listed. False when it is not, or when the
        listing raises (the error is printed, not propagated).
    """
    try:
        entry_names = [entry.name for entry in dbutils.fs.ls(path)]
    except Exception as e:
        print(f"Failed to access path: {e}")
        return False
    return "_delta_log/" in entry_names

# Function to get the first available CSV file dynamically
def get_csv_path(container="bronze", folder="Static_Data"):
    """Return the full path of the first CSV file listed in a storage folder.

    Args:
        container: Name of the storage container to look in.
        folder: Folder inside the container whose entries are listed.

    Returns:
        The `abfss://` path of the first entry (in listing order) whose name
        ends with ".csv", or None when the folder cannot be listed or holds
        no CSV file. A message saying which of the two happened is printed.
    """
    base_path = f"abfss://{container}@scmdataset2025.dfs.core.windows.net/{folder}/"
    try:
        files = dbutils.fs.ls(base_path)
    except Exception as e:
        print(f"Failed to list files in {base_path}: {e}")
        return None

    # The "no CSV" case is handled outside the try block so that it is not
    # reported as a listing failure.
    csv_name = next((file.name for file in files if file.name.endswith(".csv")), None)
    if csv_name is None:
        print(f"No CSV file found in {base_path}")
        return None
    return base_path + csv_name

# Main logic: build the Customer Delta table only when no Delta log exists at delta_path
if not delta_table_exists(delta_path):
    csv_path = get_csv_path()
    if csv_path:
        # Read the CSV file dynamically
        df_customers = spark.read.format("csv")\
            .options(header="true")\
            .options(inferSchema="true")\
            .load(csv_path)

        # Create a temporary view (registered before de-duplication and before the column drop)
        df_customers.createOrReplaceTempView("temp_customers_view")

        # For every column, count how many distinct values occur more than once
        duplicate_counts = {}
        for column in df_customers.columns:
            duplicate_count = df_customers.groupBy(column).count().filter("count > 1").count()
            duplicate_counts[column] = duplicate_count

        # The loop variable is deliberately not named `count`: that would rebind the
        # pyspark.sql.functions.count imported by this notebook for all later cells.
        for column, duplicate_count in duplicate_counts.items():
            print(f"{column}: {duplicate_count}")

        # Drop duplicate rows based on Customer_ID
        df_customers = df_customers.dropDuplicates(["Customer_ID"])

        # Drop the "Customer_Name" column
        df_customers = df_customers.drop("Customer_Name")

        # Write the DataFrame to the Delta table
        df_customers.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(delta_path)

        # Compact the table's files
        spark.sql(f"OPTIMIZE delta.`{delta_path}`")

        print("File saved in the silver layer")
        print("Data processed successfully.")
    else:
        print("CSV file not found. Please check the directory.")
else:
    print("Delta table already exists in the silver layer.")


# Product

In [0]:

# Silver-layer location of the Product Delta table
delta_path = "abfss://silver@scmdataset2025.dfs.core.windows.net/Product"


def delta_table_exists(path):
    """Tell whether a `_delta_log/` entry is listed directly under `path`.

    Returns False, after printing the error, when `dbutils.fs.ls` raises.
    """
    try:
        for entry in dbutils.fs.ls(path):
            if entry.name == "_delta_log/":
                return True
        return False
    except Exception as e:
        print(f"Failed to access path: {e}")
        return False
    
# Build the Product table only when no Delta log is present at the target path
if delta_table_exists(delta_path):
    # Nothing to do: a Delta log is already there
    print("Delta table already exists. Skipping data processing.")
else:
    # Load the static product CSV (first row is the header, column types are inferred)
    product_csv = "abfss://bronze@scmdataset2025.dfs.core.windows.net/Static_Data/Product_Data.csv"
    df_product = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(product_csv)
    )

    # Expose the freshly loaded rows as a temporary view
    df_product.createOrReplaceTempView("temp_product_view")

    # Keep a single row per Product_ID
    df_product = df_product.dropDuplicates(["Product_ID"])
    display(df_product)

    # Persist the de-duplicated rows as a Delta table
    print("Writing data to Delta table...")
    (
        df_product.write.format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .save(delta_path)
    )

    # Compact the table's files
    spark.sql(f"OPTIMIZE delta.`{delta_path}`")

    print("Data processed successfully.")

# Warehouse

Find the path of the most recently uploaded file

In [0]:
%python
# List the files in the directory
files = dbutils.fs.ls("abfss://bronze@scmdataset2025.dfs.core.windows.net/Stream_Data/Warehouse/")

# Fail with a clear message when the folder is empty, instead of the bare
# IndexError that indexing an empty list would raise.
if not files:
    raise FileNotFoundError("No files found in bronze Stream_Data/Warehouse; nothing to process.")

# Sort the files by modification time and take the last (most recent) one
last_file = sorted(files, key=lambda x: x.modificationTime)[-1].path

display(last_file)

In [0]:
# Load the selected warehouse file as CSV: first row is the header, column types are inferred
warehouse_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df_warehouse = warehouse_reader.load(last_file)

display(df_warehouse)

In [0]:
# Register the warehouse DataFrame under the temporary view name "temp_warehouse_view"
df_warehouse.createOrReplaceTempView("temp_warehouse_view")

In [0]:


# Input layouts accepted for the 'Date' column, tried in this order
warehouse_date_patterns = [
    "dd-MM-yyyy",
    "dd MM yyyy",
    "yyyy-MM-dd",
    "yyyy-dd-MM",
    "yyyy_dd_MM",
    "dd_MM_yyyy",
]

# coalesce keeps the first non-null parse result among the candidate layouts
parsed_warehouse_date = coalesce(
    *[to_date(col("Date"), pattern) for pattern in warehouse_date_patterns]
)

# Re-render the parsed date as a dd-MM-yyyy string
df_warehouse = df_warehouse.withColumn("Date", date_format(parsed_warehouse_date, "dd-MM-yyyy"))

# Display the updated DataFrame
display(df_warehouse)

In [0]:
%python


# Create a mapping dictionary for Warehouse ID and Location
warehouse_mapping = {
    "W001": "New_Delhi",
    "W002": "Lucknow",
    "W003": "Chandigarh",
    "W004": "Mumbai",
    "W005": "Ahmedabad",
    "W006": "Jaipur",
    "W007": "Kolkata",
    "W008": "Bhubaneswar",
    "W009": "Patna",
    "W010": "Bhopal",
    "W011": "Raipur",
    "W012": "Ranchi",
    "W013": "Bangalore",
    "W014": "Hyderabad",
    "W015": "Chennai"
}

# Convert the dictionary to a PySpark map (alternating key, value literals)
mapping_expr = create_map([lit(x) for x in chain(*warehouse_mapping.items())])

# Location the mapping assigns to each row's Warehouse_ID; this is null when the
# ID is null or not in the mapping (assuming the default, non-ANSI lookup behaviour).
mapped_location = mapping_expr.getItem(col("Warehouse_ID"))

# Prefer the mapped location and fall back to the existing value when there is none.
# A `!=` comparison evaluates to null for rows whose Warehouse_Location is null and
# would leave them uncorrected; coalesce fills those rows as well.
df_warehouse = df_warehouse.withColumn(
    "Warehouse_Location",
    coalesce(mapped_location, col("Warehouse_Location"))
)

# Display the updated DataFrame
display(df_warehouse)


Save the data to the silver layer

In [0]:
# Silver-layer location of the Warehouse Delta table
delta_path = "abfss://silver@scmdataset2025.dfs.core.windows.net/Warehouse"


def delta_table_exists(path):
    """Report whether the directory at `path` contains a Delta log.

    Lists `path` with `dbutils.fs.ls` and looks for an entry named exactly
    `_delta_log/`. A listing error is printed and treated as "does not exist".
    """
    try:
        listed_names = {item.name for item in dbutils.fs.ls(path)}
    except Exception as e:
        print(f"Failed to access path: {e}")
        return False
    return "_delta_log/" in listed_names

# Check and write/append accordingly
if delta_table_exists(delta_path):
    # Append to existing Delta table with schema merging
    print("Delta table exists. Appending new data.")
    df_warehouse.write.format("delta").mode("append").option("mergeSchema", "true").save(delta_path)
else:
    # delta_table_exists also returns False when listing the path fails, so this
    # branch must never replace data. "errorifexists" still creates the table when
    # none is present, but aborts instead of overwriting if one is already there.
    print("Delta table does not exist. Creating a new Delta table.")
    df_warehouse.write.format("delta").mode("errorifexists").save(delta_path)

# Compact the table's files
spark.sql(f"OPTIMIZE delta.`{delta_path}`")

print("Data processed successfully.")

In [0]:
from pyspark.sql.functions import col, desc

# Silver-layer location of the Warehouse Delta table
delta_path = "abfss://silver@scmdataset2025.dfs.core.windows.net/Warehouse"

# Load the Delta table stored at that path
delta_df = spark.read.format("delta").load(delta_path)

# Show the table as loaded (no filtering or ordering is applied here)
largest_file_df = delta_df
display(largest_file_df)

# Pair the name of every entry directly under the table folder with its size
files = dbutils.fs.ls(delta_path)
file_sizes = list(map(lambda entry: (entry.name, entry.size), files))
display(file_sizes)

# Supply dataset

In [0]:
%python
# List the files in the directory
files = dbutils.fs.ls("abfss://bronze@scmdataset2025.dfs.core.windows.net/Stream_Data/Supply")

# Fail with a clear message when the folder is empty, instead of the bare
# IndexError that indexing an empty list would raise.
if not files:
    raise FileNotFoundError("No files found in bronze Stream_Data/Supply; nothing to process.")

# Sort the files by modification time and take the last (most recent) one
last_file = sorted(files, key=lambda x: x.modificationTime)[-1].path

display(last_file)

In [0]:
# Load the selected supply file as CSV: first row is the header, column types are inferred
df_supply = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load(last_file)
)

# Show the loaded rows
display(df_supply)

In [0]:
# Register the supply DataFrame under the temporary view name "temp_view_supply"
df_supply.createOrReplaceTempView("temp_view_supply")

In [0]:
# Count the nulls of every column in one aggregation (a single Spark job rather
# than one per column). F.count only counts non-null inputs, and F.when yields
# null for rows that do not match, so each expression counts the null rows of
# its column and gives 0 when there are none.
null_count_row = df_supply.select(
    [
        F.count(F.when(F.col(column).isNull(), 1)).alias(f"nulls_{position}")
        for position, column in enumerate(df_supply.columns)
    ]
).first()

# Pair each column name with its null count (results are read back by position)
null_counts = [
    (column, null_count_row[position])
    for position, column in enumerate(df_supply.columns)
]

# Convert the list to a DataFrame for better visualization
df_supply2= spark.createDataFrame(null_counts, ["Column", "Null Count"])

# Display the null counts for each column
display(df_supply2)

In [0]:
# Drop the "Supplier_Name" column; the result is kept as df_1 and df_supply itself is not reassigned here
df_1= df_supply.drop("Supplier_Name")
display(df_1)

In [0]:
from pyspark.sql.functions import col, coalesce, date_format, to_date

# Input layouts accepted for the 'Date' column, tried in this order
supply_date_patterns = (
    "dd-MM-yyyy",
    "dd MM yyyy",
    "yyyy-MM-dd",
    "yyyy-dd-MM",
    "yyyy_dd_MM",
    "dd_MM_yyyy",
)

# One parse attempt per layout; coalesce keeps the first non-null result
supply_date_attempts = [to_date(col("Date"), pattern) for pattern in supply_date_patterns]
supply_date_text = date_format(coalesce(*supply_date_attempts), "dd-MM-yyyy")

# Replace 'Date' with its dd-MM-yyyy rendering, starting from df_1
df_supply = df_1.withColumn("Date", supply_date_text)

# Display the updated DataFrame
display(df_supply)

In [0]:
from pyspark.sql.functions import when

# Define the correct mapping of Supplier_ID to Supplier_State
correct_mapping = {
    "S001": "Delhi",
    "S002": "Maharashtra",
    "S003": "West_Bengal",
    "S004": "Madhya_Pradesh",
    "S005": "Karnataka"
}

# Create a DataFrame with the correct mapping
mapping_df = spark.createDataFrame(
    [(k, v) for k, v in correct_mapping.items()],
    ["Supplier_ID", "Correct_Supplier_State"]
)

# Left join: suppliers missing from the mapping get a null Correct_Supplier_State
df_check = df_supply.join(mapping_df, on="Supplier_ID", how="left")

# Prefer the mapped state and keep the existing value when the ID is not mapped.
# A `!=` comparison evaluates to null for rows whose Supplier_State is null and
# would leave them uncorrected; coalesce fills those rows as well.
# NOTE(review): Correct_Supplier_State stays in df_check and therefore ends up in
# whatever is written from it — confirm whether it should be dropped before saving.
df_check = df_check.withColumn(
    "Supplier_State",
    coalesce(df_check["Correct_Supplier_State"], df_check["Supplier_State"])
)

# Display the DataFrame with the updated Supplier_State
display(df_check)

In [0]:
# Silver-layer location of the Supply Delta table
delta_path = "abfss://silver@scmdataset2025.dfs.core.windows.net/Supply"


def delta_table_exists(path):
    """Tell whether a `_delta_log/` entry is listed directly under `path`.

    Returns False, after printing the error, when `dbutils.fs.ls` raises.
    """
    try:
        for entry in dbutils.fs.ls(path):
            if entry.name == "_delta_log/":
                return True
        return False
    except Exception as e:
        print(f"Failed to access path: {e}")
        return False

# Check and write/append accordingly
if delta_table_exists(delta_path):
    # Append to existing Delta table with schema merging
    print("Delta table exists. Appending new data.")
    df_check.write.format("delta").mode("append").option("mergeSchema", "true").save(delta_path)
else:
    # delta_table_exists also returns False when listing the path fails, so this
    # branch must never replace data. "errorifexists" still creates the table when
    # none is present, but aborts instead of overwriting if one is already there.
    # NOTE(review): "columnMapping" does not look like a documented Delta writer
    # option — confirm that it has any effect.
    print("Delta table does not exist. Creating a new Delta table.")
    df_check.write.format("delta").mode("errorifexists").option("mergeSchema", "true").option("columnMapping", "name").save(delta_path)

# Compact the table's files
spark.sql(f"OPTIMIZE delta.`{delta_path}`")

print("Data processed successfully.")

# Orders

In [0]:
%python
# List the files in the directory
files = dbutils.fs.ls("abfss://bronze@scmdataset2025.dfs.core.windows.net/Stream_Data/Orders")

# Fail with a clear message when the folder is empty, instead of the bare
# IndexError that indexing an empty list would raise.
if not files:
    raise FileNotFoundError("No files found in bronze Stream_Data/Orders; nothing to process.")

# Sort the files by modification time and take the last (most recent) one;
# the whole listing entry is kept, and later cells read its .path
last_file = sorted(files, key=lambda x: x.modificationTime)[-1]

# Display the last file
print(last_file)

In [0]:
# Load the selected orders file as CSV: first row is the header, column types are inferred
orders_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df_orders = orders_reader.load(last_file.path)

display(df_orders)

In [0]:
# For every column, count how many distinct values occur more than once
duplicate_counts = {}

for column in df_orders.columns:
    # One row per value of this column that appears in more than one order row
    duplicate_values = (
        df_orders.groupBy(column)
        .count()
        .filter(col("count") > 1)
        .select(col(column).alias("value"), col("count").alias("duplicate_count"))
    )
    duplicate_counts[column] = duplicate_values.count()

# The loop variable is deliberately not named `count`: that would rebind the
# pyspark.sql.functions.count imported by this notebook for all later cells.
for column, duplicate_count in duplicate_counts.items():
    print(f"{column}: {duplicate_count}")

In [0]:
# Remove duplicate rows; no column subset is given, so rows are compared on all columns
df_orders = df_orders.dropDuplicates()
display(df_orders)

In [0]:
# Silver-layer location of the orders Delta table
delta_path = "abfss://silver@scmdataset2025.dfs.core.windows.net/orders"


def delta_table_exists(path):
    """Report whether the directory at `path` contains a Delta log.

    Lists `path` with `dbutils.fs.ls` and looks for an entry named exactly
    `_delta_log/`. A listing error is printed and treated as "does not exist".
    """
    try:
        entry_names = [entry.name for entry in dbutils.fs.ls(path)]
    except Exception as e:
        print(f"Failed to access path: {e}")
        return False
    return "_delta_log/" in entry_names

# Check and write/append accordingly
if delta_table_exists(delta_path):
    # Append to existing Delta table with schema merging
    # NOTE(review): "enableColumnMapping" does not look like a documented Delta
    # writer option — confirm that it has any effect.
    print("Delta table exists. Appending new data.")
    df_orders.write.format("delta").mode("append").option("mergeSchema", "true").option("enableColumnMapping", "true").save(delta_path)
else:
    # delta_table_exists also returns False when listing the path fails, so this
    # branch must never replace data. "errorifexists" still creates the table when
    # none is present, but aborts instead of overwriting if one is already there.
    print("Delta table does not exist. Creating a new Delta table.")
    df_orders.write.format("delta").mode("errorifexists").save(delta_path)

# Compact the table's files
spark.sql(f"OPTIMIZE delta.`{delta_path}`")

print("Data processed successfully.")

# Logistics

In [0]:
%python
# List the files in the directory
files = dbutils.fs.ls("abfss://bronze@scmdataset2025.dfs.core.windows.net/Stream_Data/Logistics")

# Fail with a clear message when the folder is empty, instead of the bare
# IndexError that indexing an empty list would raise.
if not files:
    raise FileNotFoundError("No files found in bronze Stream_Data/Logistics; nothing to process.")

# Sort the files by modification time and take the last (most recent) one;
# the whole listing entry is kept, and later cells read its .path
last_file = sorted(files, key=lambda x: x.modificationTime)[-1]

# Display the last file
print(last_file)

In [0]:
# Load the selected logistics file as CSV: first row is the header, column types are inferred
logistics_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df = logistics_reader.load(last_file.path)

display(df)

In [0]:
# Silver-layer location of the Logistics Delta table
delta_path = "abfss://silver@scmdataset2025.dfs.core.windows.net/Logistics"


def delta_table_exists(path):
    """Tell whether a `_delta_log/` entry is listed directly under `path`.

    Returns False, after printing the error, when `dbutils.fs.ls` raises.
    """
    try:
        for entry in dbutils.fs.ls(path):
            if entry.name == "_delta_log/":
                return True
        return False
    except Exception as e:
        print(f"Failed to access path: {e}")
        return False

# Check and write/append accordingly
if delta_table_exists(delta_path):
    # Append to existing Delta table with schema merging
    # NOTE(review): "enableColumnMapping" does not look like a documented Delta
    # writer option — confirm that it has any effect.
    print("Delta table exists. Appending new data.")
    df.write.format("delta").mode("append").option("mergeSchema", "true").option("enableColumnMapping", "true").save(delta_path)
else:
    # delta_table_exists also returns False when listing the path fails, so this
    # branch must never replace data. "errorifexists" still creates the table when
    # none is present, but aborts instead of overwriting if one is already there.
    print("Delta table does not exist. Creating a new Delta table.")
    df.write.format("delta").mode("errorifexists").save(delta_path)

# Compact the table's files
spark.sql(f"OPTIMIZE delta.`{delta_path}`")

print("Data processed successfully.")