In [0]:
%run "/Workspace/Users/ruchika.b.mhetre@v4c.ai/vstone_project/vstone_databricks_pipeline/src/notebooks/00_Setup/project_config"

In [0]:
from pyspark.sql.functions import col, count, when, current_timestamp
import pandas as pd

# Data Ingestion

In [0]:
# 1. Main Transactions
# Comma-separated file with a header row; column types are inferred, not declared.
# NOTE(review): `volume_path` is not defined in this notebook -- presumably it is
# provided by the project_config notebook executed via %run; confirm.
df_main = spark.read.csv(
    f"{volume_path}/1_main.csv",
    header=True,
    inferSchema=True,
)


In [0]:
# 2. Catalogs
# Unlike the other inputs, this file is read with a semicolon field delimiter.
# Header row is used for column names; column types are inferred.
df_catalogs = spark.read.csv(
    f"{volume_path}/catalogs.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

In [0]:
# 3. Geographic Data
# Comma-separated file with a header row; column types are inferred.
df_geo = spark.read.csv(
    f"{volume_path}/final_geographic.csv",
    header=True,
    inferSchema=True,
)

# Status message for the reader once the last of the three reads has returned.
print("All datasets loaded successfully.")

# Visual Inspection

In [0]:
# Show only the first five rows so the rendered output stays small.
print("Previewing Main Transactions (Top 5):")
main_preview = df_main.limit(5)
display(main_preview)

# Schema & Structural Profiling

In [0]:
# Print the schema of each profiled dataset under its own banner line.
schema_reports = (
    ("--- Main Transactions Schema ---", df_main),
    ("--- Catalog Schema ---", df_catalogs),
)
for banner, frame in schema_reports:
    print(banner)
    frame.printSchema()

# Null Value Analysis (Quality Check)

In [0]:
print("--- Null Values Count across Main Dataset ---")

# Build a one-row frame with one output column per input column, each holding
# the number of rows in which that column is NULL. `when(...)` without an
# `otherwise` evaluates to NULL for non-matching rows, and `count` only counts
# non-NULL values, so the result is the NULL tally for that column.
#
# Column names come straight from the CSV header, so they may contain dots or
# other characters that Spark would otherwise parse as a nested-field path.
# Each name is therefore backtick-quoted (embedded backticks are doubled) before
# being resolved; for ordinary names this is equivalent to the unquoted form.
null_counts = df_main.select(
    [
        count(
            when(col("`" + c.replace("`", "``") + "`").isNull(), c)
        ).alias(c)
        for c in df_main.columns
    ]
)

display(null_counts)

In [0]:
# Primary-key check on the "id" column: compare the total row count with the
# number of distinct "id" values; any surplus rows are reported as duplicates.
id_values = df_main.select("id")
total_rows = df_main.count()
distinct_ids = id_values.distinct().count()
duplicate_count = total_rows - distinct_ids

# Emit the three summary figures, one per line.
print(
    f"Total Rows: {total_rows}",
    f"Distinct IDs: {distinct_ids}",
    f"Number of duplicate IDs: {duplicate_count}",
    sep="\n",
)

# Final verdict: any positive surplus means the key is not unique.
pk_verdict = (
    "ACTION REQUIRED: Duplicate IDs found."
    if duplicate_count > 0
    else "PK Integrity: No duplicate IDs found."
)
print(pk_verdict)