In [0]:
# Target columns (presumably for dim_reseller): 'ResellerKey','GeographyKey','ResellerAlternateKey','Phone','BusinessType','ResellerName','NumberEmployees','OrderFrequency','OrderMonth','FirstOrderYear','LastOrderYear','ProductLine','AddressLine1','AddressLine2','AnnualSales','BankName','MinPaymentType','MinPaymentAmount','AnnualRevenue','YearOpened'
# Source column lists:
#   Sales.Store.csv:    ["BusinessEntityID", "Name", "SalesPersonID", "Demographics", "rowguid", "ModifiedDate"]
#   Sales.Customer.csv: ["CustomerID", "PersonID", "StoreID", "TerritoryID", "AccountNumber", "rowguid", "ModifiedDate"]
#   Person.Address.csv: ["AddressID", "AddressLine1", "AddressLine2", "City", "StateProvinceID", "PostalCode", "SpatialLocation", "rowguid", "ModifiedDate"]

In [0]:
# 1. SETUP: Imports and Configurations... (omitted for brevity)
from pyspark.sql.functions import (
    col, lit, current_timestamp, trim, to_timestamp, coalesce, sha2, 
    concat_ws, year
)
from delta.tables import DeltaTable
import datetime
import sys

# --- CONFIGURATION ---
# Defines the bronze source paths, the silver target path, the MERGE key columns,
# and registers the storage account key with the Spark session.
import os  # imported here so this configuration block is runnable on its own

# 1a. Define File Locations (Paths) - ASSUMED LOCATIONS
SOURCE_PATH_STORE = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Store/"
SOURCE_PATH_CUSTOMER = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Customer/"
SOURCE_PATH_ADDRESS = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Person.Address/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/dim/dim_reseller/"

# 1b. Define the Unique Identifier (Primary Key)
# PK_RAW is a comma-separated list so a composite key can be configured in one place.
PK_RAW = "ResellerAlternateKey"
PRIMARY_KEYS = [c.strip() for c in PK_RAW.split(",") if c.strip()]
PK_COLS = PRIMARY_KEYS

# 1d. Setup Storage Access (Authentication)
# SECURITY: the account key must never be written into the notebook. It is read
# from an environment variable instead (on a cluster this can be backed by a
# secret store). Any key that was previously committed in this file should be
# rotated, because it has to be considered leaked.
storage_account_name = "scrgvkrmade"
ACCOUNT_KEY_ENV_VAR = "AZURE_STORAGE_ACCOUNT_KEY"
account_key = os.environ.get(ACCOUNT_KEY_ENV_VAR)
if not account_key:
    # Fail here with an actionable message rather than later with an opaque
    # authorization error from the first read.
    raise RuntimeError(
        f"Storage account key not configured: set the {ACCOUNT_KEY_ENV_VAR} "
        f"environment variable for storage account '{storage_account_name}'."
    )
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", account_key)

print(f"Using Primary Key for MERGE: {PK_COLS}")

In [0]:
# 2a. Load the three bronze parquet datasets; any read failure is reported and
# the run is stopped with exit status 1.
try:
    df_store, df_customer, df_address = [
        spark.read.option("mergeSchema", "true").parquet(bronze_path)
        for bronze_path in (SOURCE_PATH_STORE, SOURCE_PATH_CUSTOMER, SOURCE_PATH_ADDRESS)
    ]
except Exception as read_error:
    print(f"Error reading source data. Exiting: {read_error}")
    sys.exit(1)

# Schema check: print the Store columns so the names used in the SQL below can
# be confirmed against what was actually loaded.
print("\n--- SCHEMA CHECK: df_store ---")
df_store.printSchema()
print("------------------------------\n")

# 2b. Expose each DataFrame to Spark SQL under a temp-view name.
for view_name, source_df in (
    ("Store", df_store),
    ("Customer", df_customer),
    ("Address", df_address),
):
    source_df.createOrReplaceTempView(view_name)

In [0]:
# 2c. Define and Execute the Spark SQL Join Query
# Purpose: build `df_final` by left-joining the `Store`, `Customer` and `Address`
# temp views (Store is the base, so every store row is kept).
print("2. Joining data using Spark SQL...")

# **!!! UPDATE THE COLUMN NAMES BELOW based on the SCHEMA CHECK !!!**
# Assuming the column is correctly named 'BusinessEntityID' as per your initial plan:
# NOTE(review): the source column lists in the first cell of this notebook show
# neither `Phone` on Sales.Store nor `AddressID` on Sales.Customer. If the parquet
# schemas match those lists, `s.Phone` and `c.AddressID` below will not resolve --
# confirm against the printed schemas before relying on this query.
spark_sql_query = """
SELECT
    -- Keys & Basic Attributes from Store (Base Reseller)
    s.BusinessEntityID AS ResellerAlternateKey,
    s.Name AS ResellerName,
    s.Phone,
    s.ModifiedDate AS ModifiedDate_Store,
    
    -- Address Attributes
    a.AddressLine1,
    a.AddressLine2,
    a.ModifiedDate AS ModifiedDate_Addr,
    
    -- Geography Link
    c.TerritoryID AS TerritoryID_Reseller
    
FROM
    Store s
LEFT JOIN
    Customer c ON s.BusinessEntityID = c.StoreID       -- Link Store to Customer
LEFT JOIN
    Address a ON c.AddressID = a.AddressID             -- Link Customer to Address
"""
# If the column is NOT 'BusinessEntityID', change it in the SQL above.
# For example, if it's 'StoreID', you would change s.BusinessEntityID to s.StoreID.

# Run the query against the temp views; the result is kept as `df_final`.
df_final = spark.sql(spark_sql_query)

In [0]:

# Self-contained setup cell: configuration, storage authentication, source
# reads, temp-view registration and a schema printout for debugging.
import os  # imported here so this cell is runnable on its own

# 1a. Define File Locations (Paths) - ASSUMED LOCATIONS
SOURCE_PATH_STORE = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Store/"
SOURCE_PATH_CUSTOMER = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Customer/"
SOURCE_PATH_ADDRESS = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Person.Address/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/dim/dim_reseller/"

# 1b. Define the Unique Identifier (Primary Key)
# PK_RAW is a comma-separated list so a composite key can be configured in one place.
PK_RAW = "ResellerAlternateKey"
PRIMARY_KEYS = [c.strip() for c in PK_RAW.split(",") if c.strip()]
PK_COLS = PRIMARY_KEYS

# 1d. Setup Storage Access (Authentication)
# SECURITY: the account key must never be written into the notebook. It is read
# from an environment variable instead (on a cluster this can be backed by a
# secret store). Any key that was previously committed in this file should be
# rotated, because it has to be considered leaked.
storage_account_name = "scrgvkrmade"
ACCOUNT_KEY_ENV_VAR = "AZURE_STORAGE_ACCOUNT_KEY"
account_key = os.environ.get(ACCOUNT_KEY_ENV_VAR)
if not account_key:
    # Fail here with an actionable message rather than later with an opaque
    # authorization error from the first read.
    raise RuntimeError(
        f"Storage account key not configured: set the {ACCOUNT_KEY_ENV_VAR} "
        f"environment variable for storage account '{storage_account_name}'."
    )
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", account_key)
print(f"Using Primary Key for MERGE: {PK_COLS}")

# 2a. Read Source DataFrames
df_store = spark.read.option("mergeSchema", "true").parquet(SOURCE_PATH_STORE)
df_customer = spark.read.option("mergeSchema", "true").parquet(SOURCE_PATH_CUSTOMER)
df_address = spark.read.option("mergeSchema", "true").parquet(SOURCE_PATH_ADDRESS)

# 2b. Register DataFrames as Temporary Views
df_store.createOrReplaceTempView("Store")
df_customer.createOrReplaceTempView("Customer")
df_address.createOrReplaceTempView("Address")

# -------------------------------------------------------------
# *** CRITICAL DEBUGGING SECTION ***
# Print the Store schema so the column names used by the join SQL can be checked.
print("\n--- SCHEMA CHECK: Store Table Columns ---")
df_store.printSchema()
print("------------------------------------------\n")
# -------------------------------------------------------------
# -------------------------------------------------------------

In [0]:
# Join query variant keyed on `StoreID` instead of `BusinessEntityID`.
# This cell is Python: it only defines the query text. The former `%sql` magic
# was removed because it made the notebook treat the Python assignment below as
# SQL, so the cell could not run and `spark_sql_query` was never defined.
# Pass the string to spark.sql(...) to execute it.
spark_sql_query = """
SELECT
    -- KEYS: CHANGE 'BusinessEntityID' to 'StoreID' here
    s.StoreID AS ResellerAlternateKey, 
    s.Name AS ResellerName,
    s.Phone,
    s.ModifiedDate AS ModifiedDate_Store,
    
    -- Address Attributes
    a.AddressLine1,
    a.AddressLine2,
    a.ModifiedDate AS ModifiedDate_Addr,
    
    -- Geography Link
    c.TerritoryID AS TerritoryID_Reseller
    
FROM
    Store s
LEFT JOIN
    Customer c ON s.StoreID = c.StoreID       -- CHANGE 'BusinessEntityID' to 'StoreID' here
LEFT JOIN
    Address a ON c.AddressID = a.AddressID             
"""

In [0]:
# Self-contained setup cell: configuration, storage authentication, source
# reads and registration of the `src_*` temp views.
import os  # imported here so this cell is runnable on its own

# 1a. Define File Locations (Paths) - ASSUMED LOCATIONS
SOURCE_PATH_STORE = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Store/"
SOURCE_PATH_CUSTOMER = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Sales.Customer/"
SOURCE_PATH_ADDRESS = "abfss://project@scrgvkrmade.dfs.core.windows.net/bronze/ResellerSales/Person.Address/"
TARGET_PATH = "abfss://project@scrgvkrmade.dfs.core.windows.net/silver/dim/dim_reseller/"


# 1d. Setup Storage Access (Authentication)
# SECURITY: the account key must never be written into the notebook. It is read
# from an environment variable instead (on a cluster this can be backed by a
# secret store). Any key that was previously committed in this file should be
# rotated, because it has to be considered leaked.
storage_account_name = "scrgvkrmade"
ACCOUNT_KEY_ENV_VAR = "AZURE_STORAGE_ACCOUNT_KEY"
account_key = os.environ.get(ACCOUNT_KEY_ENV_VAR)
if not account_key:
    # Fail here with an actionable message rather than later with an opaque
    # authorization error from the first read.
    raise RuntimeError(
        f"Storage account key not configured: set the {ACCOUNT_KEY_ENV_VAR} "
        f"environment variable for storage account '{storage_account_name}'."
    )
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", account_key)

# 2. Read parquet with mergeSchema (if needed) and create temp views
df_store = spark.read.option("mergeSchema", "true").parquet(SOURCE_PATH_STORE)
df_customer = spark.read.option("mergeSchema", "true").parquet(SOURCE_PATH_CUSTOMER)
df_address = spark.read.option("mergeSchema", "true").parquet(SOURCE_PATH_ADDRESS)

df_store.createOrReplaceTempView("src_store")
df_customer.createOrReplaceTempView("src_customer")
df_address.createOrReplaceTempView("src_address")

print("Temp views created: src_store, src_customer, src_address")
print("TARGET_PATH =", TARGET_PATH)



In [0]:
# Show the schema and a 10-row sample for each source temp view.
# A notebook only renders a bare expression when it is the LAST statement of the
# cell, so every sample is passed to display() explicitly; otherwise the store
# and customer samples would be computed and then silently discarded.
for view_name in ("src_store", "src_customer", "src_address"):
    view_df = spark.table(view_name)
    print(f"=== {view_name} schema ===")
    view_df.printSchema()
    print(f"=== {view_name} sample rows ===")
    # limit() before toPandas() keeps the amount of data pulled to the driver small.
    display(view_df.limit(10).toPandas())


In [0]:
# Run this Python cell in your notebook (assumes src_store, src_customer, src_address temp views exist).
import re
from pyspark.sql import DataFrame

def show_schema_and_sample(view_name, n=5):
    """Print the schema of a temp view and render its first rows.

    Args:
        view_name: Name of the table/temp view to look up via ``spark.table``.
        n: Number of rows to include in the rendered sample (default 5).

    Returns:
        None. Output is produced via ``print``, ``printSchema`` and ``display``.
    """
    schema_banner = f"\n--- {view_name} schema ---"
    sample_banner = f"--- sample rows ({view_name}) ---"

    print(schema_banner)
    view_df = spark.table(view_name)
    view_df.printSchema()

    print(sample_banner)
    preview = view_df.limit(n)
    display(preview)

# 1) Inspect every source view: schema first, then a handful of rows.
for source_view in ("src_store", "src_customer", "src_address"):
    show_schema_and_sample(source_view)

# 2) helper to find candidate key columns by name heuristics
def find_candidates(cols):
    """Rank column names that look like join/key columns, best candidate first.

    Each name is split into lowercase words on camelCase transitions and on any
    non-alphanumeric separator (``BusinessEntityID`` -> business, entity, id;
    ``store_id`` -> store, id). Matching whole words instead of using regex
    ``\\b`` boundaries is what makes compound names score: ``\\b`` never matches
    inside ``businessentityid`` or next to an underscore, so names such as
    ``BusinessEntityID`` or ``CustomerID`` would otherwise get no score at all.

    Scoring (additive):
        +10  store id          ("storeid" appears in the squashed name)
        +8   reseller          ("reseller" appears in the squashed name)
        +8   business entity   ("businessentity" appears in the squashed name)
        +5   a standalone word "business" or "entity"
        +3   a standalone word "id"
        +2   a standalone word "key"

    The short, ambiguous markers (business/entity/id/key) are matched only as
    whole words so that names like ``rowguid`` or ``identity`` do not qualify.

    Args:
        cols: Iterable of column-name strings.

    Returns:
        List of the column names with a positive score, sorted by descending
        score and then alphabetically. Empty if nothing looks like a key.
    """
    word_pattern = r'[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+'

    candidates = []
    for c in cols:
        words = [w.lower() for w in re.findall(word_pattern, c)]
        squashed = "".join(words)  # name without separators, e.g. "businessentityid"

        score = 0
        if 'storeid' in squashed: score += 10
        if 'reseller' in squashed: score += 8
        if 'businessentity' in squashed: score += 8
        if 'business' in words or 'entity' in words: score += 5
        if 'id' in words: score += 3
        if 'key' in words: score += 2
        if score > 0:
            candidates.append((c, score))

    # sort descending by score, then by name
    candidates.sort(key=lambda x: (-x[1], x[0]))
    return [c for c, s in candidates]

# Column names of each source view.
cols_store = spark.table("src_store").columns
cols_cust = spark.table("src_customer").columns
cols_addr = spark.table("src_address").columns

# Rank likely key columns per view (best candidate first).
cand_store = find_candidates(cols_store)
cand_cust = find_candidates(cols_cust)

print("\nCandidate key columns found in src_store (best first):", cand_store)
print("Candidate key columns found in src_customer (best first):", cand_cust)

# 3) Pick the pair of columns to join on.
# First preference: the best store candidate whose name also exists on the
# customer side (compared case-insensitively).
lower_cust = {name.lower(): name for name in cols_cust}
shared_name = next((name for name in cand_store if name.lower() in lower_cust), None)
if shared_name is None:
    chosen_store_col = None
    chosen_cust_col = None
else:
    chosen_store_col = shared_name
    chosen_cust_col = lower_cust[shared_name.lower()]

# Second preference: take the best customer candidate and pair it with a
# same-named store column, or failing that with the best store candidate.
if not chosen_store_col and cand_cust:
    chosen_cust_col = cand_cust[0]
    same_named = [name for name in cols_store if name.lower() == chosen_cust_col.lower()]
    if same_named:
        chosen_store_col = same_named[0]
    elif cand_store:
        chosen_store_col = cand_store[0]
    else:
        chosen_store_col = None

# Last resort: fall back to whichever top candidates exist.
if cand_store and not chosen_store_col:
    chosen_store_col = cand_store[0]
if cand_cust and not chosen_cust_col:
    chosen_cust_col = cand_cust[0]

print("\nAuto-chosen join columns:")
print("  src_store ->", chosen_store_col)
print("  src_customer ->", chosen_cust_col)

# 4) Both sides are required before vw_reseller_source can be built.
if not (chosen_store_col and chosen_cust_col):
    raise RuntimeError("No good key candidates found. Inspect the printed schemas above and pick the correct join columns.")

# Names consumed by the SQL that creates vw_reseller_source.
store_col, cust_col = chosen_store_col, chosen_cust_col

# Build and run the SQL that creates the temp view `vw_reseller_source` from the
# src_store / src_customer / src_address views, joined on the chosen key columns.

# Quote the dynamically chosen column names safely: inside a backtick-quoted
# Spark SQL identifier a literal backtick must be doubled.
store_ident = "`{}`".format(store_col.replace("`", "``"))
cust_ident = "`{}`".format(cust_col.replace("`", "``"))

# Lineage values are optional: fall back to '' when the config cell was not run.
source_path = globals().get("SOURCE_PATH_STORE", "")
target_path = globals().get("TARGET_PATH", "")

# NOTE(review): the source column lists in the first cell of this notebook show
# neither `Phone` on Sales.Store nor `AddressID` on Sales.Customer. If the loaded
# schemas match those lists, s.`Phone` and c.AddressID will not resolve -- confirm
# against the printed schemas.
# The address join compares the keys directly. Wrapping both sides in
# coalesce(..., 0) would make NULL keys equal to each other (and to id 0), which
# can attach an unrelated address to stores that have no customer row.
sql = f"""
CREATE OR REPLACE TEMP VIEW vw_reseller_source AS
SELECT
  COALESCE(CAST(s.{store_ident} AS BIGINT), CAST(c.{cust_ident} AS BIGINT)) AS ResellerAlternateKey,
  trim(s.`Name`) AS ResellerName,
  trim(coalesce(s.`Phone`, 'N/A')) AS Phone,
  a.AddressLine1 AS AddressLine1,
  CASE WHEN a.AddressLine2 IS NULL OR a.AddressLine2 = '' THEN NULL ELSE a.AddressLine2 END AS AddressLine2,
  to_timestamp(coalesce(a.ModifiedDate, s.ModifiedDate)) AS ModifiedDate,
  coalesce(c.TerritoryID, -1) AS GeographyKey,
  year(to_timestamp(coalesce(a.ModifiedDate, s.ModifiedDate))) AS _year,
  current_timestamp() AS LoadTS,
  sha2(concat_ws('||',
       coalesce(trim(s.`Name`), ''),
       coalesce(trim(coalesce(s.`Phone`, 'N/A')), ''),
       coalesce(a.AddressLine1, ''),
       coalesce(a.AddressLine2, '')
     ), 256) AS __row_hash,
  current_timestamp() AS __ingest_ts,
  '{source_path}' AS __source_path,
  '{target_path}' AS __target_path,
  concat('Batch-', date_format(current_timestamp(), 'yyyyMMddHHmmss')) AS __batch_id,
  NULL AS ResellerKey,
  NULL AS BusinessType,
  NULL AS NumberEmployees,
  NULL AS OrderFrequency,
  NULL AS OrderMonth,
  NULL AS FirstOrderYear,
  NULL AS LastOrderYear,
  NULL AS ProductLine,
  NULL AS AnnualSales,
  NULL AS BankName,
  NULL AS MinPaymentType,
  NULL AS MinPaymentAmount,
  NULL AS AnnualRevenue,
  NULL AS YearOpened
FROM
  (SELECT DISTINCT * FROM src_store) s
LEFT JOIN
  (SELECT DISTINCT * FROM src_customer) c
  ON CAST(s.{store_ident} AS BIGINT) = CAST(c.{cust_ident} AS BIGINT)
LEFT JOIN
  (SELECT AddressID AS AddressID_Addr, AddressLine1, AddressLine2, ModifiedDate FROM src_address) a
  ON c.AddressID = a.AddressID_Addr
WHERE COALESCE(CAST(s.{store_ident} AS BIGINT), CAST(c.{cust_ident} AS BIGINT)) IS NOT NULL
"""
print("\n--- Creating vw_reseller_source with SQL using chosen columns ---\n")
print(sql[:1000], "...")  # print a snippet for review
spark.sql(sql)
print("\nvw_reseller_source created successfully. Run: spark.table('vw_reseller_source').show(5) to confirm.")
