In [0]:
# Load the column-mapping metadata (raw table/column -> curated table/column)
# and preview it.
metadata_table = "workspace.users.employee_and_sales_metadata"
df_metadata = spark.table(metadata_table)
display(df_metadata)

ProcessName,RawTableName,RawTableColumn,RawTableColumnDataType,Logic,CuratedTableName,CuratedTableColumn,CuratedTableColumnDataType
EmployeeInfo,Employee,ID,STRING,ID is NOT NULL,cur_Employee,EmployeeID,STRING
EmployeeInfo,Employee,Name,STRING,,cur_Employee,EmployeeName,STRING
SalesInfo,Sales,ID,INTEGER,ID is NOT NULL,cur_Sales,ItemID,INTEGER
SalesInfo,Sales,Name,STRING,,cur_Sales,ItemName,STRING
SalesInfo,Sales,Txn_Date,DATE,,cur_Sales,TransactionDate,DATE


In [0]:
def drop_and_create_tables(tables_dict):
    """Drop and recreate every table described in ``tables_dict``.

    Args:
        tables_dict: Mapping of table name to a list of column definition
            strings (for example ``"ID STRING"``). The definitions are joined
            with ``", "`` to form the column list of the CREATE TABLE statement.

    Raises:
        ValueError: If any table has no column definitions. Every entry is
            checked before the first table is dropped.

    Table names and column definitions are interpolated into the SQL text
    unchanged, so they must come from a trusted source.
    """
    # Validate up front: an empty column list would produce an invalid
    # CREATE TABLE statement *after* the existing table was already dropped.
    empty_tables = [str(name) for name, columns in tables_dict.items() if not columns]
    if empty_tables:
        names = ", ".join(empty_tables)
        raise ValueError(f"No column definitions supplied for table(s): {names}")

    for table_name, columns in tables_dict.items():
        col_def = ", ".join(columns)
        drop_sql = f"DROP TABLE IF EXISTS {table_name}"
        create_sql = f"CREATE TABLE {table_name} ({col_def})"

        spark.sql(drop_sql)
        spark.sql(create_sql)
        print(f"Created Table: {table_name}")

In [0]:
from collections import defaultdict
# Group "<column> <data type>" definitions by table name, once for the raw
# layer and once for the curated layer.
raw_tables = defaultdict(list)
curated_tables = defaultdict(list)

for meta_row in df_metadata.collect():
    raw_tables[meta_row['RawTableName']].append(
        f"{meta_row['RawTableColumn']} {meta_row['RawTableColumnDataType']}"
    )
    curated_tables[meta_row['CuratedTableName']].append(
        f"{meta_row['CuratedTableColumn']} {meta_row['CuratedTableColumnDataType']}"
    )

# Recreate the raw tables first, then the curated ones.
drop_and_create_tables(raw_tables)
drop_and_create_tables(curated_tables)

Created Table: Employee
Created Table: Sales
Created Table: cur_Employee
Created Table: cur_Sales


In [0]:
%sql
-- Seed the raw tables with sample rows. The single-row inserts omit ID, so
-- those rows carry a NULL ID.
INSERT INTO default.employee VALUES
  ('201', 'Neha'),
  ('202', 'Nakul');
INSERT INTO default.employee (Name) VALUES ('Shreya');
INSERT INTO default.sales VALUES
  (101, 'Curd', '2025-07-05'),
  (102, 'Jam', '2025-07-09'),
  (103, 'Ketchup', '2025-06-09');
INSERT INTO default.sales (Name, Txn_Date) VALUES ('Ghee', '2025-06-21');

num_affected_rows,num_inserted_rows
1,1


In [0]:
%sql
-- Inspect the raw employee rows.
SELECT * FROM default.employee;

ID,Name
201.0,Neha
202.0,Nakul
,Shreya


In [0]:
%sql
-- Inspect the raw sales rows.
SELECT * FROM default.sales

ID,Name,Txn_Date
101.0,Curd,2025-07-05
102.0,Jam,2025-07-09
103.0,Ketchup,2025-06-09
,Ghee,2025-06-21


In [0]:
# Widget holding a comma-separated list of process names to run
# (for example "EmployeeInfo, SalesInfo").
dbutils.widgets.text("process_name", " ", "Enter process name")
process_name = dbutils.widgets.get("process_name")

# Split on commas and drop blank entries so stray spaces or trailing commas
# are ignored. With the widget's default value (a single space) this list is
# empty and the filter below matches no metadata rows.
input_process_names = [name.strip() for name in process_name.split(",") if name.strip()]

# Use the column's exact name: the lower-case spelling only resolves while
# spark.sql.caseSensitive is false.
filtered_df = df_metadata.filter(df_metadata['ProcessName'].isin(input_process_names))
display(filtered_df)

ProcessName,RawTableName,RawTableColumn,RawTableColumnDataType,Logic,CuratedTableName,CuratedTableColumn,CuratedTableColumnDataType
EmployeeInfo,Employee,ID,STRING,ID is NOT NULL,cur_Employee,EmployeeID,STRING
EmployeeInfo,Employee,Name,STRING,,cur_Employee,EmployeeName,STRING
SalesInfo,Sales,ID,INTEGER,ID is NOT NULL,cur_Sales,ItemID,INTEGER
SalesInfo,Sales,Name,STRING,,cur_Sales,ItemName,STRING
SalesInfo,Sales,Txn_Date,DATE,,cur_Sales,TransactionDate,DATE


In [0]:
from pyspark.sql.functions import col
# Distinct raw tables referenced by the selected processes; each one is
# copied into its curated table in turn.
raw_tables_list = [row["RawTableName"] for row in filtered_df.select("RawTableName").distinct().collect()]

for raw_table in raw_tables_list:
    # Metadata rows (one per column mapping) that describe this raw table.
    table_meta = filtered_df.filter(filtered_df['RawTableName'] == raw_table).collect()
    print(table_meta)
    raw_df = spark.table(raw_table)
    display(raw_df)
    raw_columns = raw_df.columns  # for validation
    print(raw_columns)
    select_exprs = []
    logic_conditions = []

    for row in table_meta:
        raw_col = row['RawTableColumn']
        curated_col = row['CuratedTableColumn']
        # Logic can be NULL in the metadata (None here), blank, or the literal
        # text "null"; normalise it so all three mean "no rule".
        logic = (row['Logic'] or "").strip()

        if raw_col in raw_columns:
            select_exprs.append(col(raw_col).alias(curated_col))
        else:
            print(f"Column `{raw_col}` not found in table `{raw_table}`. Skipping.")
            continue

        if logic and logic.lower() != "null":
            logic_conditions.append(logic)

    if not select_exprs:
        print(f"No valid columns found for table `{raw_table}`. Skipping.")
        continue

    # Parenthesise each rule before joining: without this, a rule that
    # contains OR would bind incorrectly against the surrounding ANDs.
    combined_filter = " AND ".join(f"({condition})" for condition in logic_conditions)
    final_df = raw_df.filter(combined_filter) if logic_conditions else raw_df
    final_df = final_df.select(*select_exprs)

    # NOTE(review): only the first metadata row's CuratedTableName is used, so
    # this assumes every row for a raw table targets the same curated table.
    curated_table = table_meta[0]['CuratedTableName']

    # insertInto resolves columns by position and ignores their names, while
    # the order of collected metadata rows is not guaranteed. Line the
    # DataFrame up with the target table's column order before writing.
    final_df = final_df.select(*spark.table(curated_table).columns)
    final_df.write.mode("append").insertInto(curated_table)
    display(final_df)
    print(f"Transferred data from `{raw_table}` to `{curated_table}`")


[Row(ProcessName='EmployeeInfo', RawTableName='Employee', RawTableColumn='ID', RawTableColumnDataType='STRING', Logic='ID is NOT NULL', CuratedTableName='cur_Employee', CuratedTableColumn='EmployeeID', CuratedTableColumnDataType='STRING'), Row(ProcessName='EmployeeInfo', RawTableName='Employee', RawTableColumn='Name', RawTableColumnDataType='STRING', Logic=None, CuratedTableName='cur_Employee', CuratedTableColumn='EmployeeName', CuratedTableColumnDataType='STRING')]


ID,Name
201.0,Neha
202.0,Nakul
,Shreya


['ID', 'Name']
[Column<'ID AS EmployeeID'>, Column<'Name AS EmployeeName'>]
['ID is NOT NULL']


EmployeeID,EmployeeName
201,Neha
202,Nakul


Transferred data from `Employee` to `cur_Employee`
[Row(ProcessName='SalesInfo', RawTableName='Sales', RawTableColumn='ID', RawTableColumnDataType='INTEGER', Logic='ID is NOT NULL', CuratedTableName='cur_Sales', CuratedTableColumn='ItemID', CuratedTableColumnDataType='INTEGER'), Row(ProcessName='SalesInfo', RawTableName='Sales', RawTableColumn='Name', RawTableColumnDataType='STRING', Logic=None, CuratedTableName='cur_Sales', CuratedTableColumn='ItemName', CuratedTableColumnDataType='STRING'), Row(ProcessName='SalesInfo', RawTableName='Sales', RawTableColumn='Txn_Date', RawTableColumnDataType='DATE', Logic=None, CuratedTableName='cur_Sales', CuratedTableColumn='TransactionDate', CuratedTableColumnDataType='DATE')]


ID,Name,Txn_Date
101.0,Curd,2025-07-05
102.0,Jam,2025-07-09
103.0,Ketchup,2025-06-09
,Ghee,2025-06-21


['ID', 'Name', 'Txn_Date']
[Column<'ID AS ItemID'>, Column<'Name AS ItemName'>, Column<'Txn_Date AS TransactionDate'>]
['ID is NOT NULL']


ItemID,ItemName,TransactionDate
101,Curd,2025-07-05
102,Jam,2025-07-09
103,Ketchup,2025-06-09


Transferred data from `Sales` to `cur_Sales`


In [0]:
%sql
-- Check the rows that were loaded into the curated sales table.
SELECT * FROM cur_sales;


ItemID,ItemName,TransactionDate
101,Curd,2025-07-05
102,Jam,2025-07-09
103,Ketchup,2025-06-09


In [0]:
%sql
-- Check the rows that were loaded into the curated employee table.
SELECT * FROM cur_Employee;

EmployeeID,EmployeeName
201,Neha
202,Nakul
