#### Load TableSizeAnalysis data

In [None]:
from datetime import datetime
from pyspark.sql.functions import lit, col

# Lakehouse folder that holds the maintenancedate_* folders
base_path = "Files/Maintenance/TablesMaintenance"

# Earliest maintenance date to load (2024-12-26 at midnight, naive datetime)
start_date = datetime(2024, 12, 26)

def process_data(base_path, time_period, start_date):
    """Load the ``tablesSize`` parquet data of one maintenance phase.

    Reads every
    ``<base_path>/maintenancedate_<YYYY-MM-DD>/TableAnalysis/<subfolder>/<time_period>/tablesSize``
    folder whose maintenance date is on or after ``start_date`` and unions
    the results (position-based ``union``).

    Args:
        base_path: Lakehouse folder containing the ``maintenancedate_*`` folders.
        time_period: Name of the phase folder to read (this notebook passes
            "before" and "after").
        start_date: ``datetime``; folders dated earlier are ignored.

    Returns:
        The combined DataFrame with two extra columns, ``MaintenanceDate``
        (date parsed from the folder name) and ``MaintenanceTime``
        (``time_period.capitalize()``), or ``None`` when nothing was read.

    Failures are reported with ``print`` and never raised: a folder that
    cannot be listed or read is skipped so the remaining ones still load.
    """
    try:
        maintenance_date_folders = mssparkutils.fs.ls(base_path)
    except Exception as e:
        print(f"Failed to list maintenancedate_* subfolders in {base_path}: {e}")
        return None

    final_combined_df = None
    for folder in maintenance_date_folders:
        folder_name = folder.name

        # Only "maintenancedate_<date>" folders are of interest
        if not folder_name.startswith("maintenancedate_"):
            continue

        # Extract the date part from the folder name
        folder_date_str = folder_name.replace("maintenancedate_", "").strip("/")
        try:
            folder_date = datetime.strptime(folder_date_str, "%Y-%m-%d")
        except ValueError as e:
            # A single badly named folder must not abort the whole load
            print(f"Skipping folder {folder_name}: cannot parse a maintenance date from its name: {e}")
            continue

        # Process only folders with dates on or after the start date
        if folder_date < start_date:
            continue

        table_analysis_path = f"{folder.path}/TableAnalysis"
        try:
            for subfolder in mssparkutils.fs.ls(table_analysis_path):
                # Path to the "tablesSize" folder for the given time period
                tables_fullanalysis_path = f"{subfolder.path}/{time_period}/tablesSize"

                if not mssparkutils.fs.exists(tables_fullanalysis_path):
                    print(f"Path does not exist: {tables_fullanalysis_path}. Skipping...")
                    continue

                try:
                    df = spark.read.parquet(tables_fullanalysis_path)

                    # Tag every row with the maintenance date of its folder
                    df = df.withColumn("MaintenanceDate", lit(folder_date_str).cast("date"))

                    final_combined_df = df if final_combined_df is None else final_combined_df.union(df)
                    print(f"Successfully read data from {tables_fullanalysis_path}")
                except Exception as e:
                    print(f"Failed to read data from {tables_fullanalysis_path}: {e}")
        except Exception as e:
            print(f"Failed to list subfolders in {table_analysis_path}: {e}")

    if final_combined_df is not None:
        final_combined_df = final_combined_df.withColumn("MaintenanceTime", lit(time_period.capitalize()))

    return final_combined_df

# Process data for "before" and "after"
before_final_combined_df = process_data(base_path, "before", start_date)
after_final_combined_df = process_data(base_path, "after", start_date)

# Combine whichever phases produced data. process_data returns None when it
# read nothing, so test for None explicitly rather than relying on how a
# DataFrame object evaluates as a boolean.
df_tableSize = None
if before_final_combined_df is not None and after_final_combined_df is not None:
    df_tableSize = before_final_combined_df.union(after_final_combined_df)
elif before_final_combined_df is not None:
    df_tableSize = before_final_combined_df
elif after_final_combined_df is not None:
    df_tableSize = after_final_combined_df

# Write the combined DataFrame back to the Delta table
delta_table_path = "Tables/Maintenance_tablesSize"
if df_tableSize is not None:
    df_tableSize.write.format("delta").mode("overwrite").save(delta_table_path)
    print(f"Data successfully loaded into Delta table at {delta_table_path}")
else:
    print("No data was loaded. Check the paths and files in the Lakehouse.")


StatementMeta(, bd6ccf15-ba85-4fd2-bb04-57318b29a052, 3, Finished, Available, Finished)

Successfully read data from abfss://0409393c-944d-4b64-9222-d4017c7466d0@onelake.dfs.fabric.microsoft.com/83037cfc-6791-4ac4-ae26-38075549718d/Files/Maintenance/TablesMaintenance/maintenancedate_2025-07-14/TableAnalysis/e854310e14a84c4aae15d3dff7412f90/before/tablesSize
Successfully read data from abfss://0409393c-944d-4b64-9222-d4017c7466d0@onelake.dfs.fabric.microsoft.com/83037cfc-6791-4ac4-ae26-38075549718d/Files/Maintenance/TablesMaintenance/maintenancedate_2025-07-14/TableAnalysis/e854310e14a84c4aae15d3dff7412f90/after/tablesSize


Data successfully loaded into Delta table at Tables/Maintenance_tablesSize


#### Load Performance Baseline Data

In [None]:
from datetime import datetime
from pyspark.sql.functions import lit, col
from pyspark.sql import DataFrame

# Earliest maintenance date to load (2025-01-21 at midnight, naive datetime)
start_date = datetime(2025, 1, 21)

# Lakehouse folder that holds the maintenancedate_* folders
base_path = "Files/Maintenance/TablesMaintenance"

# Helper function to read and process parquet files
def process_folder(base_path: str, subfolder_name: str, performance_type: str, start_date: datetime) -> "DataFrame | None":
    """Load one performance metric from every qualifying maintenance folder.

    Reads every
    ``<base_path>/maintenancedate_<YYYY-MM-DD>/PerfBaseline/<guid>/<subfolder_name>``
    parquet folder whose maintenance date is on or after ``start_date`` and
    unions the results (position-based ``union``).

    Args:
        base_path: Lakehouse folder containing the ``maintenancedate_*`` folders.
        subfolder_name: Metric folder to read under each GUID folder.
        performance_type: Value written to the added ``PerformanceType`` column.
        start_date: Folders dated earlier than this are ignored.

    Returns:
        The combined DataFrame with added ``MaintenanceDate`` (date parsed
        from the folder name) and ``PerformanceType`` columns, or ``None``
        when nothing was read.

    Failures are reported with ``print`` and never raised: a folder that
    cannot be listed or read is skipped so the remaining ones still load.
    """
    try:
        maintenance_date_folders = mssparkutils.fs.ls(base_path)
    except Exception as e:
        print(f"Failed to list maintenancedate_* subfolders in {base_path}: {e}")
        return None

    df_result = None
    for folder in maintenance_date_folders:
        folder_name = folder.name

        # Only "maintenancedate_<date>" folders are of interest
        if not folder_name.startswith("maintenancedate_"):
            continue

        folder_date_str = folder_name.replace("maintenancedate_", "").strip("/")
        try:
            folder_date = datetime.strptime(folder_date_str, "%Y-%m-%d")
        except ValueError as e:
            # A single badly named folder must not abort the whole load
            print(f"Skipping folder {folder_name}: cannot parse a maintenance date from its name: {e}")
            continue

        # Process only folders with dates on or after the start date
        if folder_date < start_date:
            continue

        perf_baseline_path = f"{folder.path}/PerfBaseline"
        try:
            for guid_folder in mssparkutils.fs.ls(perf_baseline_path):
                # The metric folder sits directly under the GUID folder
                # (there is no intermediate 'before' folder)
                target_path = f"{guid_folder.path}/{subfolder_name}"

                if not mssparkutils.fs.exists(target_path):
                    print(f"Path does not exist: {target_path}. Skipping...")
                    continue

                try:
                    df_temp = spark.read.parquet(target_path)
                    df_temp = (
                        df_temp
                        .withColumn("MaintenanceDate", lit(folder_date_str).cast("date"))
                        .withColumn("PerformanceType", lit(performance_type))
                    )

                    df_result = df_temp if df_result is None else df_result.union(df_temp)
                    print(f"Successfully read data from {target_path}")
                except Exception as e:
                    print(f"Failed to read data from {target_path}: {e}")
        except Exception as e:
            print(f"Failed to list GUID folders in {perf_baseline_path}: {e}")

    return df_result

# Process each performance type
df_AllocatedCPU = process_folder(base_path, "TopAllocatedCPU", "Allocated CPU", start_date)
df_EllapsedTime = process_folder(base_path, "TopEllapsedTime", "EllapsedTime", start_date)
df_RemoteStorage = process_folder(base_path, "TopRemoteStorage", "RemoteStorage", start_date)
df_RowCount = process_folder(base_path, "TopRowCount", "RowCount", start_date)
df_ScannedMemory = process_folder(base_path, "TopScannedMemory", "ScannedMemory", start_date)

# Combine all performance types into a single DataFrame.
# process_folder returns None when a metric produced no data, so only the
# metrics that were actually loaded take part in the union.
df_Maintenance_Performance = None
for perf_df in (df_AllocatedCPU, df_EllapsedTime, df_RemoteStorage, df_RowCount, df_ScannedMemory):
    if perf_df is None:
        continue
    if df_Maintenance_Performance is None:
        df_Maintenance_Performance = perf_df
    else:
        df_Maintenance_Performance = df_Maintenance_Performance.union(perf_df)

if df_Maintenance_Performance is not None:
    df_Maintenance_Performance = df_Maintenance_Performance.drop("label", "rownum")
else:
    print("No performance baseline data was loaded. Check the paths and files in the Lakehouse.")


StatementMeta(, bd6ccf15-ba85-4fd2-bb04-57318b29a052, 4, Finished, Available, Finished)

Successfully read data from abfss://0409393c-944d-4b64-9222-d4017c7466d0@onelake.dfs.fabric.microsoft.com/83037cfc-6791-4ac4-ae26-38075549718d/Files/Maintenance/TablesMaintenance/maintenancedate_2025-07-14/PerfBaseline/9959b459/TopAllocatedCPU
Successfully read data from abfss://0409393c-944d-4b64-9222-d4017c7466d0@onelake.dfs.fabric.microsoft.com/83037cfc-6791-4ac4-ae26-38075549718d/Files/Maintenance/TablesMaintenance/maintenancedate_2025-07-14/PerfBaseline/9959b459/TopEllapsedTime


Successfully read data from abfss://0409393c-944d-4b64-9222-d4017c7466d0@onelake.dfs.fabric.microsoft.com/83037cfc-6791-4ac4-ae26-38075549718d/Files/Maintenance/TablesMaintenance/maintenancedate_2025-07-14/PerfBaseline/9959b459/TopRemoteStorage
Successfully read data from abfss://0409393c-944d-4b64-9222-d4017c7466d0@onelake.dfs.fabric.microsoft.com/83037cfc-6791-4ac4-ae26-38075549718d/Files/Maintenance/TablesMaintenance/maintenancedate_2025-07-14/PerfBaseline/9959b459/TopRowCount
Successfully read data from abfss://0409393c-944d-4b64-9222-d4017c7466d0@onelake.dfs.fabric.microsoft.com/83037cfc-6791-4ac4-ae26-38075549718d/Files/Maintenance/TablesMaintenance/maintenancedate_2025-07-14/PerfBaseline/9959b459/TopScannedMemory


### **Translate Dataset name and Report name**

#### Specify the workspace(s) used to translate Dataset names and Report names.
For example:
#####  List of workspaces 
workspaces = ["workspace 1", "workspace 2"]   # Add as many as required


In [None]:
# List of workspaces whose datasets and reports are listed in the cells below
# to translate IDs into names. Add as many workspace names as required.
workspaces = ["OnyxTools-Test"]

StatementMeta(, bd6ccf15-ba85-4fd2-bb04-57318b29a052, 5, Finished, Available, Finished)

In [None]:
# Translate Dataset Names

import sempy.fabric as fabric
from pyspark.sql.functions import lit
# Function to generate a DataFrame for a given workspace
def create_dataset_dataframe(workspace_name):
    """Return the datasets of ``workspace_name`` as a Spark DataFrame.

    The listing comes from ``fabric.list_datasets``. Its "Dataset ID" and
    "Dataset Name" columns are renamed to ``DatasetID`` and ``DatasetName``,
    and a constant ``Dataset_Workspace`` column holding ``workspace_name``
    is appended.
    """
    listing = fabric.list_datasets(workspace_name)
    datasets_sdf = spark.createDataFrame(listing)
    datasets_sdf = datasets_sdf.withColumnRenamed("Dataset ID", "DatasetID")
    datasets_sdf = datasets_sdf.withColumnRenamed("Dataset Name", "DatasetName")
    return datasets_sdf.withColumn("Dataset_Workspace", lit(workspace_name))

# Build one dataset DataFrame per workspace and union them together.
# dataset_Name_df stays None when the workspace list is empty.
dataset_Name_df = None
for workspace in workspaces:
    workspace_df = create_dataset_dataframe(workspace)
    if dataset_Name_df is None:
        dataset_Name_df = workspace_df
    else:
        dataset_Name_df = dataset_Name_df.union(workspace_df)

StatementMeta(, bd6ccf15-ba85-4fd2-bb04-57318b29a052, 6, Finished, Available, Finished)

In [None]:
# Translate Report Names

import sempy.fabric as fabric
from pyspark.sql.functions import lit

# Function to generate a DataFrame for a given workspace
def create_report_dataframe(workspace_name):
    """Return the reports of ``workspace_name`` as a Spark DataFrame.

    The listing comes from ``fabric.list_reports``. Its "Id" and "Name"
    columns are renamed to ``ReportId`` and ``Report_Name``, and a constant
    ``Report_Workspace`` column holding ``workspace_name`` is appended.
    """
    listing = fabric.list_reports(workspace_name)
    reports_sdf = spark.createDataFrame(listing)
    reports_sdf = reports_sdf.withColumnRenamed("Id", "ReportId")
    reports_sdf = reports_sdf.withColumnRenamed("Name", "Report_Name")
    return reports_sdf.withColumn("Report_Workspace", lit(workspace_name))

# Build one report DataFrame per workspace and union them together.
# report_name_df stays None when the workspace list is empty.
report_name_df = None
for workspace in workspaces:
    workspace_df = create_report_dataframe(workspace)
    if report_name_df is None:
        report_name_df = workspace_df
    else:
        report_name_df = report_name_df.union(workspace_df)

StatementMeta(, bd6ccf15-ba85-4fd2-bb04-57318b29a052, 7, Finished, Available, Finished)

### Joining Performance Data to Report Name and Dataset name

In [None]:
# Enrich the performance data with dataset and report names.
# Any of the three inputs can be None (no performance data loaded, or an
# empty workspace list), so each case is handled instead of failing on None.
if df_Maintenance_Performance is None:
    # Propagate "no data" so the write step can report it
    print("No performance data available. Skipping the dataset and report name joins.")
    final_df = None
else:
    # Join with dataset DataFrame
    if dataset_Name_df is None:
        # No dataset listing: keep the output columns, with empty names
        # (string type assumed for the name column -- TODO confirm)
        dataset_df = df_Maintenance_Performance.withColumn("DatasetName", lit(None).cast("string"))
    else:
        dataset_df = (
            df_Maintenance_Performance.join(
                dataset_Name_df.select("DatasetId", "DatasetName"),
                on="DatasetId",
                how="left"
            )
            .select(df_Maintenance_Performance["*"], dataset_Name_df["DatasetName"])
        )

    # Final join with report DataFrame
    if report_name_df is None:
        # No report listing: keep the output columns, with empty values
        final_df = (
            dataset_df
            .withColumn("Report_Name", lit(None).cast("string"))
            .withColumn("Report_Workspace", lit(None).cast("string"))
        )
    else:
        final_df = (
            dataset_df.join(
                report_name_df.select("ReportId", "Report_Name", "Report_Workspace"),
                on="ReportId",
                how="left"
            )
            .select(
                dataset_df["*"],
                report_name_df["Report_Name"],
                report_name_df["Report_Workspace"]
            )
        )

#display(final_df)

StatementMeta(, bd6ccf15-ba85-4fd2-bb04-57318b29a052, 8, Finished, Available, Finished)

In [None]:
# Persist the joined performance data as a Delta table, replacing both the
# existing rows and the existing schema.
delta_table_path = "Tables/Maintenance_Performance"

if final_df is None:
    print("No data was loaded. Check the paths and files in the Lakehouse.")
else:
    delta_writer = (
        final_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    delta_writer.save(delta_table_path)
    print(f"Data successfully loaded into Delta table at {delta_table_path}")


StatementMeta(, bd6ccf15-ba85-4fd2-bb04-57318b29a052, 9, Finished, Available, Finished)

Data successfully loaded into Delta table at Tables/Maintenance_Performance
