#### Workspace and Lakehouse Utility Functions

This section contains utility functions to resolve workspace and lakehouse IDs using the semantic-link (sempy) library.

In [None]:
import sempy.fabric as fabric
# Ask semantic-link for the ID of the item of type 'SqlEndpoint' whose name is
# `lakehouse_name` (presumably the lakehouse's SQL analytics endpoint - confirm),
# scoped to `workspace_id`.
SqlEndpoint_id = fabric.resolve_item_id(lakehouse_name, 'SqlEndpoint', workspace_id)

# Echo the IDs in use, one per line, so they show up in the cell output.
print(
    f"Current workspace ID: {workspace_id}",
    f"Current lakehouse ID: {lakehouse_id}",
    f"Current SqlEndpoint ID: {SqlEndpoint_id}",
    sep="\n",
)


In [None]:
from pyspark.sql.functions import col, lit, current_timestamp
from datetime import datetime
import uuid


# Base folder for maintenance output. The trailing slash is intentional: the
# Scala cell that builds the full path appends the date folder to this value
# without inserting a separator.
maintenance_folder = "Files/Maintenance/TablesMaintenance/"

# Take ONE timestamp and derive both date strings from it, so they always
# describe the same day even if the cell happens to run across midnight.
_maintenance_now = datetime.now()
current_date = _maintenance_now.strftime('%Y-%m-%d')      # e.g. 2024-05-17
maintenance_date = _maintenance_now.strftime('%Y%m%d')    # e.g. 20240517
maintenance_date_folder = f"maintenancedate_{current_date}"

# First 8 characters of a random UUID, used as a per-run sub-folder name.
distinct_folder = str(uuid.uuid4())[:8]




#### Performance

In [None]:
%%spark
// Sub-folder name for this section's output; it is spliced into `fullpath`
// between the date folder and the per-run folder.
val specificpath: String = "PerfBaseline"

In [None]:
%%pyspark

# Publish the Python-side settings through the Spark session configuration so
# the Scala cells can read them back with spark.conf.get under the same keys.
for conf_key, conf_value in {
    "SqlEndpointID": SqlEndpoint_id,
    "workspaceID": workspace_id,
    "maintenanceFolder": maintenance_folder,
    "maintenanceDateFolder": maintenance_date_folder,
    "lakehouseName": lakehouse_name,
    "distinctFolder": distinct_folder,
}.items():
    spark.conf.set(conf_key, conf_value)

In [None]:
%%spark
// Read back the values that the Python cell stored in the Spark session
// configuration, using the same keys.
val SqlEndpointID = spark.conf.get("SqlEndpointID")
val workspaceID = spark.conf.get("workspaceID")
val lakehouseName = spark.conf.get("lakehouseName")
val maintenanceFolder = spark.conf.get("maintenanceFolder")
val maintenanceDateFolder = spark.conf.get("maintenanceDateFolder")
val distinctFolder = spark.conf.get("distinctFolder")

// Output location: <maintenanceFolder><date folder>/<specificpath>/<per-run folder>.
// No separator is added after maintenanceFolder; the value is used as-is.
val fullpath =
  maintenanceFolder + Seq(maintenanceDateFolder, specificpath, distinctFolder).mkString("/")

println(s"workspaceID: $workspaceID")
println(s"Performance data will be saved to: $fullpath")

In [None]:
%%spark
import java.time.LocalDate
import java.time.format.DateTimeFormatter

// Lower bound for the look-back window: today's date (as reported by
// LocalDate.now()) minus 7 days, rendered as yyyy-MM-dd text.
val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")

val dateSevenDaysAgo = LocalDate.now().minusDays(7)
val dateString = formatter.format(dateSevenDaysAgo)


In [None]:
%%spark
/**
 * Builds the CTE prelude ("with ... as (...), num as (...)") shared by all
 * top-N performance queries.
 *
 * The first CTE reads `queryinsights.exec_requests_history`, keeps only rows
 * whose status is 'Succeeded', whose `program_name` is one of the listed client
 * drivers, whose `start_time` is later than `sinceDate`, and whose command does
 * not contain `sys.sp_set_session_context`. CR/LF characters are removed from
 * `command`, and DatasetId / ReportId / VisualId are extracted from `label`
 * when `label` is valid JSON (NULL otherwise).
 *
 * The second CTE, always named `num`, adds `rownum`: the row's rank within its
 * (DatasetId, ReportId) group, ordered by `rankingColumn` descending.
 *
 * The returned text carries no surrounding quote characters. The previous
 * hand-written literals embedded a literal `"` at both ends (inconsistently for
 * the fifth one); callers that still strip a leading/trailing `"` keep working
 * because those strips simply become no-ops.
 *
 * @param cteName       name given to the first CTE (referenced by `num`)
 * @param rankingColumn column of the first CTE used in the ORDER BY of row_number()
 * @param sinceDate     exclusive lower bound for `start_time`; defaults to `dateString`
 * @return the SQL prelude, ending right after the closing parenthesis of `num`
 */
def topRankedPrepareQuery(
    cteName: String,
    rankingColumn: String,
    sinceDate: String = dateString
): String =
  // `$$` is the s-interpolator escape for a literal `$`, so the JSON paths reach
  // SQL as '$.DatasetId', '$.Sources[0].ReportId' and '$.Sources[0].VisualId'.
  s"""with $cteName as (
     |    select distributed_statement_id, program_name, data_scanned_disk_mb, data_scanned_memory_mb, data_scanned_remote_storage_mb,
     |    REPLACE(REPLACE(command, CHAR(13), ''), CHAR(10), '') as command,
     |    total_elapsed_time_ms, start_time, end_time, allocated_cpu_time_ms, status, row_count,
     |        CASE
     |            WHEN ISJSON(label) = 1 THEN JSON_VALUE(label, '$$.DatasetId')
     |            ELSE NULL
     |        END AS DatasetId,
     |        CASE
     |            WHEN ISJSON(label) = 1 THEN JSON_VALUE(label, '$$.Sources[0].ReportId')
     |            ELSE NULL
     |        END AS ReportId,
     |        CASE
     |            WHEN ISJSON(label) = 1 THEN JSON_VALUE(label, '$$.Sources[0].VisualId')
     |            ELSE NULL
     |        END AS VisualId, label
     |    from queryinsights.exec_requests_history
     |    where program_name in ('Core .Net SqlClient Data Provider','.Net SqlClient Data Provider','Framework Microsoft SqlClient Data Provider','PowerBIPremium-DirectQuery','Microsoft JDBC Driver for SQL Server')
     |    and start_time > '$sinceDate' and command not like '%sys.sp_set_session_context%' and status='Succeeded'
     |),
     |num as (
     |    select *, row_number() over (partition by DatasetId, ReportId order by $rankingColumn DESC) as rownum
     |    from $cteName
     |)
     |""".stripMargin

// One prelude per ranking metric. The CTE names (qry, qry2, ...) match the
// original hand-written queries; only the ORDER BY column differs between them.
val sqlprepare1 = topRankedPrepareQuery("qry", "total_elapsed_time_ms")
val sqlprepare2 = topRankedPrepareQuery("qry2", "data_scanned_memory_mb")
val sqlprepare3 = topRankedPrepareQuery("qry3", "data_scanned_remote_storage_mb")
val sqlprepare4 = topRankedPrepareQuery("qry4", "allocated_cpu_time_ms")
val sqlprepare5 = topRankedPrepareQuery("qry5", "row_count")

// Outer statement run on top of each prelude: keeps the rows of the `num` CTE
// whose rank (rownum) is below 21.
val mainquery = "select * from num where rownum <21"

/** A named prelude: `name` labels the result set, `query` holds the CTE text. */
case class Query(name: String, query: String)

// Result-set name -> prelude. The names are runtime values (the execution cell
// uses them as output folder names), so their spelling is left untouched.
val queries = Seq(
  "TopEllapsedTime" -> sqlprepare1,
  "TopScannedMemory" -> sqlprepare2,
  "TopRemoteStorage" -> sqlprepare3,
  "TopAllocatedCPU" -> sqlprepare4,
  "TopRowCount" -> sqlprepare5
).map { case (label, prelude) => Query(label, prelude) }

In [None]:
%%pyspark

# Short aliases for the lakehouse name and workspace ID used by this section.
lh_name = lakehouse_name
workspaceID = workspace_id

# Store both values in the Spark session configuration so the Scala cell that
# runs the queries can read them back with spark.conf.get.
for conf_key, conf_value in (("lh_name", lh_name), ("workspaceID", workspaceID)):
    spark.conf.set(conf_key, conf_value)

In [None]:
%%spark
import com.microsoft.spark.fabric.tds.implicits.read.FabricSparkTDSImplicits._
import com.microsoft.spark.fabric.Constants

// Values stored in the Spark session configuration by the preceding Python cell.
val lh_name = spark.conf.get("lh_name")
val workspaceID = spark.conf.get("workspaceID")

// Run every named query and write its result as parquet under
// <fullpath>/<query name>.
for (entry <- queries) {
  // Drop surrounding whitespace and, if present, one leading and one trailing
  // double-quote character from the prelude text.
  val prelude = entry.query.trim.stripPrefix("\"").stripSuffix("\"")
  println(s" Running Query: $prelude")

  // The prelude is handed over via the "prepareQuery" option and `mainquery`
  // is the statement passed to synapsesql.
  // NOTE(review): how the connector combines the two is not visible here - confirm.
  val reader = spark.read
    .option(Constants.WorkspaceId, workspaceID)
    .option(Constants.DatabaseName, lh_name)
    .option("prepareQuery", prelude)
  val topRows = reader.synapsesql(mainquery)

  val targetDir = s"$fullpath/${entry.name}"
  topRows.write.format("parquet").save(targetDir)
  println(s"✅ Saved to: $targetDir")
}


println("\n🎯 Process Completed.")
println(s"🔗 Workspace ID: $workspaceID")
println(s"🏠 Lakehouse Name: $lh_name")
