In [None]:
# Label for this snapshot; it becomes a sub-folder name when the results are saved.
beforeAfter = "before"

# Run-specific output folder name. Left empty here, in which case the next cell
# replaces it with a generated identifier.
distinct_folder = ""

In [None]:
import uuid
import re

# Random identifier for this run, in its textual UUID form.
guid = str(uuid.uuid4())

# Keep a caller-supplied folder name; only when it is the empty string fall back
# to the identifier with every non-alphanumeric character stripped out.
distinct_folder = (
    re.sub(r'[^a-zA-Z0-9]', '', guid) if distinct_folder == "" else distinct_folder
)

In [None]:
import os
from datetime import datetime

# Walk the "Tables" folder of the mounted lakehouse, count the files in every
# directory of every table, and expose the result as the Spark DataFrame `df`.
if lakehouse_path:
    try:
        print(f"🔍 Analyzing folder structure for lakehouse: {lakehouse_name}")

        # os.listdir / os.walk need a plain filesystem path, so drop the URI scheme.
        local_path = lakehouse_path.replace("file://", "")

        base_path = os.path.join(local_path, "Tables")

        def analyze_folder_structure(base_path):
            """Count the files in every directory of every table under base_path.

            Args:
                base_path: Directory whose immediate sub-directories are
                    treated as tables.

            Returns:
                A list of dicts with the keys "table", "partition",
                "num_files" and "partition_level". Each table contributes one
                row per directory visited by os.walk (the table root appears
                as partition "."), followed by one row whose partition is
                "TOTAL" holding the table's file count with directories whose
                path contains 'delta_log' left out.

            Raises:
                OSError: If base_path cannot be listed.
            """
            result = []
            for table in os.listdir(base_path):
                table_path = os.path.join(base_path, table)
                if not os.path.isdir(table_path):
                    # Loose files directly under base_path are not tables.
                    continue

                total_files = 0
                for root, _dirs, files in os.walk(table_path):
                    path_parts = os.path.relpath(root, table_path).split(os.sep)
                    partition_name = "/".join(path_parts)
                    num_files = len(files)
                    result.append({
                        "table": table,
                        "partition": partition_name,
                        "num_files": num_files,
                        # Stored as text in every row: the TOTAL rows use "N/A",
                        # and a column mixing int and str values makes Spark's
                        # schema inference version-dependent (older PySpark
                        # releases refuse to merge the two types).
                        # NOTE(review): confirm no consumer needs a numeric type.
                        "partition_level": str(len(path_parts)),
                    })
                    # Transaction-log directories are listed above but do not
                    # count towards the table's file total.
                    if 'delta_log' not in partition_name:
                        total_files += num_files

                result.append({
                    "table": table,
                    "partition": "TOTAL",
                    "num_files": total_files,
                    "partition_level": "N/A",
                })

            return result

        analysis_result = analyze_folder_structure(base_path)

        if analysis_result:
            df = spark.createDataFrame(analysis_result)
            print("✅ Analysis complete! Displaying results:")
            df.show(truncate=False)

            print(f"\n📈 Summary for lakehouse '{lakehouse_name}':")
            tables_analyzed = len(
                {item['table'] for item in analysis_result if item['partition'] != 'TOTAL'}
            )
            # Sum of the per-table TOTAL rows, i.e. a file count.
            total_files_all_tables = sum(
                item['num_files'] for item in analysis_result if item['partition'] == 'TOTAL'
            )
            print(f"  - Source lakehouse: {lakehouse_name}")
            print(f"  - Tables analyzed: {tables_analyzed}")
            print(f"  - Total files across all tables: {total_files_all_tables}")
        else:
            # Without this the cell ends silently and `df` is never created.
            print(f"⚠️ No table folders found under: {base_path}")

    except Exception as e:
        # Notebook-level boundary: report the problem and let later cells decide
        # what to do when `df` is missing.
        print(f"❌ Error analyzing folder structure: {str(e)}")
else:
    print("❌ Cannot analyze folder structure - lakehouse not mounted")

In [None]:
from pyspark.sql.functions import col, lit, current_timestamp
from datetime import datetime
import uuid

# Persist the analysis only when an earlier cell produced `df` and a lakehouse
# path is set; otherwise this cell does nothing.
if 'df' in locals() and df is not None and lakehouse_path:
    print("💾 Preparing to save analysis results to the default lakehouse...")

    maintenance_folder = "Files/Maintenance/TablesMaintenance/"
    current_date = datetime.now().strftime('%Y-%m-%d')
    maintenance_date = datetime.now().strftime('%Y%m%d')
    maintenance_date_folder = f"maintenancedate_{current_date}"

    # Reduced view: per-table TOTAL rows, table-root rows, and any row whose
    # partition path mentions 'delta'.
    partition_col = col("partition")
    keep_row = (
        (partition_col == 'TOTAL')
        | (partition_col == '.')
        | (partition_col.contains('delta'))
    )
    df2 = df.filter(keep_row)

    # Both outputs live under the same date / run / snapshot folder.
    full_analysis_path = f"{maintenance_folder}{maintenance_date_folder}/TableAnalysis/{distinct_folder}/{beforeAfter}/tablesFullanalysis/"
    tables_size_path = f"{maintenance_folder}{maintenance_date_folder}/TableAnalysis/{distinct_folder}/{beforeAfter}/tablesSize/"

    df.write.format('parquet').save(full_analysis_path)
    df2.write.format('parquet').save(tables_size_path)

    # Hand the run folder name back as the notebook's exit value.
    mssparkutils.notebook.exit(distinct_folder)

