# Clean Old Files

The main goal of this notebook is to clean the old files in the Protheus raw data lakehouse, following these rules:

- Files from the current month will not be deleted.
- For all previous months, all files will be deleted except for those from the last day of each month.

In other words, we are retaining all daily copies from the current month and also retaining the files from the last day of each previous month.

In [1]:
import json
from datetime import datetime

StatementMeta(, 02c43558-0de8-441e-b7d3-2a0dc17c74c6, 3, Finished, Available)

In [2]:
# Collect the `name` attribute of every `fileInfo` entry, keeping the input order.
def extract_names(file_info_list):
    """Return a list with the ``name`` of each entry in ``file_info_list``.

    Args:
        file_info_list: iterable of objects exposing a ``name`` attribute
            (here, the entries returned by ``mssparkutils.fs.ls``).

    Returns:
        list: the ``name`` values, in the same order as the input.
    """
    return [entry.name for entry in file_info_list]


StatementMeta(, 02c43558-0de8-441e-b7d3-2a0dc17c74c6, 4, Finished, Available)

In [3]:
# Main logic of this notebook: build the list of lakehouse paths that should be deleted.
#
# The lakehouse is walked as {root}{table}/{year}/{month}/{day}. Months that are not
# strictly before the current calendar month are left untouched; in every earlier
# month all day entries except the last one are selected for deletion.

def _numeric_value(name):
    """Return the integer value of an all-digit folder name, or None otherwise."""
    text = str(name).strip('/')
    if text.isascii() and text.isdigit():
        return int(text)
    return None


def _natural_sorted(names):
    """Sort folder names with numeric names ordered by value, so '2' sorts before '10'."""
    def sort_key(name):
        value = _numeric_value(name)
        if value is None:
            return (1, 0, str(name))
        return (0, value, str(name))
    return sorted(names, key=sort_key)


def _months_to_clean(year, months, current_period):
    """Pick the month folders of `year` that lie strictly before `current_period`."""
    year_value = _numeric_value(year)
    month_values = [_numeric_value(month) for month in months]
    if year_value is None or None in month_values:
        # The names cannot be mapped onto the calendar, so fall back to the
        # conservative positional rule: leave the last listed month untouched.
        return months[:-1]
    return [
        month
        for month, month_value in zip(months, month_values)
        if (year_value, month_value) < current_period
    ]


def generate_files_path(root='Files/', today=None, fs=None):
    """Return the paths of all day entries that the retention rule allows deleting.

    Retention rule:
      * entries in the current month (or any later month) are never selected;
      * in every earlier month, every day entry except the last one is selected.

    The current month is identified by comparing the numeric year/month folder
    names with `today`, not by taking the last folder of each year. A positional
    rule would also protect the final month of every past year and would depend
    on the order in which the listing happens to be returned.

    Args:
        root: base folder that contains one sub-folder per table.
        today: object with `year` and `month` attributes used as the reference
            date; defaults to `datetime.now()`.
            NOTE(review): this is the clock of the Spark session - confirm its
            time zone matches the one used when the folders are written.
        fs: object exposing `ls(path)` whose entries have a `name` attribute;
            defaults to `mssparkutils.fs`.

    Returns:
        list[str]: paths of the form `{root}{table}/{year}/{month}/{day}`.
    """
    if fs is None:
        fs = mssparkutils.fs
    if today is None:
        today = datetime.now()
    current_period = (today.year, today.month)
    if not root.endswith('/'):
        root += '/'

    def list_names(path):
        return [info.name for info in fs.ls(path)]

    paths_to_be_deleted = []
    for table in list_names(root):
        table_path = f'{root}{table}/'
        for year in list_names(table_path):
            year_path = f'{table_path}{year}/'
            months = _natural_sorted(list_names(year_path))
            for month in _months_to_clean(year, months, current_period):
                month_path = f'{year_path}{month}/'
                days = _natural_sorted(list_names(month_path))
                # The last day entry of the month is the copy that is retained.
                for day in days[:-1]:
                    paths_to_be_deleted.append(f'{month_path}{day}')

    return paths_to_be_deleted



StatementMeta(, 02c43558-0de8-441e-b7d3-2a0dc17c74c6, 5, Finished, Available)

In [4]:
# Recursively delete every lakehouse path in a list and report which ones were removed.
def delete_files_from_list(paths_to_be_deleted, fs=None):
    """Delete each path in `paths_to_be_deleted` and return the removed paths.

    Args:
        paths_to_be_deleted: iterable of lakehouse paths to remove.
        fs: object exposing `rm(path, recurse)`; defaults to `mssparkutils.fs`.

    Returns:
        list: the paths whose removal did not report failure, in input order,
        so the caller has something meaningful to log.
    """
    if fs is None:
        fs = mssparkutils.fs

    deleted = []
    for path in paths_to_be_deleted:
        # The second argument requests recursive removal of the whole day entry.
        removed = fs.rm(str(path), True)
        # rm is documented to return a Boolean; only an explicit False is
        # treated as "not removed".
        if removed is not False:
            deleted.append(path)
    return deleted

StatementMeta(, 02c43558-0de8-441e-b7d3-2a0dc17c74c6, 6, Finished, Available)

In [5]:
# Build the list of paths selected for deletion and display it as the cell output,
# so it can be reviewed before the deletion cell below is run.
paths_to_be_deleted = generate_files_path()
paths_to_be_deleted

StatementMeta(, 02c43558-0de8-441e-b7d3-2a0dc17c74c6, 7, Finished, Available)

[]

In [6]:
# Remove every selected path and keep whatever delete_files_from_list returns
# so it can be included in the run log built in the final cell.
log_deleted = delete_files_from_list(paths_to_be_deleted)

StatementMeta(, 02c43558-0de8-441e-b7d3-2a0dc17c74c6, 8, Finished, Available)

In [10]:
# Summarise the run and hand the summary back to the caller as the notebook exit value.
log = [paths_to_be_deleted, log_deleted]
output = {}
output['log'] = f'Deleted tables in ({log})'
print(output)
# notebook.exit is documented to take a string value, so the dict is serialised
# to JSON instead of being passed as an object.
# NOTE(review): the recorded run of this cell ended with KeyError: 'ExitValue'
# when the dict itself was passed - confirm the string form resolves it.
mssparkutils.notebook.exit(json.dumps(output))

StatementMeta(, 02c43558-0de8-441e-b7d3-2a0dc17c74c6, 12, Finished, Available)

KeyError: 'ExitValue'