In [1]:
import os
import pandas as pd

In [2]:
def list_files_and_folders(root_folder):
    result = []

    # Walk through the directory structure
    for folder_name, subfolders, filenames in os.walk(root_folder):
        # Get the relative path of the folder
        relative_path = os.path.relpath(folder_name, root_folder)
        # Split the relative path into individual folder components
        path_components = relative_path.split(os.sep)

        for filename in filenames:
            # Create a list with folder names and the file name
            file_info = path_components + [filename]
            
            # Check if the file is an Excel file
            if filename.endswith(('.xlsx', '.xls')):
                file_path = os.path.join(folder_name, filename)
                try:
                    # Read the Excel file and count the sheets
                    excel_file = pd.ExcelFile(file_path)
                    num_sheets = len(excel_file.sheet_names)

                    # Count the total number of rows across all sheets
                    total_rows = 0
                    for sheet_name in excel_file.sheet_names:
                        df = pd.read_excel(excel_file, sheet_name=sheet_name)
                        total_rows += len(df)

                except Exception as e:
                    num_sheets = f"Error: {e}"
                    total_rows = f"Error: {e}"

                # Append the number of sheets and total rows to the file info
                file_info.extend([num_sheets, total_rows])
            else:
                # Append placeholders if the file is not an Excel file
                file_info.extend([None, None])
            
            result.append(file_info)

    return result

def create_dataframe(root_folder):
    files_and_folders = list_files_and_folders(root_folder)

    # Create a DataFrame from the list of files and folders
    df = pd.DataFrame(files_and_folders)

    # Rename columns to indicate the folder levels, file name, number of sheets, and total rows
    num_columns = df.shape[1]
    column_names = [f'Level {i+1}' for i in range(num_columns - 3)] + ['File Name', 'Number of Sheets', 'Total Rows']
    df.columns = column_names

    return df

In [3]:
# Path to the root folder
root_folder = '../DataLHR'

# Create the DataFrame
df = create_dataframe(root_folder)

# Display the DataFrame
print(df)


             Level 1                          Level 2  \
0      Data Losarang                             2024   
1      Data Losarang                             2024   
2      Data Losarang                             2024   
3      Data Losarang                             2024   
4      Data Losarang                             2023   
..               ...                              ...   
252  Data PLATO 2019                         Balai 08   
253  Data PLATO 2019                         Balai 08   
254  Data PLATO 2019                         Balai 08   
255  Data PLATO 2019  Balai Papua Barat (Bukan Plato)   
256  Data PLATO 2019  Balai Papua Barat (Bukan Plato)   

                                  Level 3                       Level 4  \
0                      04 april_2024.xlsx                            30   
1                    01 januari_2024.xlsx                            31   
2                   02 februari_2024.xlsx                            29   
3              

In [4]:
df.head(30)

Unnamed: 0,Level 1,Level 2,Level 3,Level 4,File Name,Number of Sheets,Total Rows
0,Data Losarang,2024,04 april_2024.xlsx,30,2880,,
1,Data Losarang,2024,01 januari_2024.xlsx,31,2974,,
2,Data Losarang,2024,02 februari_2024.xlsx,29,2784,,
3,Data Losarang,2024,03 maret_2024.xlsx,31,2976,,
4,Data Losarang,2023,03 Maret.xlsx,14,1435,,
5,Data Losarang,2023,07 Juli.xlsx,31,3321,,
6,Data Losarang,2023,09 September.xlsx,30,2697,,
7,Data Losarang,2023,08 Agustus.xlsx,31,3296,,
8,Data Losarang,2023,10 Oktober.xlsx,31,2084,,
9,Data Losarang,2023,05 Mei.xlsx,30,2943,,


In [5]:
# determining the name of the file
output_file = '../Preprocessing/datafiles.xlsx'

# saving the excel
df.to_excel(output_file, index=False)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.
