In [3]:
import pandas as pd
import glob


In [42]:
def process_and_concat_sheets(file_path,monthname):
    # Load the Excel file
    all_data = []
    try:
        # Read the sheet
        file = pd.read_excel(file_path, sheet_name='Report mpay (success)')
        file.columns = file.iloc[0]
        file = file.iloc[1:].reset_index(drop=True)
        # print('file',file)
        file['Loan No.'] = file.iloc[:, 10]  # 11th column (index 10)
        file['Payment'] = file.iloc[:, 5]   # 6th column (index 5)
        file['Date'] = file.iloc[:, 3]
       

        # Convert data types
        file['Loan No.'] = file['Loan No.'].astype(str)
        file['Date'] = file['Date'].astype(str)
        # Add 'Type' column
        file['Type'] = 'Mpay'
        file['Month'] = monthname

        # Keep only relevant columns
        file = file[['Loan No.', 'Payment', 'Type', 'Date','Month']]
        # Append to list
        all_data.append(file)
        # print(f'Successfully processed: {sheet_name}')

    except Exception as e:
        print(f"Skipping sheet {file_path}  due to error: {e}")
    
    # Concatenate all DataFrames if there are any valid ones
    if all_data:
        concatenated_data = pd.concat(all_data, ignore_index=True)
    else:
        concatenated_data = pd.DataFrame(columns=['Loan No.', 'Payment', 'Type', 'Date','Month'])

    return concatenated_data


In [43]:

file_list = glob.glob(f"./file/M-pay-12-2024/*")
# file_list = glob.glob(f"./file/*")


# List to store concatenated data from each file
all_files_data = []

# Loop through each file and process it
for file_path in file_list:
    print(f"Processing file: {file_path}")
    
    # Process the sheets in each file and concatenate the results
    file_data = process_and_concat_sheets(file_path,'Dec 24')
    
    # Append the data to the list
    all_files_data.append(file_data)

# Concatenate all file data into a single DataFrame
final_data = pd.concat(all_files_data, ignore_index=True)
final_data =final_data.dropna(subset=['Payment'])
final_data = final_data.loc[final_data['Loan No.'] != 'nan']
print(final_data)

Processing file: ./file/M-pay-12-2024\Report Mpay 01-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 02-03-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 03-04-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 04-05-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 05-06-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 06-09-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 09-11-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 11-12-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 12-13-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 13-16-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 16-17-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 17-18-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 18-19-12-2024.xlsx
Processing file: ./file/M-pay-12-2024\Report Mpay 19-20-12-2024.xlsx
Processing file: ./file/M-pay-12-2024

In [44]:
final_data

Unnamed: 0,Loan No.,Payment,Type,Date,Month
14,Ref no.1,Product Amount (Baht),Mpay,Payment Date/Time,Dec 24
15,7240870103502870,1059.51,Mpay,2024-12-01 00:00:01,Dec 24
16,7240970103693882,616.8,Mpay,2024-12-01 00:00:14,Dec 24
17,7240470102488705,322.07,Mpay,2024-12-01 00:00:17,Dec 24
18,7240570102758669,706.17,Mpay,2024-12-01 00:00:29,Dec 24
...,...,...,...,...,...
412771,7240970103780051,925.23,Mpay,2024-12-31 23:58:09,Dec 24
412772,7240870103493488,925.23,Mpay,2024-12-31 23:58:55,Dec 24
412773,7240470102630880,706.17,Mpay,2024-12-31 23:59:08,Dec 24
412774,7240370102389882,187.24,Mpay,2024-12-31 23:59:46,Dec 24


In [45]:
final_data.to_excel('./output/mpay1224.xlsx')


In [9]:
def split_and_save_data(data, output_file):
    # Split data into chunks of 500,000 rows if needed
    chunk_size = 500000
    num_chunks = (len(data) // chunk_size) + (1 if len(data) % chunk_size > 0 else 0)
    
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        # Write each chunk to a separate sheet
        for chunk_num in range(num_chunks):
            start_row = chunk_num * chunk_size
            end_row = min((chunk_num + 1) * chunk_size, len(data))
            chunk = data.iloc[start_row:end_row]
            
            # Save the chunk to a new sheet
            chunk.to_excel(writer, sheet_name=f'Sheet_{chunk_num + 1}', index=False)
            print(f"Saved Sheet_{chunk_num + 1} with rows {start_row} to {end_row}")

In [21]:
output_file = './output/0225.xlsx'
if len(final_data) > 500000:
        print(f"Total rows {len(final_data)} exceed 500,000. Splitting into multiple sheets.")
        split_and_save_data(final_data, output_file)
else:
    # If it's less than 500,000 rows, save it as one sheet
    final_data.to_excel(output_file, index=False, sheet_name='Sheet_1')
    print(f"Saved all data to {output_file}.")

Total rows 1297134 exceed 500,000. Splitting into multiple sheets.
Saved Sheet_1 with rows 0 to 500000
Saved Sheet_2 with rows 500000 to 1000000
Saved Sheet_3 with rows 1000000 to 1297134
