In [1]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import re
import numpy as np

In [None]:
# Rate card workbook; calculate_fee indexes its values by (weight - 1, zone - 1).
rate = pd.read_excel(
    r"path\to\rate_card.xlsx",
    engine='openpyxl',
)

# Directory the notebook was started from, captured once for the later cells.
current_dir = os.getcwd()

# Empty placeholder frame (not referenced by the code visible in this notebook).
group_data = pd.DataFrame()

# FedEx-O warehouse names are looked up from the shipping zip code.
orsd_location_df = pd.read_excel(
    r"path\to\warehouse_zip.xlsx",
    sheet_name="邮编",
)
zip_to_location_dict = {
    zip_code: location
    for zip_code, location in zip(
        orsd_location_df['发货邮编'],
        orsd_location_df['Warehouse Location'],
    )
}

In [None]:
def get_files_in_folder(folder_path):
    """Return the paths of the Excel workbooks directly inside ``folder_path``.

    A directory entry is included when it is a regular file whose name ends in
    ``.xlsx`` or ``.xls`` (compared case-insensitively). Names starting with
    ``~$`` are skipped: Excel creates those lock files next to a workbook that
    is currently open, and they are not readable workbooks.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        ``folder_path`` joined with each matching file name, sorted by name so
        the result does not depend on the order ``os.listdir`` happens to use.
    """
    excel_suffixes = ('.xlsx', '.xls')
    files = []
    for name in sorted(os.listdir(folder_path)):
        if name.startswith('~$'):
            # Lock file left by an open Excel session, not a real workbook.
            continue
        if not name.lower().endswith(excel_suffixes):
            continue
        path = os.path.join(folder_path, name)
        # A directory may also be named "something.xlsx"; keep files only.
        if os.path.isfile(path):
            files.append(path)
    return files

def count_sheets_in_file(file_path):
    """Return the number of sheets in the workbook at ``file_path``.

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook to inspect.

    Returns
    -------
    int
        Number of sheets, or ``0`` when the workbook cannot be opened (the
        error is printed rather than raised).
    """
    try:
        # The context manager closes the workbook's file handle as soon as the
        # sheet names have been read, instead of leaving it open.
        with pd.ExcelFile(file_path) as excel_file:
            return len(excel_file.sheet_names)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return 0

        
def remove_special_characters(text):
    """Return ``text`` reduced to CJK ideographs and regex word characters.

    Surrounding whitespace is trimmed, newlines / carriage returns / tabs are
    removed, and then every run of characters that is neither in the range
    U+4E00-U+9FA5 nor matched by ``\\w`` is deleted. Because ``\\w`` is used,
    the underscore is kept along with letters and digits.
    """
    disallowed = re.compile(r'[^\u4e00-\u9fa5\w]+')
    line_breaks_and_tabs = re.compile(r"[\n\r\t]")

    flattened = line_breaks_and_tabs.sub("", text.strip())
    return disallowed.sub('', flattened)

def convert_waybill_number(value):
    """Return ``value`` as text followed by exactly one trailing tab.

    Per the original note, the trailing ``'\\t'`` is what keeps the number
    from being rendered in scientific notation once it is stored as text.

    Parameters
    ----------
    value : int, float, str or other
        Raw waybill number. ``int``/``float`` values are converted through
        ``int`` so ``1.23e11`` becomes ``'123000000000'``; anything else goes
        through ``str``.

    Returns
    -------
    str
        The textual waybill number with a single ``'\\t'`` appended.

    Notes
    -----
    * The function is idempotent: a value that already ends in tab(s), for
      example one read back from a CSV written earlier, is not given a second
      tab.
    * Floats without an integer form (NaN, infinity) fall back to ``str``
      instead of raising.
    """
    if isinstance(value, (int, float)):
        try:
            text = str(int(value))
        except (ValueError, OverflowError):
            # int() rejects NaN (ValueError) and infinity (OverflowError).
            text = str(value)
    else:
        # Drop tabs added by an earlier pass so they do not accumulate.
        text = str(value).rstrip('\t')
    return text + '\t'

    
def read_and_append(file_path):
    """Read one Excel workbook and normalise its 'Waybill Number' column.

    Missing values are replaced with ``0`` and every waybill number is passed
    through ``convert_waybill_number``.

    Parameters
    ----------
    file_path : str
        Path of the workbook to read.

    Returns
    -------
    pandas.DataFrame
        The processed frame, or an empty frame when reading or processing
        fails (the error is printed rather than raised).
    """
    try:
        # openpyxl reads the .xlsx format only. For any other extension (the
        # folder scan also collects legacy .xls files) leave the engine unset
        # so pandas selects a reader that matches the file.
        engine = 'openpyxl' if str(file_path).lower().endswith('.xlsx') else None
        df = pd.read_excel(file_path, engine=engine)
        df = df.fillna(0)
        df['Waybill Number'] = df['Waybill Number'].apply(convert_waybill_number)
        return df
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return pd.DataFrame()


def calculate_fee(df):
    """Look up the rate-card fee for every row of ``df``.

    The fee for a row is ``rate.values[weight - 1, zone - 1]`` where ``weight``
    is the row's '计费重量(lbs)' and ``zone`` its 'Zone'; ``rate`` is the
    module-level rate card frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain integer columns '计费重量(lbs)' and 'Zone'.

    Returns
    -------
    numpy.ndarray
        One float per row. Rows whose weight is not in ``1..150``, whose zone
        is below 1, or whose weight/zone lies outside the rate card's shape
        get ``NaN``.

    Notes
    -----
    Only the rows that pass the checks are used to index the rate card.
    Indexing every row and masking afterwards would raise ``IndexError`` for an
    over-limit weight or zone, and a weight or zone of 0 would silently wrap to
    the last row or column.
    """
    weight = df['计费重量(lbs)'].to_numpy()
    zone = df['Zone'].to_numpy()
    table = rate.values
    n_rows, n_cols = table.shape

    in_card = (
        (weight >= 1) & (weight <= min(150, n_rows))
        & (zone >= 1) & (zone <= n_cols)
    )

    fee = np.full(len(df), np.nan, dtype=float)
    if in_card.any():
        fee[in_card] = table[weight[in_card] - 1, zone[in_card] - 1]
    return fee

def merge_files_to_sheet(current_dir, folder_path):
    """Merge every Excel workbook in ``folder_path`` into ``<folder name>.csv``.

    Each folder contains invoices for a month from a specific carrier. The
    workbooks are read in parallel, stacked into one frame and written to
    ``current_dir`` as a UTF-8 (with BOM) CSV named after the folder.

    For the folder named 'FedexO' extra columns are derived first:

    * '计费重量(lbs)' is rounded up to a whole number and 'Zone' cast to int.
    * '基础运费' is the rate-card fee from ``calculate_fee``; where that is
      NaN it falls back to '运费'.
    * '应收金额' is '基础运费' + '总费用' - '运费', computed before the fallback
      above is applied; where that is NaN it falls back to '总费用'.
    * 'Warehouse Location' is mapped from '发货邮编' via
      ``zip_to_location_dict``.

    Parameters
    ----------
    current_dir : str
        Directory that receives the summary CSV.
    folder_path : str
        Folder holding the workbooks; its base name names the CSV.

    Returns
    -------
    None
    """
    folder_name = os.path.basename(folder_path)
    summary_file = os.path.join(current_dir, f"{folder_name}.csv")

    files = get_files_in_folder(folder_path)

    # Use multithreading to reduce runtime. executor.map yields results in the
    # order the files were submitted.
    with ThreadPoolExecutor(max_workers=4) as executor:
        frames = [
            frame
            for frame in executor.map(read_and_append, files)
            if not frame.empty
        ]

    # Concatenate once (not once per file) and renumber the rows so the merged
    # frame has unique index labels for the aligned operations below.
    merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # With no readable data there are no columns to derive from.
    if folder_name == 'FedexO' and not merged_df.empty:
        merged_df['计费重量(lbs)'] = np.ceil(merged_df['计费重量(lbs)']).astype(int)
        merged_df['Zone'] = merged_df['Zone'].astype(int)
        merged_df['基础运费'] = calculate_fee(merged_df).astype(float)
        merged_df['应收金额'] = (
            merged_df['基础运费']
            + merged_df['总费用'].astype(float)
            - merged_df['运费'].astype(float)
        )
        # Assign the filled column back: an in-place fillna on a column
        # selected from the frame does not update the frame under pandas
        # Copy-on-Write.
        merged_df['基础运费'] = merged_df['基础运费'].fillna(merged_df['运费'])
        merged_df['应收金额'] = merged_df['应收金额'].fillna(merged_df['总费用'])
        merged_df['Warehouse Location'] = merged_df['发货邮编'].map(zip_to_location_dict)

    merged_df.to_csv(summary_file, index=False, encoding='utf-8-sig')




def split_files_by_customer_name(current_dir, folder_path, output_dir=r'path\to\output_folder'):
    """Split the merged CSV of a folder into one CSV per customer.

    Reads ``<current_dir>/<folder name>.csv``, re-applies
    ``convert_waybill_number`` to 'Waybill Number', and writes each customer's
    rows to ``<output_dir>/<cleaned name>/<cleaned name>_<folder name>.csv``
    (UTF-8 with BOM), where the cleaned name is the 'Customer Name' passed
    through ``remove_special_characters``.

    Parameters
    ----------
    current_dir : str
        Directory containing the merged CSV.
    folder_path : str
        Invoice folder; only its base name is used.
    output_dir : str, optional
        Root directory for the per-customer folders. Defaults to the path the
        notebook previously hard-coded.

    Returns
    -------
    None

    Notes
    -----
    * Rows without a customer name are printed and left out; they no longer
      reach the name-cleaning step, which cannot handle a missing value.
    * Names are converted with ``str`` before cleaning, so numeric names work.
    * Rows are grouped by the *cleaned* name. Two raw names that clean to the
      same text share one output file instead of the later one overwriting the
      earlier one.
    * An empty merged file (nothing to parse) is reported and skipped.
    """
    folder_name = os.path.basename(folder_path)
    merged_file = os.path.join(current_dir, f"{folder_name}.csv")
    print(merged_file)

    try:
        df = pd.read_csv(merged_file)
    except pd.errors.EmptyDataError:
        print(f"No data in {merged_file}; nothing to split.")
        return

    df['Waybill Number'] = df['Waybill Number'].apply(convert_waybill_number)

    missing_name = df['Customer Name'].isna()
    if missing_name.any():
        # Show the rows that cannot be attributed to a customer, then skip them.
        print(df[missing_name])
        df = df[~missing_name]

    cleaned_names = (
        df['Customer Name']
        .map(lambda name: remove_special_characters(str(name)))
        .rename('cleaned_name')
    )

    # sort=False keeps customers in order of first appearance.
    for customer_name_cleaned, customer_df in df.groupby(cleaned_names, sort=False):
        # Construct the path to the output folder for the customer using the cleaned customer name
        customer_folder = os.path.join(output_dir, customer_name_cleaned)
        # Create the output folder if it doesn't already exist
        os.makedirs(customer_folder, exist_ok=True)

        # Create the filename for the customer's CSV file using the cleaned customer name and folder name
        customer_file = f"{customer_name_cleaned}_{folder_name}.csv"
        customer_path = os.path.join(customer_folder, customer_file)
        customer_df.to_csv(customer_path, index=False, encoding='utf-8-sig')

In [None]:
# Every visible sub-directory of the working directory is treated as one
# invoice folder. Hidden directories (names starting with '.', such as
# Jupyter's .ipynb_checkpoints) are not invoice folders and are skipped.
# Sorting makes the processing order reproducible.
folders = sorted(
    folder
    for folder in os.listdir()
    if os.path.isdir(folder) and not folder.startswith('.')
)


for folder in folders:
    folder_path = os.path.join(os.getcwd(), folder)
    print(folder)
    # Optional diagnostic: uncomment to print each workbook and its sheet count.
    # files = get_files_in_folder(folder_path)
    # for file in files:
    #     sheet_count = count_sheets_in_file(file)
    #     print(file)
    #     print(sheet_count)
    merge_files_to_sheet(current_dir, folder_path)
    split_files_by_customer_name(current_dir, folder_path)


In [None]:
# Only running a single folder

# folder_path = (r"path\to\single_folder")
# # files = get_files_in_folder(folder_path)
# # for file in files:
# #     print(file)
# #     print(count_sheets_in_file(file))
# merge_files_to_sheet(current_dir, folder_path)
# split_files_by_customer_name(current_dir, folder_path)