In [7]:
import pandas as pd

In [10]:
import pandas as pd

# Function to load and clean a dataset dynamically
def load_and_clean_data(file_path):
    data = pd.read_excel(file_path, header=4)
    data = data.dropna(how='all').reset_index(drop=True)
    
    # Handle dynamic column renaming based on actual data
    expected_columns = [
        "Date", "Order_Number", "PO_Number", "Cost_Centre_Number",
        "Cost_Centre_Name_Delivery_Address", "NaN1", "Product",
        "Description", "Qty", "Process_Charge", "Domestic_Charge",
        "International_Charge", "Total_Charge", "No_of_Parcels",
        "Weight", "Cubic", "Con_Number", "Delivery_Courier"
    ]
    
    if data.shape[1] == len(expected_columns):
        data.columns = expected_columns
    else:
        raise ValueError(f"Unexpected number of columns in {file_path}")
    
    data_cleaned = data.drop(columns=["NaN1", "Description", "Cost_Centre_Number"])
    numeric_columns = ["Weight", "Cubic", "Process_Charge", "Domestic_Charge", "International_Charge", "Total_Charge", "No_of_Parcels"]
    for column in numeric_columns:
        data_cleaned[column] = pd.to_numeric(data_cleaned[column], errors='coerce').fillna(0)
    return data_cleaned

# Load all datasets
file_paths = [
    '30247 January 2024.xlsx', '30404-February 2024.xlsx', '30536 - March 2024.xlsx',
    '30663- April 2024.xlsx', '30789 - May 2024.xlsx', '30910 - June 2024.xlsx'
]

all_data = pd.concat([load_and_clean_data(fp) for fp in file_paths])

# Sort by date
all_data['Date'] = pd.to_datetime(all_data['Date'], errors='coerce')
all_data = all_data.sort_values(by='Date').reset_index(drop=True)

# Function to concatenate matching order numbers and preserve all metadata correctly
def concatenate_and_preserve_all_metadata(df):
    result = []
    seen = set()
    
    for idx, row in df.iterrows():
        order_num = row['Order_Number']
        if order_num not in seen:
            matched_rows = df[df['Order_Number'] == order_num]
            products = ', '.join(matched_rows['Product'].dropna().unique())
            total_qty = matched_rows['Qty'].sum()
            
            if not matched_rows.empty:
                if 'DIG OPTIONS' in matched_rows['Cost_Centre_Name_Delivery_Address'].values:
                    cost_centre_address = matched_rows['Cost_Centre_Name_Delivery_Address'].iloc[-1]
                else:
                    cost_centre_address = matched_rows['Cost_Centre_Name_Delivery_Address'].iloc[0]
            else:
                cost_centre_address = None
            
            new_row = row.copy()
            new_row['Product'] = products
            new_row['Qty'] = total_qty
            new_row['Cost_Centre_Name_Delivery_Address'] = cost_centre_address
            
            try:
                new_row['Date'] = matched_rows['Date'].dropna().iloc[0] if not matched_rows['Date'].dropna().empty else None
                new_row['Weight'] = matched_rows['Weight'].sum()
                new_row['Cubic'] = matched_rows['Cubic'].sum()
                new_row['Con_Number'] = matched_rows['Con_Number'].dropna().iloc[0] if not matched_rows['Con_Number'].dropna().empty else None
                new_row['Delivery_Courier'] = matched_rows['Delivery_Courier'].dropna().iloc[0] if not matched_rows['Delivery_Courier'].dropna().empty else None
                new_row['Total_Charge'] = matched_rows['Total_Charge'].sum()
                new_row['Process_Charge'] = matched_rows['Process_Charge'].sum()
                new_row['Domestic_Charge'] = matched_rows['Domestic_Charge'].sum()
                new_row['International_Charge'] = matched_rows['International_Charge'].sum()
                new_row['No_of_Parcels'] = matched_rows['No_of_Parcels'].sum()
            except Exception as e:
                print(f"Error processing order number {order_num}: {e}")
                continue
            
            result.append(new_row)
            seen.add(order_num)
    
    return pd.DataFrame(result)

# Apply the function to concatenate matching orders and preserve all metadata
final_data = concatenate_and_preserve_all_metadata(all_data)

# Drop any remaining rows with invalid data in key columns
final_data = final_data.dropna(subset=['Order_Number', 'Cost_Centre_Name_Delivery_Address', 'Product', 'Qty'])

# Save the final combined and cleaned data to an Excel file
final_data.to_excel('Combined_Cleaned_Data.xlsx', index=False)

print("Data successfully combined and written to 'Combined_Cleaned_Data.xlsx'")


ValueError: Unexpected number of columns in 30663- April 2024.xlsx

In [5]:

print("Data successfully combined and written to 'Combined_Cleaned_Data.xlsx'")
