In [7]:
# Pinned to the version this notebook was executed with (see the install log below).
# %pip (rather than a bare/shell pip) installs into the environment of the running kernel.
%pip install xlsxwriter==3.2.0


Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0
Note: you may need to restart the kernel to use updated packages.


In [19]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm  # Import tqdm for the progress bar
import xlsxwriter

# Hard limits of the .xlsx worksheet format.
_EXCEL_MAX_ROWS = 1_048_576      # total rows per sheet, header row included
_EXCEL_MAX_COL_WIDTH = 255       # widest column Excel accepts, in characters
# Number format applied to datetime cells; without one Excel shows raw serial numbers.
_DEFAULT_DATE_FORMAT = 'yyyy-mm-dd'


def _read_sales_file(file_path):
    """Load one sales file into a DataFrame, picking the reader from the extension."""
    if file_path.lower().endswith('.csv'):
        return pd.read_csv(file_path)
    return pd.read_excel(file_path)


def _standardize_columns(df, header_mapping):
    """Return a new frame holding exactly the standardized columns, in mapping order.

    A file column belongs to the standardized header ``header_mapping.get(col, col)``,
    i.e. unmapped columns keep their own name (same rule ``DataFrame.rename`` applies).
    When several file columns resolve to the same standardized header they are merged
    left to right, each one only filling the cells still missing. Standardized headers
    with no matching file column are added as all-NaN columns.
    """
    # dict.fromkeys de-duplicates while preserving order, so many-to-one mappings
    # produce each standardized column exactly once.
    targets = list(dict.fromkeys(header_mapping.values()))

    standardized = {}
    for target in targets:
        sources = [col for col in df.columns if header_mapping.get(col, col) == target]
        if not sources:
            standardized[target] = np.nan
            continue
        merged = df[sources[0]]
        for extra in sources[1:]:
            merged = merged.combine_first(df[extra])
        standardized[target] = merged

    return pd.DataFrame(standardized, index=df.index, columns=targets)


def compile_sales_files(input_folder, output_file, header_mapping):
    """
    Compile multiple sales files into a single file with standardized headers.

    Every ``.csv`` / ``.xlsx`` file in ``input_folder`` is read (in sorted file-name
    order), reduced to the standardized columns, and the rows of all files are
    written to one worksheet named ``Sales`` in ``output_file``.

    Parameters:
        input_folder (str): Folder containing monthly sales files.
        output_file (str): Path to save the compiled output file.
        header_mapping (dict): Mapping of varying headers to standardized headers.
            Several source headers may map to the same standardized header.

    Raises:
        ValueError: If the folder contains no readable sales file, or if the
            combined data has more rows than a single Excel sheet can hold.
    """
    output_path = os.path.normcase(os.path.abspath(output_file))
    frames = []  # One standardized DataFrame per input file

    # Sorted so the row order of the output does not depend on the file system.
    for file in sorted(os.listdir(input_folder)):
        if not file.lower().endswith(('.csv', '.xlsx')):
            continue
        # Excel creates "~$<name>.xlsx" lock files while a workbook is open;
        # they are not valid workbooks and must not be read.
        if file.startswith('~$'):
            continue

        file_path = os.path.join(input_folder, file)
        if not os.path.isfile(file_path):
            continue
        # Never re-ingest a previous output that was saved inside the input folder.
        if os.path.normcase(os.path.abspath(file_path)) == output_path:
            continue

        df = _read_sales_file(file_path)
        df = _standardize_columns(df, header_mapping)

        # Replace infinite values with NaN, then blank out every missing value
        # so the cells are written as empty rather than as errors.
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna('')

        frames.append(df)
        print(f"Processed: {file}")

    if not frames:
        raise ValueError(f"No .csv or .xlsx sales files found in: {input_folder}")

    # Combine all dataframes into a single dataframe
    combined_df = pd.concat(frames, ignore_index=True)

    # xlsxwriter silently ignores writes past the last sheet row, which would
    # truncate the output without any error; fail loudly instead.
    if len(combined_df) + 1 > _EXCEL_MAX_ROWS:
        raise ValueError(
            f"Combined data has {len(combined_df)} rows, which exceeds the "
            f"Excel sheet limit of {_EXCEL_MAX_ROWS - 1} data rows."
        )

    # Save the combined data to an output file with progress
    print("\nSaving the output file with progress...")

    workbook_options = {
        'nan_inf_to_errors': True,
        'default_date_format': _DEFAULT_DATE_FORMAT,
    }
    # The context manager closes (and thereby saves) the workbook even if a write fails.
    with xlsxwriter.Workbook(output_file, workbook_options) as workbook:
        worksheet = workbook.add_worksheet('Sales')

        # Write headers
        for col_num, col_name in enumerate(combined_df.columns):
            worksheet.write(0, col_num, col_name)

        # Write data rows, one progress tick per row
        rows = combined_df.itertuples(index=False, name=None)
        progress = tqdm(rows, total=len(combined_df), desc="Writing rows", unit="row")
        for row_num, row in enumerate(progress, start=1):
            for col_num, value in enumerate(row):
                worksheet.write(row_num, col_num, value)

        # Autofit columns: widest cell text or header, plus a little padding
        for col_num, col_name in enumerate(combined_df.columns):
            lengths = combined_df[col_name].astype(str).map(len)
            # .max() of an empty column is NaN, so fall back to 0 for zero-row data.
            longest_value = int(lengths.max()) if len(lengths) else 0
            width = max(longest_value, len(str(col_name))) + 2
            worksheet.set_column(col_num, col_num, min(width, _EXCEL_MAX_COL_WIDTH))

    print(f"\nCompiled file saved at: {output_file}")


# Example usage: compile every monthly sales dump of one financial year into one workbook
if __name__ == "__main__":
    # Folder containing the sales files
    input_folder = r"D:\Shikha\Data\Baadshah_sales files\2024-2025"  # Replace with your folder path

    # Output file path
    output_file = r"D:\Shikha\Data\Baadshah_sales files\compiled_sales_data(2024-2025).xlsx"

    # Header mapping: key = header as it appears in a source file,
    # value = standardized header used in the compiled output.
    # Every entry currently maps a header to itself; add extra
    # "<variant header>": "<standardized header>" pairs for files that use other names.
    # Each key may appear only once - a repeated key silently overrides the earlier one.
    # NOTE(review): spellings such as "inlcuding" and "classifact name" presumably
    # mirror the headers in the source files exactly - confirm before correcting them.
    header_mapping = {
        "Billing Date":"Billing Date",
        "Terr Region":"Terr Region",
        "Region":"Region",
        "Rep Month":"Rep Month",
        "Zone":"Zone",
        "Area":"Area",
        "Customer":"Customer",
        "Primary SAP IDs inlcuding Secondary":"Primary SAP IDs inlcuding Secondary",
        "SAP ID TYPE":"SAP ID TYPE",
        "Active Or inactive":"Active Or inactive",
        "Account Name":"Account Name",
        "Account Group":"Account Group",
        "Account grp type":"Account grp type",
        "Classification":"Classification",
        "classifact name":"classifact name",
        "Bill Type":"Bill Type",
        "Material":"Material",
        "SKU":"SKU",
        "Truck Bias":"Truck Bias",
        "Truck Rdl":"Truck Rdl"
    }

    # Print example setup details
    print("--- Compiling Sales Files ---")
    print(f"Input Folder: {input_folder}")
    print(f"Output File: {output_file}\n")

    # Run the function
    compile_sales_files(input_folder, output_file, header_mapping)

--- Compiling Sales Files ---
Input Folder: D:\Shikha\Data\Baadshah_sales files\2024-2025
Output File: D:\Shikha\Data\Baadshah_sales files\compiled_sales_data(2024-2025).xlsx

Processed: Baadshah Sales Dump Sept'24.xlsx
Processed: Baadshah Sales Dump- April'24.xlsx
Processed: Baadshah Sales Dump- Aug'24.xlsx
Processed: Baadshah Sales Dump- July'24.xlsx
Processed: Baadshah Sales Dump- June'24.xlsx
Processed: Baadshah Sales Dump- May'24.xlsx
Processed: Sales Dump Nov'24.xlsx
Processed: Sales Dump Oct'24.xlsx

Saving the output file with progress...

Compiled file saved at: D:\Shikha\Data\Baadshah_sales files\compiled_sales_data(2024-2025).xlsx
