In [18]:
import pandas as pd

# Function to load and clean a dataset dynamically
def load_and_clean_data(file_path):
    data = pd.read_excel(file_path, header=4)
    data = data.dropna(how='all').reset_index(drop=True)
    
    # Handle dynamic column renaming based on actual data
    expected_columns = [
        "Date", "Order_Number", "PO_Number", "Cost_Centre_Number",
        "Cost_Centre_Name_Delivery_Address", "NaN1", "Product",
        "Description", "Qty", "Process_Charge", "Domestic_Charge",
        "International_Charge", "Total_Charge", "No_of_Parcels",
        "Weight", "Cubic", "Con_Number", "Delivery_Courier"
    ]
    
    if data.shape[1] == len(expected_columns):
        data.columns = expected_columns
    else:
        raise ValueError(f"Unexpected number of columns in {file_path}")
    
    data_cleaned = data.drop(columns=["NaN1", "Description", "Cost_Centre_Number"])
    numeric_columns = ["Weight", "Cubic", "Process_Charge", "Domestic_Charge", "International_Charge", "Total_Charge", "No_of_Parcels"]
    for column in numeric_columns:
        data_cleaned[column] = pd.to_numeric(data_cleaned[column], errors='coerce').fillna(0)
    return data_cleaned

# Load all datasets
file_paths = [
    '30247 January 2024.xlsx', '30404-February 2024.xlsx', '30536 - March 2024.xlsx',
    '30663- April 2024.xlsx', '30789 - May 2024.xlsx', '30910 - June 2024.xlsx'
]

all_data = pd.concat([load_and_clean_data(fp) for fp in file_paths])

# Sort by date
all_data['Date'] = pd.to_datetime(all_data['Date'], errors='coerce')
all_data = all_data.sort_values(by='Date').reset_index(drop=True)

# Function to concatenate matching order numbers and preserve all metadata correctly
def concatenate_and_preserve_all_metadata(df):
    result = []
    seen = set()
    
    for idx, row in df.iterrows():
        order_num = row['Order_Number']
        if order_num not in seen:
            matched_rows = df[df['Order_Number'] == order_num]
            
            # Convert all Product values to strings and concatenate
            products_series = matched_rows['Product'].dropna().astype(str).unique()
            if products_series.size > 0:
                products = ', '.join(products_series)
            else:
                filled_products = matched_rows['Product'].ffill().bfill().astype(str)
                products = filled_products.iloc[0] if not filled_products.empty else None
            
            total_qty = matched_rows['Qty'].sum()
            
            # Handle the Cost Centre Address
            if not matched_rows.empty:
                if 'DIG OPTIONS' in matched_rows['Cost_Centre_Name_Delivery_Address'].values:
                    cost_centre_address = matched_rows['Cost_Centre_Name_Delivery_Address'].iloc[-1]
                else:
                    cost_centre_address = matched_rows['Cost_Centre_Name_Delivery_Address'].iloc[0]
            else:
                cost_centre_address = None
            
            new_row = row.copy()
            new_row['Product'] = products
            new_row['Qty'] = total_qty
            new_row['Cost_Centre_Name_Delivery_Address'] = cost_centre_address
            
            try:
                new_row['Date'] = matched_rows['Date'].dropna().iloc[0] if not matched_rows['Date'].dropna().empty else None
                new_row['Weight'] = matched_rows['Weight'].sum()
                new_row['Cubic'] = matched_rows['Cubic'].sum()
                new_row['Con_Number'] = matched_rows['Con_Number'].dropna().iloc[0] if not matched_rows['Con_Number'].dropna().empty else None
                new_row['Delivery_Courier'] = matched_rows['Delivery_Courier'].dropna().iloc[0] if not matched_rows['Delivery_Courier'].dropna().empty else None
                new_row['Total_Charge'] = matched_rows['Total_Charge'].sum()
                new_row['Process_Charge'] = matched_rows['Process_Charge'].sum()
                new_row['Domestic_Charge'] = matched_rows['Domestic_Charge'].sum()
                new_row['International_Charge'] = matched_rows['International_Charge'].sum()
                new_row['No_of_Parcels'] = matched_rows['No_of_Parcels'].sum()
            except Exception as e:
                print(f"Error processing order number {order_num}: {e}")
                continue
            
            result.append(new_row)
            seen.add(order_num)
    
    return pd.DataFrame(result)




In [19]:
# Apply the function to concatenate matching orders and preserve all metadata
final_data = concatenate_and_preserve_all_metadata(all_data)

# Drop any remaining rows with invalid data in key columns
final_data = final_data.dropna(subset=['Order_Number', 'Cost_Centre_Name_Delivery_Address', 'Product', 'Qty'])

# Save the final combined and cleaned data to an Excel file
# final_data.to_excel('Combined_Cleaned_Data.xlsx', index=False)

# print("Data successfully combined and written to 'Combined_Cleaned_Data.xlsx'")

In [20]:
final_data

Unnamed: 0,Date,Order_Number,PO_Number,Cost_Centre_Name_Delivery_Address,Product,Qty,Process_Charge,Domestic_Charge,International_Charge,Total_Charge,No_of_Parcels,Weight,Cubic,Con_Number,Delivery_Courier
0,2024-01-02,1279009.0,TRAU46395/129528,DIG OPTIONS,"TR-GLOVE-09L, TR80-SHIFTER6",2.0,11.30,12.20,0.0,23.50,1.0,2.9,1.0,M7U5139827,eParcel
1,2024-01-02,1279603.0,TRAU46127/126297,DIG OPTIONS,TR80-TMSML3-BLK,1.0,8.45,13.84,0.0,22.29,1.0,5.0,1.0,3303211279603,Direct Freight Express
2,2024-01-02,1279600.0,TRAU45866/126625,DIG OPTIONS,TR80-TMSML3-BLK,1.0,8.45,23.67,0.0,32.12,1.0,5.0,1.0,3303211279600,Direct Freight Express
3,2024-01-02,1279594.0,TRAU45847/125353,DIG OPTIONS,TR80-TMSML3-BLK,1.0,8.45,49.94,0.0,58.39,1.0,5.0,1.0,3303211279594,Direct Freight Express
4,2024-01-02,1279593.0,TRAU45773/124646,DIG OPTIONS,TR80-TMSML3-BLK,1.0,8.45,19.73,0.0,28.18,1.0,5.0,1.0,3303211279593,Direct Freight Express
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4740,2024-06-28,1317140.0,TRAU49252-W6/160000,DIG OPTIONS,TR80-SHELF4-BLK,1.0,8.45,37.93,0.0,46.38,1.0,7.0,1.0,M7U5160840,eParcel
4741,2024-06-28,1317145.0,TRAU49254-W6/160016,DIG OPTIONS,TR80-SHELF4-BLK,1.0,8.45,24.62,0.0,33.07,1.0,7.0,1.0,M7U5160839,eParcel
4742,2024-06-28,1317144.0,TRAU49256-W6/160018,DIG OPTIONS,TR80-TMSML3-BLK,1.0,8.45,16.75,0.0,25.20,1.0,5.0,1.0,M7U5160838,eParcel
4743,2024-06-28,1317206.0,TRAU49265-W6/160045,DIG OPTIONS,TR80-FMMS5-BLK,1.0,8.45,14.78,0.0,23.23,1.0,15.0,1.0,3303211317206,Direct Freight Express


In [15]:
final_data['Product'].unique

<bound method Series.unique of 0                             TR-GLOVE-09L, TR80-SHIFTER6
1                                         TR80-TMSML3-BLK
2                                         TR80-TMSML3-BLK
3                                         TR80-TMSML3-BLK
4                                         TR80-TMSML3-BLK
                              ...                        
4740                                      TR80-SHELF4-BLK
4741                                      TR80-SHELF4-BLK
4742                                      TR80-TMSML3-BLK
4743                                       TR80-FMMS5-BLK
4744    TR-TVSSET, TR-SHAPLATE3, TR80-CABMAN2, TR-KBM4...
Name: Product, Length: 4745, dtype: object>

# Duplicate CHECK TRY 1

In [16]:
# Check for duplicates based on Order_Number
duplicates_before = all_data[all_data.duplicated(subset=['Order_Number'], keep=False)]

# Display the duplicates
if not duplicates_before.empty:
    print("Duplicates found before processing:")
    print(duplicates_before)
else:
    print("No duplicates found before processing.")


Duplicates found before processing:
            Date  Order_Number         PO_Number  \
0     2024-01-02     1279009.0  TRAU46395/129528   
1     2024-01-02     1279603.0  TRAU46127/126297   
2     2024-01-02     1279600.0  TRAU45866/126625   
3     2024-01-02     1279594.0  TRAU45847/125353   
4     2024-01-02     1279593.0  TRAU45773/124646   
...          ...           ...               ...   
17300        NaT     1317327.0  TRAU49263/160089   
17301        NaT     1317327.0  TRAU49263/160089   
17302        NaT     1317327.0  TRAU49263/160089   
17303        NaT     1317327.0  TRAU49263/160089   
17304        NaT     1317327.0  TRAU49263/160089   

                       Cost_Centre_Name_Delivery_Address       Product  Qty  \
0      Attn: JOHNNY TRAN Phone: +61422261645, ,13 OXF...           NaN  NaN   
1      Attn: SCOTT JAMES Phone: 0432 425 250, SCOTT J...           NaN  NaN   
2      Attn: JACINTA JONES Phone: 0408 138 127, JACIN...           NaN  NaN   
3      Attn: JADE MCGUI

In [17]:
# Check for duplicates based on Order_Number after processing
duplicates_after = final_data[final_data.duplicated(subset=['Order_Number'], keep=False)]

# Display the duplicates
if not duplicates_after.empty:
    print("Duplicates found after processing:")
    print(duplicates_after)
else:
    print("No duplicates found after processing.")


No duplicates found after processing.


In [18]:
# Check for exact duplicates across all columns
exact_duplicates = all_data[all_data.duplicated(keep=False)]

# Display the exact duplicates
if not exact_duplicates.empty:
    print("Exact duplicates found:")
    print(exact_duplicates)
else:
    print("No exact duplicates found.")


Exact duplicates found:
      Date  Order_Number            PO_Number  \
4745   NaT           NaN                  NaN   
4905   NaT     1279604.0     TRAU45815/126312   
4906   NaT     1279604.0     TRAU45815/126312   
5325   NaT     1280034.0     TRAU45585/119515   
5329   NaT     1280034.0     TRAU45585/119515   
...    ...           ...                  ...   
15386  NaT     1314254.0     TRAU48758/156419   
16829  NaT     1316635.0  TRAU49167-W6/159464   
16830  NaT     1316635.0  TRAU49167-W6/159464   
16949  NaT     1316729.0     TRAU48904/157727   
16958  NaT     1316729.0     TRAU48904/157727   

      Cost_Centre_Name_Delivery_Address          Product  Qty  Process_Charge  \
4745                                NaN              NaN  NaN             0.0   
4905                        DIG OPTIONS  TR80-TMSML3-BLK  1.0             0.0   
4906                        DIG OPTIONS  TR80-TMSML3-BLK  1.0             0.0   
5325                        DIG OPTIONS    TR80-SCREWNUT  1.0  

# Check 

In [21]:
import pandas as pd

# Assuming you have a list of file paths
file_paths = [
    '30247 January 2024.xlsx', '30404-February 2024.xlsx', '30536 - March 2024.xlsx',
    '30663- April 2024.xlsx', '30789 - May 2024.xlsx', '30910 - June 2024.xlsx'
]

# Load, clean, and append all data together
new_data = pd.concat([
    pd.read_excel(fp, header=4)
    .dropna(how='all')  # Drop rows where all elements are NaN
    .reset_index(drop=True)  # Reset index
    for fp in file_paths
])

# Drop rows where 'Product' column has missing values
new_data = new_data.dropna(subset=['Date']).reset_index(drop=True)



In [22]:
new_data.columns

Index(['Date', 'Order #', 'PO #', 'Cost Centre #',
       'Cost Centre Name/Delivery Address', 'Unnamed: 5', 'Product',
       'Description', 'Qty', 'Process Charge', 'Domestic', 'International',
       'Total Charge', 'No. of Parcels', 'Weight', 'Cubic', 'Con. Number',
       'Delivery Courier'],
      dtype='object')

In [23]:
# Find duplicates in the 'PO #' column
duplicate_order = new_data[new_data.duplicated(subset=['Order #'], keep=False)]

# Display the rows with duplicate 'PO #' values
duplicate_order


Unnamed: 0,Date,Order #,PO #,Cost Centre #,Cost Centre Name/Delivery Address,Unnamed: 5,Product,Description,Qty,Process Charge,Domestic,International,Total Charge,No. of Parcels,Weight,Cubic,Con. Number,Delivery Courier


### deneme

In [14]:
import pandas as pd

# Assuming you have a list of file paths
file_paths = [
    '30247 January 2024.xlsx', '30404-February 2024.xlsx', '30536 - March 2024.xlsx',
    '30663- April 2024.xlsx', '30789 - May 2024.xlsx', '30910 - June 2024.xlsx'
]

# Load, clean, and append all data together
product_cleaned = pd.concat([
    pd.read_excel(fp, header=4)
    .dropna(how='all')  # Drop rows where all elements are NaN
    .reset_index(drop=True)  # Reset index
    for fp in file_paths
])

# Drop rows where 'Product' column has missing values
product_cleaned = product_cleaned.dropna(subset=['Product']).reset_index(drop=True)

### Find exact duplicates

In [None]:
# Check for exact duplicates across all columns
exact_duplicates_try1 = all_data[all_data.duplicated(keep=False)]

# Display the exact duplicates
if not exact_duplicates_try1.empty:
    print("Exact duplicates found:")
    print(exact_duplicates_try1)
else:
    print("No exact duplicates found.")

In [15]:
import pandas as pd

# Assuming you have a list of file paths
file_paths = [
    '30247 January 2024.xlsx', '30404-February 2024.xlsx', '30536 - March 2024.xlsx',
    '30663- April 2024.xlsx', '30789 - May 2024.xlsx', '30910 - June 2024.xlsx'
]

# Load and append all data together
all_data = pd.concat([pd.read_excel(fp, header=4).dropna(how='all').reset_index(drop=True) for fp in file_paths])

# Check for exact duplicates across all columns
exact_duplicates_try1 = all_data[all_data.duplicated(keep=False)]

# Display the exact duplicates
if not exact_duplicates_try1.empty:
    print("Exact duplicates found:")
    print(exact_duplicates_try1)
else:
    print("No exact duplicates found.")


Exact duplicates found:
     Date    Order #                 PO # Cost Centre #  \
0     NaT        NaN                  NaN           NaN   
262   NaT  1279604.0     TRAU45815/126312   DIG OPTIONS   
263   NaT  1279604.0     TRAU45815/126312   DIG OPTIONS   
798   NaT  1280034.0     TRAU45585/119515   DIG OPTIONS   
802   NaT  1280034.0     TRAU45585/119515   DIG OPTIONS   
...   ...        ...                  ...           ...   
552   NaT  1314254.0     TRAU48758/156419   DIG OPTIONS   
2436  NaT  1316635.0  TRAU49167-W6/159464   DIG OPTIONS   
2437  NaT  1316635.0  TRAU49167-W6/159464   DIG OPTIONS   
2586  NaT  1316729.0     TRAU48904/157727   DIG OPTIONS   
2595  NaT  1316729.0     TRAU48904/157727   DIG OPTIONS   

     Cost Centre Name/Delivery Address  Unnamed: 5          Product  \
0                                  NaN         NaN              NaN   
262                        DIG OPTIONS         NaN  TR80-TMSML3-BLK   
263                        DIG OPTIONS         NaN  TR

In [17]:
# Assuming exact_duplicates_try1 is the DataFrame you want to save
output_file_path = 'exact_duplicates_try1.xlsx'

# Write the DataFrame to an Excel file
exact_duplicates_try1.to_excel(output_file_path, index=False)

# index=False ensures that the DataFrame index is not written to the Excel file