In [None]:
import pandas as pd
import pdfplumber

# 1. Load the Excel workbook and total the amounts per ISIN
excel_file = 'your_isin_file.xlsx'
df_excel = pd.read_excel(excel_file)

# One row per ISIN; as_index=False keeps 'ISIN' as a regular column
# next to the summed 'Amount' column.
df_grouped = df_excel.groupby('ISIN', as_index=False)['Amount'].sum()

# 2. Extract amount from PDF (assuming a table structure)
# Every page's first detected table is read with its first row as the header;
# the header must contain the 'Fund ISIN' and 'Amount' columns.
pdf_file = 'your_client_pdf.pdf'
matching_rows = []  # rows for the wanted ISIN, collected across ALL pages
with pdfplumber.open(pdf_file) as pdf:
    for page in pdf.pages:
        table = page.extract_table()
        # extract_table() yields None when the page has no table; skip such
        # pages instead of failing on table[1:].
        if not table:
            continue
        # Convert to DataFrame for easier manipulation (row 0 is the header)
        pdf_data = pd.DataFrame(table[1:], columns=table[0])
        # Keep the rows whose ISIN matches
        page_hits = pdf_data[pdf_data['Fund ISIN'] == 'Your ISIN']
        if not page_hits.empty:
            matching_rows.append(page_hits)

# Combine the matches from every page. Previously only the last page's rows
# survived the loop, and a PDF without pages left the name undefined.
if matching_rows:
    pdf_isin_row = pd.concat(matching_rows, ignore_index=True)
else:
    pdf_isin_row = pd.DataFrame(columns=['Fund ISIN', 'Amount'])

# Get the relevant amount from the PDF rows (0.0 when nothing matched).
# NOTE: this is a single scalar for the one ISIN filtered above.
pdf_isin_amount = pdf_isin_row['Amount'].astype(float).sum()

# 3. Build the Excel-vs-PDF comparison table
# 'PDF Amount' is the scalar pdf_isin_amount, so it is repeated on every row.
comparison_df = pd.DataFrame(
    data={
        'ISIN': df_grouped['ISIN'],
        'Excel Amount': df_grouped['Amount'],
        'PDF Amount': pdf_isin_amount,
    }
)

# Absolute difference (Excel minus PDF) and the same difference expressed as
# a percentage of the Excel amount.
comparison_df['Difference'] = comparison_df['Excel Amount'].sub(comparison_df['PDF Amount'])
comparison_df['% Difference'] = (
    comparison_df['Difference'].div(comparison_df['Excel Amount']).mul(100)
)

print(comparison_df)


In [None]:
import pandas as pd
import re

# Read every sheet of the workbook; sheet_name=None returns a dict that maps
# each sheet name to its DataFrame.
excel_file = 'your_isin_file_with_multiple_sheets.xlsx'
all_sheets = pd.read_excel(excel_file, sheet_name=None)

# Parallel accumulators: one entry per finalized ISIN group
# (five independent, initially empty lists).
isins, trader_names_list, usd_diffs, percent_diffs, sheet_names = [], [], [], [], []

# Helper that strips parenthesised parts (e.g. a trailing "(3)") from sheet names
def clean_sheet_name(sheet_name):
    """Return *sheet_name* without any "(...)" groups.

    Each parenthesised group is removed together with the whitespace
    directly in front of it; text without a closed pair of parentheses is
    returned unchanged.
    """
    bracketed_part = re.compile(r"\s*\(.*?\)")
    return bracketed_part.sub("", sheet_name)

# Process each sheet in the Excel file.
# A group is a run of rows that carry an ISIN, closed by a row in which both
# 'ISIN' and 'Trader name' are empty; that closing row supplies the 'Diff'
# and '%diff' values recorded for the group.
for sheet_name, df in all_sheets.items():

    # Only process if the sheet has entries (non-empty DataFrame)
    if len(df) == 0:
        continue

    # First ISIN seen in the group currently being collected, and its traders
    current_isin = None
    trader_names = []

    for _, row in df.iterrows():
        isin = row.get('ISIN')
        trader = row.get('Trader name')

        # A row with an ISIN belongs to the current group
        if pd.notna(isin):
            if current_isin is None:  # Start a new ISIN group
                current_isin = isin
            # An empty trader cell is NaN (a float) and would make
            # ', '.join() raise TypeError, so only real values are kept,
            # converted to str for the same reason.
            if pd.notna(trader):
                trader_names.append(str(trader))
            continue

        # ISIN is empty here; only a row whose trader is empty as well is the
        # closing row that holds the difference data. Compare against None
        # (not truthiness) so a falsy ISIN value still closes its group.
        if pd.isna(trader) and current_isin is not None:
            isins.append(current_isin)
            trader_names_list.append(', '.join(trader_names))  # Concatenate trader names
            usd_diffs.append(row['Diff'])  # USD difference
            percent_diffs.append(row['%diff'])  # Percentage difference
            sheet_names.append(clean_sheet_name(sheet_name))  # Cleaned sheet name

            # Reset for the next ISIN group
            current_isin = None
            trader_names = []

# Assemble the collected lists into one table: each list becomes a column,
# paired with its output header in the same order.
result_df = pd.DataFrame(
    dict(
        zip(
            ['ISIN', 'Trader Names', 'USD Difference', '% Difference', 'Sheet Name'],
            [isins, trader_names_list, usd_diffs, percent_diffs, sheet_names],
        )
    )
)

# Write the table to a new workbook (without the index column), then show it
result_df.to_excel(excel_writer='processed_isin_data_all_sheets.xlsx', index=False)
print(result_df)