In [35]:
from openpyxl import Workbook, load_workbook
import shutil
import os
import zipfile
import mimetypes
import pathlib

def copy_sheet(source_sheet, target_sheet):
    """
    Append the cell values of every row in one sheet to another sheet.

    Args:
        source_sheet: Sheet to read; rows are requested with
            ``iter_rows(values_only=True)``, so only values are transferred.
        target_sheet: Sheet that receives each row through ``append``.

    Returns:
        None
    """
    row_values = source_sheet.iter_rows(values_only=True)
    for values in row_values:
        target_sheet.append(values)

def extract_images_from_excel(path, output_folder_name='extracted_images_sample4'):
    """
    Extracts the embedded images of an Excel file into a single folder.

    The workbook is opened as a zip archive and the PNG/JPEG members stored
    under ``xl/media/`` are streamed straight into the output folder. Nothing
    else is unpacked and no scratch directory is created, so an unrelated
    folder that happens to be called ``temp`` next to the workbook is never
    touched or deleted.

    Args:
        path (pathlib.Path or str): Excel file path.
        output_folder_name (str): Name of the folder (created next to the
            workbook) that receives the extracted images.
            Defaults to 'extracted_images_sample4'.

    Returns:
        new_paths (list[pathlib.Path]): Paths of the extracted images, in
            archive order, named ``<workbook stem>-<n><original suffix>``.

    Raises:
        ValueError: If ``path`` does not have an ``.xlsx`` extension
            (compared case-insensitively).
    """
    # Accept str, pathlib.Path or any other path-like object
    path = pathlib.Path(path)

    # Only .xlsx workbooks are zip archives with an xl/media folder
    if path.suffix.lower() != '.xlsx':
        raise ValueError('Path must be an xlsx file')

    # Workbook name without extension; used as the prefix of every image name
    name = path.stem

    # Create the folder for the extracted images next to the workbook
    output_folder = path.parent / output_folder_name
    output_folder.mkdir(exist_ok=True)

    image_suffixes = {'.png', '.jpg', '.jpeg'}
    media_prefix = 'xl/media/'  # zip member names always use forward slashes

    image_index = 0  # Running number appended to each extracted image name
    new_paths = []  # Paths of the extracted images

    with zipfile.ZipFile(path, 'r') as archive:
        for member in archive.infolist():
            if member.is_dir() or not member.filename.startswith(media_prefix):
                continue

            suffix = pathlib.PurePosixPath(member.filename).suffix.lower()
            if suffix not in image_suffixes:
                continue

            image_index += 1

            # Keep the real suffix so a JPEG is not mislabelled as a PNG
            new_path = output_folder / f'{name}-{image_index}{suffix}'

            # Stream the member to its destination; the target path is built
            # by this function, never taken from the archive's own paths
            with archive.open(member) as source, open(new_path, 'wb') as target:
                shutil.copyfileobj(source, target)

            new_paths.append(new_path)

    # Return the list of paths to the extracted images
    return new_paths

def merge_and_extract_images(workbook_files, output_filename, output_folder_name='extracted_images'):
    """
    Merges multiple Excel workbooks while extracting images and storing them in a folder.

    Every sheet of every input workbook is copied (values only) into a new
    workbook. A sheet whose name is already taken gets a " (n)" suffix. Images
    are extracted once per input workbook, not once per sheet, so the returned
    list contains each image path a single time.

    Args:
        workbook_files (list[str]): List of paths to Excel workbook files.
        output_filename (str): Name of the merged Excel workbook.
        output_folder_name (str): Name of the folder to store the extracted images.
            Defaults to 'extracted_images'.

    Returns:
        new_paths (list[pathlib.Path]): Paths of all extracted images, in the
            order the workbooks were given.
    """
    max_title_length = 31  # Excel's limit for a worksheet title

    merged_workbook = Workbook()  # Create a new workbook to merge all data into
    # Drop the blank default sheet so it neither ends up in the merged file nor
    # collides with a source sheet that is itself called "Sheet"
    merged_workbook.remove(merged_workbook.active)

    used_titles = set()  # Lower-cased titles already present in the merged workbook
    new_paths = []  # List to store the paths of the extracted images

    for file in workbook_files:
        # Load each workbook (cached values instead of formulas)
        wb = load_workbook(file, data_only=True)

        for sheet_name in wb.sheetnames:
            # Pick a title that is not taken yet. Titles are compared
            # case-insensitively, and the base name is shortened when needed
            # so the suffixed title still fits the 31-character limit.
            merged_sheet_name = sheet_name
            suffix = 1
            while merged_sheet_name.lower() in used_titles:
                tag = f" ({suffix})"
                merged_sheet_name = f"{sheet_name[:max_title_length - len(tag)]}{tag}"
                suffix += 1
            used_titles.add(merged_sheet_name.lower())

            # Create the sheet in the merged workbook and copy the values over
            merged_sheet = merged_workbook.create_sheet(title=merged_sheet_name)
            copy_sheet(wb[sheet_name], merged_sheet)

        # Images belong to the workbook file as a whole, so extract them once
        # per file rather than once per sheet
        new_paths.extend(extract_images_from_excel(file, output_folder_name))

    # A workbook needs at least one sheet to be saved; recreate an empty one
    # when there was nothing to merge
    if not merged_workbook.sheetnames:
        merged_workbook.create_sheet()

    # Save the merged workbook
    merged_workbook.save(output_filename)

    # Return the list of paths to the extracted images
    return new_paths


# Example usage: merge the NY, NJ and CT workbooks and collect their images.
# The image folder is created next to each source workbook; the merged
# workbook is saved under the relative name given below.
workbook_files = [
    "C:/Users/shres/Downloads/MNS BARTRACK NY.xlsx",
    "C:/Users/shres/Downloads/MNS BARTRACK NJ.xlsx",
    "C:/Users/shres/Downloads/MNS BARTRACK CT.xlsx",
]
output_filename = 'Merged_Bartracks7.xlsx'
output_folder_name = 'extracted_images2'
extracted_image_paths = merge_and_extract_images(
    workbook_files=workbook_files,
    output_filename=output_filename,
    output_folder_name=output_folder_name,
)



In [37]:
import pandas as pd
import numpy as np


def standardize_excel_data(file_path):
    """
    Reshape every "Group Name:" sheet of a workbook into one combined table.

    Each sheet is read without a header row. A sheet is processed only when
    the literal "Group Name:" appears in its top-left 5x5 cells; it is then
    transposed so that the first sheet column supplies the column labels,
    fully empty rows/columns are dropped, and:

    * 'Group' and 'MNS Plan ID' are derived from the sheet name (text before
      and from the first "MNS"; the whole name and "" when "MNS" is absent),
    * the four group-level columns are filled with their first-row value,
    * the column whose label is missing is named 'Benefits Status' and
      forward-filled,
    * 'General Information' is renamed to 'Plan Information'.

    Sheets that match but contain no data rows are skipped.

    Args:
        file_path (str or path-like): Excel workbook to read.

    Returns:
        pandas.DataFrame or None: All processed sheets concatenated with a
            fresh index, or None (after printing a message) when no sheet
            qualified.

    Raises:
        KeyError: If a qualifying sheet lacks one of the group-level columns
            or has no unlabelled column to become 'Benefits Status'.
    """
    marker = "Group Name:"
    # Values that appear once per sheet and must be repeated on every row
    group_level_columns = [
        'Group Name:',
        'Group Number(or TBD)',
        'Request Date:',
        'Effective Date or NSB:',
    ]

    def _relabel(label):
        # A missing label is matched with pd.isna rather than a dict keyed on
        # NaN, because NaN lookups in a dict depend on object identity
        if pd.isna(label):
            return 'Benefits Status'
        if label == 'General Information':
            return 'Plan Information'
        return label

    # Initialize an empty list to store dataframes
    dfs = []

    # Open the workbook once and make sure the handle is closed afterwards
    with pd.ExcelFile(file_path) as xls:
        # Iterate through each sheet
        for sheet_name in xls.sheet_names:
            raw = xls.parse(sheet_name=sheet_name, header=None)

            # Check if "Group Name:" exists in the top-left cells. isin() is
            # used because `in` on a NumPy array warns when the array and the
            # string cannot be compared elementwise.
            if not raw.iloc[:5, :5].isin([marker]).to_numpy().any():
                continue

            # Rows become plans; the first sheet column becomes the header
            df = raw.transpose()
            df.columns = df.iloc[0]
            df = df[1:]

            # Remove empty rows and columns
            df = df.dropna(how="all").dropna(axis=1, how="all")

            if df.empty:
                # Nothing to standardize; `.iloc[0]` below would fail
                continue

            # Find the index where "MNS" starts
            mns_index = sheet_name.find("MNS")

            if mns_index != -1:  # If "MNS" is found
                group_name = sheet_name[:mns_index].strip()  # Part before "MNS"
                mns_id = sheet_name[mns_index:].strip()  # Part starting with "MNS"
            else:
                group_name = sheet_name.strip()  # Whole name is the group name
                mns_id = ""  # No MNS ID available

            df['Group'] = group_name
            df['MNS Plan ID'] = mns_id

            # Fill missing values in the group-level columns. The result is
            # assigned back explicitly; an in-place fillna on `df[col]` is
            # chained assignment and does not update `df` under Copy-on-Write.
            for column in group_level_columns:
                df[column] = df[column].fillna(df[column].iloc[0])

            df = df.rename(columns=_relabel)

            # Fill missing values in 'Benefits Status' column with forward fill
            df['Benefits Status'] = df['Benefits Status'].ffill()

            dfs.append(df)

    if dfs:  # Check if there are any dataframes in the list
        # Kept from the original behavior: widen the global display option so
        # the notebook shows every column of the result
        pd.set_option('display.max_columns', None)

        # Concatenate all dataframes into one
        concatenated_df = pd.concat(dfs, ignore_index=True)
        return concatenated_df
    else:
        print("No sheet found with 'Group Name:' in the top-left cells.")
        return None



# Example usage: standardize the NJ workbook
file_path = r"C:\Users\shres\Downloads\MNS BARTRACK NJ.xlsx"
final_df = standardize_excel_data(file_path=file_path)
# Bare last expression so the notebook renders the resulting frame
final_df

  if "Group Name:" in df.iloc[:5, :5].values:
  if "Group Name:" in df.iloc[:5, :5].values:
  if "Group Name:" in df.iloc[:5, :5].values:


Unnamed: 0,Group Name:,Group Number(or TBD),Request Date:,Effective Date or NSB:,Benefits Status,Plan Information,CSP,Base (Similar standard plan) Tracking ID,No of Employees Enrolled:,"Group State (NY,NJ,CT)",Market (Large or Small),Product,Access,Network,PCP/Specialist OV Copay,ER Cost Share,Hospital Cost Share,IN Deductible,IN Coinsurance %,IN Coinsurance Limit,IN Out of Pocket Maximum,OON Deductible,OON Coinsurance %,OON Coinsurance Limit,OON Out of Pocket Maximum,Group,MNS Plan ID
0,Intel Inc,25466,2020-04-03,2020-05-03,Benefits Currently in Place,Current Plan 1: P0522656 BUY UP,Freedom HMO 2353,,216,NJ,Large,HMO,Non Gated,Freedom,35/45,150.0,IP: 0 after deductible OP: 0 after deductible,2505/5000,100,3000/6000,3000/6001,2500/5002,50,32000/50002,32000/50002,Intel,MNS011
1,Intel Inc,25466,2020-04-03,2020-05-03,Benefits Currently in Place,Current Plan 2: P01456565 BUY UP,Freedom HMO 2328,,558,NJ,Large,HMO,Non Gated,Freedom,30/50,160.0,IP: 0 after deductible OP: 0 after deductible,2600/5000,90,32000/50000,32000/50000,2500/5003,52,32000/50003,32000/50003,Intel,MNS011
2,Intel Inc,25466,2020-04-03,2020-05-03,Benefits Currently in Place,Current Plan 3: P0145445 BUY UP,Freedom HMO 2789,,240,NJ,Large,HMO,Non Gated,Freedom,0/0,,IP: 0 after deductible OP: 0 after deductible,2800/5001,91,32000/50001,32000/50001,2500/5004,53,32000/50004,32000/50004,Intel,MNS011
3,Intel Inc,25466,2020-04-03,2020-05-03,Requested Benefits,Current Plan 1: P0145365 BUY UP,Freedom HMO 2353,KACTOHP77- MNS000011,600,NJ,Large,HMO,Non Gated,Freedom,30/55,,IP: 0 after deductible OP: 0 after deductible,2500/5002,92,32000/50002,32000/50002,2500/5002,54,32000/50002,32000/50002,Intel,MNS011
4,Intel Inc,25466,2020-04-03,2020-05-03,Requested Benefits,Current Plan 2: P0145488 BUY UP,Freedom HMO 2328,KACTOHP77- MNS000012,500,NJ,Large,HMO,Non Gated,Freedom,30/60,,IP: 0 after deductible OP: 0 after deductible,2500/5003,93,32000/50003,32000/50003,2500/5003,55,32000/50003,32000/50003,Intel,MNS011
5,Intel Inc,25466,2020-04-03,2020-05-03,Requested Benefits,Current Plan 3: P0145898 BUY UP,Freedom HMO 2789,KACTOHP77- MNS000013,250,NJ,Large,HMO,Non Gated,Freedom,0/1,,IP: 0 after deductible OP: 0 after deductible,2500/5004,94,32000/50004,32000/50004,2500/5004,56,32000/50004,32000/50004,Intel,MNS011
6,Apple Inc,1254689,2020-04-02,2020-01-03,Benefits Currently in Place,Current Plan 1: P0145445 BUY UP,Freedom HMO 2323,,23,NJ,Large,HMO,Non Gated,Freedom,30/45,150.0,IP: 0 after deductible OP: 0 after deductible,2500/5000,100,3000/6000,3000/6001,2500/5002,92,32000/50002,32000/50002,Apple Inc.,MNS025
7,Apple Inc,1254689,2020-04-02,2020-01-03,Benefits Currently in Place,Current Plan 2: P0145445 BUY UP,Freedom HMO 2324,,25,NJ,Large,HMO,Non Gated,Freedom,30/50,160.0,IP: 0 after deductible OP: 0 after deductible,2500/5000,90,32000/50000,32000/50000,2500/5003,93,32000/50003,32000/50003,Apple Inc.,MNS025
8,Apple Inc,1254689,2020-04-02,2020-01-03,Benefits Currently in Place,Current Plan 3: P0145445 BUY UP,Freedom HMO 2325,,45,NJ,Large,HMO,Non Gated,Freedom,0/0,,IP: 0 after deductible OP: 0 after deductible,2500/5001,91,32000/50001,32000/50001,2500/5004,94,32000/50004,32000/50004,Apple Inc.,MNS025
9,Apple Inc,1254689,2020-04-02,2020-01-03,Requested Benefits,Current Plan 1: P0145445 BUY UP,Freedom HMO 2326,KACTOHP58- MNS000025,36,NJ,Large,HMO,Non Gated,Freedom,30/55,,IP: 0 after deductible OP: 0 after deductible,2500/5002,92,32000/50002,32000/50002,2500/5002,92,32000/50002,32000/50002,Apple Inc.,MNS025
