In [None]:
import os
import pandas as pd

def process_folder(folder_path):
    """Run ``process_excel`` on every Excel workbook in ``folder_path`` and print each result.

    Only entries directly inside the folder are considered (no recursion).
    Files are visited in sorted name order so repeated runs produce the same
    output order.

    Parameters
    ----------
    folder_path : str
        Directory to scan for ``.xlsx`` files.

    Raises
    ------
    Whatever ``process_excel`` raises; nothing is caught here.
    """
    for file_name in sorted(os.listdir(folder_path)):
        # Files starting with '~$' are the lock files Excel leaves next to an
        # open workbook; they end in .xlsx but are not readable workbooks.
        if file_name.startswith('~$'):
            continue
        # Accept '.XLSX' as well as '.xlsx'.
        if not file_name.lower().endswith('.xlsx'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # The result used to be computed and thrown away; show it, as the
        # later revisions of this function do.
        print(process_excel(file_path))

def process_excel(file_path):
    """Turn the benefit rows of an Excel sheet into a two-column label/value table.

    The sheet that ``pd.read_excel(file_path)`` returns by default is reduced
    to the rows whose first column is one of a fixed set of benefit labels.
    In-network and out-of-network values are stacked into one ``'Value'``
    column, and each row gets a label in ``'Network Type'``:

    * ``'<network> <field>'`` for deductible / out-of-pocket rows
      (``'In Network Individual Deductible'`` ...),
    * ``' <field>'`` (leading space, no network) for the service rows,
    * ``'Coinsurance'`` for any row whose value contains ``'%'``
      (identical coinsurance rows are collapsed and listed first).

    Parameters
    ----------
    file_path : str
        Path of the workbook to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Network Type'`` and ``'Value'``, index ``0..n-1``.

    Raises
    ------
    KeyError
        If the filtered sheet ends up without a ``'Field'`` or ``'Value'``
        column.
    """
    # Global display option, kept because the caller prints whole frames.
    pd.set_option('display.max_columns', None)
    df = pd.read_excel(file_path)

    # Drop columns with more than 94% null values
    null_percentage = df.isnull().mean() * 100
    df = df.drop(columns=null_percentage[null_percentage > 94].index)

    # Keep only the rows whose first column is one of these labels
    phrases_to_keep = ['Annual Medical Deductible', 'Individual', 'Family', 'Annual Out-of-Pocket Limit', 'Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care']
    df = df[df.iloc[:, 0].isin(phrases_to_keep)]

    # Drop columns that are empty for the kept rows
    df = df.dropna(axis=1, how='all')

    # Name the first (up to) three surviving columns by position. A single
    # rename call avoids renaming a freshly named column a second time.
    df = df.rename(columns=dict(zip(df.columns, ['Field', 'In Network', 'Out of Network'])))

    # Stack the two value columns under one header row each
    if 'Out of Network' not in df.columns:
        result_df = df.copy()
    else:
        in_part = pd.concat([pd.DataFrame({'Field': ['In Network']}), df[['Field', 'In Network']]])
        out_part = pd.concat([pd.DataFrame({'Field': ['Out of Network']}), df[['Field', 'Out of Network']]])
        result_df = pd.concat([in_part, out_part], ignore_index=True)

    # Fill each row right-to-left so out-of-network values land in
    # 'In Network'. bfill(axis=1) is the direct spelling of the former
    # reverse / fillna(method='ffill') / reverse, which is deprecated.
    result_df_filled = result_df.bfill(axis=1)
    # Drop the now redundant value column. Dropping 'Field' here (as before)
    # made every later row['Field'] lookup raise KeyError.
    result_df_filled = result_df_filled.drop(columns=['Out of Network'], errors='ignore')
    result_df_filled = result_df_filled.rename(columns={'In Network': 'Value'})

    # Qualify the 'Individual' / 'Family' rows that follow a section heading.
    # Work on positions and check the label, so a missing row or a
    # non-contiguous index cannot relabel the wrong row or append new rows.
    section_suffix = {'Annual Medical Deductible': 'Deductible', 'Annual Out-of-Pocket Limit': 'OOP'}
    fields = result_df_filled['Field'].tolist()
    for pos, heading in enumerate(list(fields)):
        suffix = section_suffix.get(heading)
        if suffix is None:
            continue
        for nxt in range(pos + 1, min(pos + 3, len(fields))):
            if fields[nxt] in ('Individual', 'Family'):
                fields[nxt] = f'{fields[nxt]} {suffix}'
    result_df_filled['Field'] = fields

    # The headings themselves carry no value
    result_df_filled = result_df_filled[~result_df_filled['Field'].isin(list(section_suffix))].copy()

    # Network label per row: starts at an 'In Network' / 'Out of Network'
    # header row and stops at the first service row. A new header always
    # restarts the label, so one section cannot run into the next.
    networks = ('In Network', 'Out of Network')
    services = ('Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care')
    network_types = []
    current = ''
    for field in result_df_filled['Field']:
        if field in services:
            current = ''
        elif field in networks:
            current = field
        network_types.append(current)
    result_df_filled.insert(0, 'Network Type', network_types)

    # Remove rows where a value is repeated within the row; this is what
    # removes the header rows ('In Network' appears in two columns).
    has_repeat = pd.Series(
        [len(set(row.dropna())) != len(row.dropna()) for _, row in result_df_filled.iterrows()],
        index=result_df_filled.index,
        dtype=bool,
    )
    result_df_filled = result_df_filled.loc[~has_repeat].copy()

    # Merge the network label into the field name, then drop the field column
    result_df_filled['Network Type'] = result_df_filled['Network Type'] + ' ' + result_df_filled['Field']
    result_df_filled = result_df_filled.drop(columns=['Field'])

    # Any value containing '%' is reported as coinsurance
    is_percentage = result_df_filled['Value'].map(lambda value: '%' in str(value)).astype(bool)
    result_df_filled.loc[is_percentage, 'Network Type'] = 'Coinsurance'

    # De-duplicated coinsurance rows first, everything else after
    coinsurance = result_df_filled['Network Type'] == 'Coinsurance'
    result_df = pd.concat([result_df_filled[coinsurance].drop_duplicates(), result_df_filled[~coinsurance]])
    result_df = result_df.reset_index(drop=True)

    return result_df

# Example usage: process every workbook found in the folder below.
process_folder(folder_path=r'C:\Users\shres\Downloads')  # Replace with your folder path

In [None]:
import os
import pandas as pd

def process_folder(folder_path):
    """Run ``process_excel`` on every Excel workbook in ``folder_path`` and print each result.

    Only entries directly inside the folder are considered (no recursion).
    Files are visited in sorted name order so repeated runs produce the same
    output order. A workbook for which ``process_excel`` raises ``KeyError``
    is reported on stdout and skipped.

    Parameters
    ----------
    folder_path : str
        Directory to scan for ``.xlsx`` files.
    """
    for file_name in sorted(os.listdir(folder_path)):
        # Files starting with '~$' are the lock files Excel leaves next to an
        # open workbook; they end in .xlsx but are not readable workbooks.
        if file_name.startswith('~$'):
            continue
        # Accept '.XLSX' as well as '.xlsx'.
        if not file_name.lower().endswith('.xlsx'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            result_df = process_excel(file_path)
        except KeyError:
            print(f"Ignoring file: {file_path} - KeyError occurred.")
            continue
        print(result_df)

def process_excel(file_path):
    """Turn the benefit rows of an Excel sheet into a two-column label/value table.

    The sheet that ``pd.read_excel(file_path)`` returns by default is reduced
    to the rows whose first column is one of a fixed set of benefit labels.
    In-network and out-of-network values are stacked into one ``'Value'``
    column, and each row gets a label in ``'Network Type'``:

    * ``'<network> <field>'`` for deductible / out-of-pocket rows
      (``'In Network Individual Deductible'`` ...),
    * ``' <field>'`` (leading space, no network) for the service rows,
    * ``'Coinsurance'`` for any row whose value contains ``'%'``
      (identical coinsurance rows are collapsed and listed first).

    Parameters
    ----------
    file_path : str
        Path of the workbook to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Network Type'`` and ``'Value'``, index ``0..n-1``.

    Raises
    ------
    KeyError
        If the filtered sheet ends up without a ``'Field'`` or ``'Value'``
        column.
    """
    # Global display option, kept because the caller prints whole frames.
    pd.set_option('display.max_columns', None)
    df = pd.read_excel(file_path)

    # Drop columns with more than 94% null values
    null_percentage = df.isnull().mean() * 100
    df = df.drop(columns=null_percentage[null_percentage > 94].index)

    # Keep only the rows whose first column is one of these labels
    phrases_to_keep = ['Annual Medical Deductible', 'Individual', 'Family', 'Annual Out-of-Pocket Limit', 'Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care']
    df = df[df.iloc[:, 0].isin(phrases_to_keep)]

    # Drop columns that are empty for the kept rows
    df = df.dropna(axis=1, how='all')

    # Name the first (up to) three surviving columns by position. A single
    # rename call avoids renaming a freshly named column a second time.
    df = df.rename(columns=dict(zip(df.columns, ['Field', 'In Network', 'Out of Network'])))

    # Stack the two value columns under one header row each
    if 'Out of Network' not in df.columns:
        result_df = df.copy()
    else:
        in_part = pd.concat([pd.DataFrame({'Field': ['In Network']}), df[['Field', 'In Network']]])
        out_part = pd.concat([pd.DataFrame({'Field': ['Out of Network']}), df[['Field', 'Out of Network']]])
        result_df = pd.concat([in_part, out_part], ignore_index=True)

    # Fill each row right-to-left so out-of-network values land in
    # 'In Network'. bfill(axis=1) is the direct spelling of the former
    # reverse / fillna(method='ffill') / reverse, which is deprecated.
    result_df_filled = result_df.bfill(axis=1)
    # Drop the now redundant value column. Dropping 'Field' here (as before)
    # made every later row['Field'] lookup raise KeyError, which the caller
    # reported as "Ignoring file" for every workbook.
    result_df_filled = result_df_filled.drop(columns=['Out of Network'], errors='ignore')
    result_df_filled = result_df_filled.rename(columns={'In Network': 'Value'})

    # Qualify the 'Individual' / 'Family' rows that follow a section heading.
    # Work on positions and check the label, so a missing row or a
    # non-contiguous index cannot relabel the wrong row or append new rows.
    section_suffix = {'Annual Medical Deductible': 'Deductible', 'Annual Out-of-Pocket Limit': 'OOP'}
    fields = result_df_filled['Field'].tolist()
    for pos, heading in enumerate(list(fields)):
        suffix = section_suffix.get(heading)
        if suffix is None:
            continue
        for nxt in range(pos + 1, min(pos + 3, len(fields))):
            if fields[nxt] in ('Individual', 'Family'):
                fields[nxt] = f'{fields[nxt]} {suffix}'
    result_df_filled['Field'] = fields

    # The headings themselves carry no value
    result_df_filled = result_df_filled[~result_df_filled['Field'].isin(list(section_suffix))].copy()

    # Network label per row: starts at an 'In Network' / 'Out of Network'
    # header row and stops at the first service row. A new header always
    # restarts the label, so one section cannot run into the next.
    networks = ('In Network', 'Out of Network')
    services = ('Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care')
    network_types = []
    current = ''
    for field in result_df_filled['Field']:
        if field in services:
            current = ''
        elif field in networks:
            current = field
        network_types.append(current)
    result_df_filled.insert(0, 'Network Type', network_types)

    # Remove rows where a value is repeated within the row; this is what
    # removes the header rows ('In Network' appears in two columns).
    has_repeat = pd.Series(
        [len(set(row.dropna())) != len(row.dropna()) for _, row in result_df_filled.iterrows()],
        index=result_df_filled.index,
        dtype=bool,
    )
    result_df_filled = result_df_filled.loc[~has_repeat].copy()

    # Merge the network label into the field name, then drop the field column
    result_df_filled['Network Type'] = result_df_filled['Network Type'] + ' ' + result_df_filled['Field']
    result_df_filled = result_df_filled.drop(columns=['Field'])

    # Any value containing '%' is reported as coinsurance
    is_percentage = result_df_filled['Value'].map(lambda value: '%' in str(value)).astype(bool)
    result_df_filled.loc[is_percentage, 'Network Type'] = 'Coinsurance'

    # De-duplicated coinsurance rows first, everything else after
    coinsurance = result_df_filled['Network Type'] == 'Coinsurance'
    result_df = pd.concat([result_df_filled[coinsurance].drop_duplicates(), result_df_filled[~coinsurance]])
    result_df = result_df.reset_index(drop=True)

    return result_df

# Example usage: process every workbook found in the folder below.
process_folder(folder_path=r'C:\Users\shres\Downloads')  # Replace with your folder path


In [None]:
import os
import pandas as pd
import openpyxl

def get_sheet_names(file_path):
    """Return the worksheet names of the workbook at ``file_path``.

    The workbook is opened with openpyxl in read-only mode and is closed again
    before this function returns or raises.

    Parameters
    ----------
    file_path : str
        Path of the workbook to inspect.

    Returns
    -------
    list
        The value of the workbook's ``sheetnames`` attribute.
    """
    wb = openpyxl.load_workbook(file_path, read_only=True)
    try:
        return wb.sheetnames
    finally:
        # Close even when reading the names fails, so the file handle held by
        # the read-only workbook is not leaked.
        wb.close()

def process_folder(folder_path):
    """Run ``process_excel`` on every sheet of every workbook in ``folder_path`` and print each result.

    Only entries directly inside the folder are considered (no recursion).
    Files are visited in sorted name order. ``KeyError`` is handled at two
    levels: a workbook whose sheet names cannot be read is skipped as a whole,
    and a sheet that cannot be processed is skipped on its own so the
    remaining sheets of the same workbook are still processed.

    Parameters
    ----------
    folder_path : str
        Directory to scan for ``.xlsx`` files.
    """
    for file_name in sorted(os.listdir(folder_path)):
        # Files starting with '~$' are the lock files Excel leaves next to an
        # open workbook; they end in .xlsx but are not readable workbooks.
        if file_name.startswith('~$'):
            continue
        # Accept '.XLSX' as well as '.xlsx'.
        if not file_name.lower().endswith('.xlsx'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            sheet_names = get_sheet_names(file_path)
        except KeyError:
            print(f"Ignoring file: {file_path} - KeyError occurred.")
            continue
        for sheet_name in sheet_names:
            # Handle the error per sheet: previously the first unusable sheet
            # (e.g. a cover page) discarded every sheet after it.
            try:
                result_df = process_excel(file_path, sheet_name)
            except KeyError:
                print(f"Ignoring sheet {sheet_name!r} of file: {file_path} - KeyError occurred.")
                continue
            print(result_df)

def process_excel(file_path, sheet_name):
    """Turn the benefit rows of one worksheet into a label/value table tagged with the sheet name.

    The sheet is reduced to the rows whose first column is one of a fixed set
    of benefit labels. In-network and out-of-network values are stacked into
    one ``'Value'`` column, and each row gets a label in ``'Network Type'``:

    * ``'<network> <field>'`` for deductible / out-of-pocket rows
      (``'In Network Individual Deductible'`` ...),
    * ``' <field>'`` (leading space, no network) for the service rows,
    * ``'Coinsurance'`` for any row whose value contains ``'%'``
      (identical coinsurance rows are collapsed and listed first).

    Parameters
    ----------
    file_path : str
        Path of the workbook to read.
    sheet_name : str
        Sheet to read; passed to ``pd.read_excel`` as ``sheet_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Name'``, ``'Network Type'`` and ``'Value'``. The first two
        rows hold ``'Sheet Name'`` and ``sheet_name`` in ``'Name'`` (other
        columns empty); the benefit rows follow with ``'Name'`` empty.

    Raises
    ------
    KeyError
        If the filtered sheet ends up without a ``'Field'`` or ``'Value'``
        column.
    """
    # Global display option, kept because the caller prints whole frames.
    pd.set_option('display.max_columns', None)
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    # Drop columns with more than 94% null values
    null_percentage = df.isnull().mean() * 100
    df = df.drop(columns=null_percentage[null_percentage > 94].index)

    # Keep only the rows whose first column is one of these labels
    phrases_to_keep = ['Annual Medical Deductible', 'Individual', 'Family', 'Annual Out-of-Pocket Limit', 'Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care']
    df = df[df.iloc[:, 0].isin(phrases_to_keep)]

    # Drop columns that are empty for the kept rows
    df = df.dropna(axis=1, how='all')

    # Name the first (up to) three surviving columns by position. A single
    # rename call avoids renaming a freshly named column a second time.
    df = df.rename(columns=dict(zip(df.columns, ['Field', 'In Network', 'Out of Network'])))

    # Stack the two value columns under one header row each
    if 'Out of Network' not in df.columns:
        result_df = df.copy()
    else:
        in_part = pd.concat([pd.DataFrame({'Field': ['In Network']}), df[['Field', 'In Network']]])
        out_part = pd.concat([pd.DataFrame({'Field': ['Out of Network']}), df[['Field', 'Out of Network']]])
        result_df = pd.concat([in_part, out_part], ignore_index=True)

    # Fill each row right-to-left so out-of-network values land in
    # 'In Network'. bfill(axis=1) is the direct spelling of the former
    # reverse / fillna(method='ffill') / reverse, which is deprecated.
    result_df_filled = result_df.bfill(axis=1)
    # Drop the now redundant value column. Dropping 'Field' here (as before)
    # made every later row['Field'] lookup raise KeyError, which the caller
    # reported as "Ignoring file" for every workbook.
    result_df_filled = result_df_filled.drop(columns=['Out of Network'], errors='ignore')
    result_df_filled = result_df_filled.rename(columns={'In Network': 'Value'})

    # Qualify the 'Individual' / 'Family' rows that follow a section heading.
    # Work on positions and check the label, so a missing row or a
    # non-contiguous index cannot relabel the wrong row or append new rows.
    section_suffix = {'Annual Medical Deductible': 'Deductible', 'Annual Out-of-Pocket Limit': 'OOP'}
    fields = result_df_filled['Field'].tolist()
    for pos, heading in enumerate(list(fields)):
        suffix = section_suffix.get(heading)
        if suffix is None:
            continue
        for nxt in range(pos + 1, min(pos + 3, len(fields))):
            if fields[nxt] in ('Individual', 'Family'):
                fields[nxt] = f'{fields[nxt]} {suffix}'
    result_df_filled['Field'] = fields

    # The headings themselves carry no value
    result_df_filled = result_df_filled[~result_df_filled['Field'].isin(list(section_suffix))].copy()

    # Network label per row: starts at an 'In Network' / 'Out of Network'
    # header row and stops at the first service row. A new header always
    # restarts the label, so one section cannot run into the next.
    networks = ('In Network', 'Out of Network')
    services = ('Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care')
    network_types = []
    current = ''
    for field in result_df_filled['Field']:
        if field in services:
            current = ''
        elif field in networks:
            current = field
        network_types.append(current)
    result_df_filled.insert(0, 'Network Type', network_types)

    # Remove rows where a value is repeated within the row; this is what
    # removes the header rows ('In Network' appears in two columns).
    has_repeat = pd.Series(
        [len(set(row.dropna())) != len(row.dropna()) for _, row in result_df_filled.iterrows()],
        index=result_df_filled.index,
        dtype=bool,
    )
    result_df_filled = result_df_filled.loc[~has_repeat].copy()

    # Merge the network label into the field name, then drop the field column
    result_df_filled['Network Type'] = result_df_filled['Network Type'] + ' ' + result_df_filled['Field']
    result_df_filled = result_df_filled.drop(columns=['Field'])

    # Any value containing '%' is reported as coinsurance
    is_percentage = result_df_filled['Value'].map(lambda value: '%' in str(value)).astype(bool)
    result_df_filled.loc[is_percentage, 'Network Type'] = 'Coinsurance'

    # De-duplicated coinsurance rows first, everything else after
    coinsurance = result_df_filled['Network Type'] == 'Coinsurance'
    result_df = pd.concat([result_df_filled[coinsurance].drop_duplicates(), result_df_filled[~coinsurance]])
    result_df = result_df.reset_index(drop=True)

    # Two leading rows in a separate 'Name' column identify the sheet
    sheet_name_row = pd.DataFrame({"Name": ["Sheet Name", sheet_name]})
    processed_df = pd.concat([sheet_name_row, result_df], ignore_index=True)

    return processed_df

# Example usage: process every workbook found in the folder below.
process_folder(folder_path=r'C:\Users\shres\Downloads')  # Replace with your folder path


In [None]:
import os
import pandas as pd
import openpyxl

def get_sheet_names(file_path):
    """Return the worksheet names of the workbook at ``file_path``.

    The workbook is opened with openpyxl in read-only mode and is closed again
    before this function returns. A ``KeyError`` raised while opening the
    workbook or reading its sheet names is reported on stdout and turned into
    an empty list.

    Parameters
    ----------
    file_path : str
        Path of the workbook to inspect.

    Returns
    -------
    list
        The value of the workbook's ``sheetnames`` attribute, or ``[]`` after
        a ``KeyError``.
    """
    try:
        wb = openpyxl.load_workbook(file_path, read_only=True)
        try:
            sheet_names = wb.sheetnames
        finally:
            # Close even when reading the names fails, so the file handle held
            # by the read-only workbook is not leaked.
            wb.close()
        return sheet_names
    except KeyError:
        print(f"KeyError occurred while accessing sheet names in the file: {file_path}")
        return []


def process_folder(folder_path):
    """Run ``process_excel`` on every sheet of every workbook in ``folder_path`` and print each result.

    Only entries directly inside the folder are considered (no recursion).
    Files are visited in sorted name order. ``KeyError`` is handled at two
    levels: a workbook whose sheet names cannot be read is skipped as a whole,
    and a sheet that cannot be processed is skipped on its own so the
    remaining sheets of the same workbook are still processed.

    Parameters
    ----------
    folder_path : str
        Directory to scan for ``.xlsx`` files.
    """
    for file_name in sorted(os.listdir(folder_path)):
        # Files starting with '~$' are the lock files Excel leaves next to an
        # open workbook; they end in .xlsx but are not readable workbooks.
        if file_name.startswith('~$'):
            continue
        # Accept '.XLSX' as well as '.xlsx'.
        if not file_name.lower().endswith('.xlsx'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            sheet_names = get_sheet_names(file_path)
        except KeyError:
            print(f"Ignoring file: {file_path} - KeyError occurred.")
            continue
        for sheet_name in sheet_names:
            # Handle the error per sheet: previously the first unusable sheet
            # (e.g. a cover page) discarded every sheet after it.
            try:
                result_df = process_excel(file_path, sheet_name)
            except KeyError:
                print(f"Ignoring sheet {sheet_name!r} of file: {file_path} - KeyError occurred.")
                continue
            print(result_df)

def process_excel(file_path, sheet_name):
    """Turn the benefit rows of one worksheet into a label/value table tagged with the sheet name.

    The sheet is reduced to the rows whose first column is one of a fixed set
    of benefit labels. In-network and out-of-network values are stacked into
    one ``'Value'`` column, and each row gets a label in ``'Network Type'``:

    * ``'<network> <field>'`` for deductible / out-of-pocket rows
      (``'In Network Individual Deductible'`` ...),
    * ``' <field>'`` (leading space, no network) for the service rows,
    * ``'Coinsurance'`` for any row whose value contains ``'%'``
      (identical coinsurance rows are collapsed and listed first).

    Parameters
    ----------
    file_path : str
        Path of the workbook to read.
    sheet_name : str
        Sheet to read; passed to ``pd.read_excel`` as ``sheet_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Name'``, ``'Network Type'`` and ``'Value'``. The first two
        rows hold ``'Sheet Name'`` and ``sheet_name`` in ``'Name'`` (other
        columns empty); the benefit rows follow with ``'Name'`` empty.

    Raises
    ------
    KeyError
        If the filtered sheet ends up without a ``'Field'`` or ``'Value'``
        column.
    """
    # Global display option, kept because the caller prints whole frames.
    pd.set_option('display.max_columns', None)
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    # Drop columns with more than 94% null values
    null_percentage = df.isnull().mean() * 100
    df = df.drop(columns=null_percentage[null_percentage > 94].index)

    # Keep only the rows whose first column is one of these labels
    phrases_to_keep = ['Annual Medical Deductible', 'Individual', 'Family', 'Annual Out-of-Pocket Limit', 'Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care']
    df = df[df.iloc[:, 0].isin(phrases_to_keep)]

    # Drop columns that are empty for the kept rows
    df = df.dropna(axis=1, how='all')

    # Name the first (up to) three surviving columns by position. A single
    # rename call avoids renaming a freshly named column a second time.
    df = df.rename(columns=dict(zip(df.columns, ['Field', 'In Network', 'Out of Network'])))

    # Stack the two value columns under one header row each
    if 'Out of Network' not in df.columns:
        result_df = df.copy()
    else:
        in_part = pd.concat([pd.DataFrame({'Field': ['In Network']}), df[['Field', 'In Network']]])
        out_part = pd.concat([pd.DataFrame({'Field': ['Out of Network']}), df[['Field', 'Out of Network']]])
        result_df = pd.concat([in_part, out_part], ignore_index=True)

    # Fill each row right-to-left so out-of-network values land in
    # 'In Network'. bfill(axis=1) is the direct spelling of the former
    # reverse / fillna(method='ffill') / reverse, which is deprecated.
    result_df_filled = result_df.bfill(axis=1)
    # Drop the now redundant value column. Dropping 'Field' here (as before)
    # made every later row['Field'] lookup raise KeyError, which the caller
    # reported as "Ignoring file" for every workbook.
    result_df_filled = result_df_filled.drop(columns=['Out of Network'], errors='ignore')
    result_df_filled = result_df_filled.rename(columns={'In Network': 'Value'})

    # Qualify the 'Individual' / 'Family' rows that follow a section heading.
    # Work on positions and check the label, so a missing row or a
    # non-contiguous index cannot relabel the wrong row or append new rows.
    section_suffix = {'Annual Medical Deductible': 'Deductible', 'Annual Out-of-Pocket Limit': 'OOP'}
    fields = result_df_filled['Field'].tolist()
    for pos, heading in enumerate(list(fields)):
        suffix = section_suffix.get(heading)
        if suffix is None:
            continue
        for nxt in range(pos + 1, min(pos + 3, len(fields))):
            if fields[nxt] in ('Individual', 'Family'):
                fields[nxt] = f'{fields[nxt]} {suffix}'
    result_df_filled['Field'] = fields

    # The headings themselves carry no value
    result_df_filled = result_df_filled[~result_df_filled['Field'].isin(list(section_suffix))].copy()

    # Network label per row: starts at an 'In Network' / 'Out of Network'
    # header row and stops at the first service row. A new header always
    # restarts the label, so one section cannot run into the next.
    networks = ('In Network', 'Out of Network')
    services = ('Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care')
    network_types = []
    current = ''
    for field in result_df_filled['Field']:
        if field in services:
            current = ''
        elif field in networks:
            current = field
        network_types.append(current)
    result_df_filled.insert(0, 'Network Type', network_types)

    # Remove rows where a value is repeated within the row; this is what
    # removes the header rows ('In Network' appears in two columns).
    has_repeat = pd.Series(
        [len(set(row.dropna())) != len(row.dropna()) for _, row in result_df_filled.iterrows()],
        index=result_df_filled.index,
        dtype=bool,
    )
    result_df_filled = result_df_filled.loc[~has_repeat].copy()

    # Merge the network label into the field name, then drop the field column
    result_df_filled['Network Type'] = result_df_filled['Network Type'] + ' ' + result_df_filled['Field']
    result_df_filled = result_df_filled.drop(columns=['Field'])

    # Any value containing '%' is reported as coinsurance
    is_percentage = result_df_filled['Value'].map(lambda value: '%' in str(value)).astype(bool)
    result_df_filled.loc[is_percentage, 'Network Type'] = 'Coinsurance'

    # De-duplicated coinsurance rows first, everything else after
    coinsurance = result_df_filled['Network Type'] == 'Coinsurance'
    result_df = pd.concat([result_df_filled[coinsurance].drop_duplicates(), result_df_filled[~coinsurance]])
    result_df = result_df.reset_index(drop=True)

    # Two leading rows in a separate 'Name' column identify the sheet
    sheet_name_row = pd.DataFrame({"Name": ["Sheet Name", sheet_name]})
    processed_df = pd.concat([sheet_name_row, result_df], ignore_index=True)

    return processed_df

# Example usage: process every workbook found in the folder below.
process_folder(folder_path=r'C:\Users\shres\Downloads')  # Replace with your folder path


In [None]:
import os
import pandas as pd
import openpyxl

def get_workbook_name(file_path):
    """Return the last component of ``file_path`` (the file name, extension included)."""
    _, workbook_name = os.path.split(file_path)
    return workbook_name

def process_folder(folder_path):
    """Run ``process_excel`` on every Excel workbook in ``folder_path`` and print each result.

    Only entries directly inside the folder are considered (no recursion).
    Files are visited in sorted name order so repeated runs produce the same
    output order. A workbook for which ``process_excel`` raises ``KeyError``
    is reported on stdout and skipped.

    Parameters
    ----------
    folder_path : str
        Directory to scan for ``.xlsx`` files.
    """
    for file_name in sorted(os.listdir(folder_path)):
        # Files starting with '~$' are the lock files Excel leaves next to an
        # open workbook; they end in .xlsx but are not readable workbooks.
        if file_name.startswith('~$'):
            continue
        # Accept '.XLSX' as well as '.xlsx'.
        if not file_name.lower().endswith('.xlsx'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            result_df = process_excel(file_path)
        except KeyError:
            print(f"Ignoring file: {file_path} - KeyError occurred.")
            continue
        print(result_df)

def process_excel(file_path):
    """Turn the benefit rows of an Excel sheet into a label/value table headed by the workbook name.

    The sheet that ``pd.read_excel(file_path)`` returns by default is reduced
    to the rows whose first column is one of a fixed set of benefit labels.
    In-network and out-of-network values are stacked into one ``'Value'``
    column, and each row gets a label in ``'Network Type'``:

    * ``'<network> <field>'`` for deductible / out-of-pocket rows
      (``'In Network Individual Deductible'`` ...),
    * ``' <field>'`` (leading space, no network) for the service rows,
    * ``'Coinsurance'`` for any row whose value contains ``'%'``
      (identical coinsurance rows are collapsed and listed first).

    Parameters
    ----------
    file_path : str
        Path of the workbook to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Network Type'`` and ``'Value'``, index ``0..n-1``. Row 0 is
        ``('Name', <result of get_workbook_name(file_path)>)``; the benefit
        rows follow.

    Raises
    ------
    KeyError
        If the filtered sheet ends up without a ``'Field'`` or ``'Value'``
        column.
    """
    workbook_name = get_workbook_name(file_path)
    # Global display option, kept because the caller prints whole frames.
    pd.set_option('display.max_columns', None)
    df = pd.read_excel(file_path)

    # Drop columns with more than 94% null values
    null_percentage = df.isnull().mean() * 100
    df = df.drop(columns=null_percentage[null_percentage > 94].index)

    # Keep only the rows whose first column is one of these labels
    phrases_to_keep = ['Annual Medical Deductible', 'Individual', 'Family', 'Annual Out-of-Pocket Limit', 'Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care']
    df = df[df.iloc[:, 0].isin(phrases_to_keep)]

    # Drop columns that are empty for the kept rows
    df = df.dropna(axis=1, how='all')

    # Name the first (up to) three surviving columns by position. A single
    # rename call avoids renaming a freshly named column a second time.
    df = df.rename(columns=dict(zip(df.columns, ['Field', 'In Network', 'Out of Network'])))

    # Stack the two value columns under one header row each. Both headers go
    # into 'Field'; the former 'Fields' helper column is not needed because
    # only 'Out of Network' is dropped below.
    if 'Out of Network' not in df.columns:
        result_df = df.copy()
    else:
        in_part = pd.concat([pd.DataFrame({'Field': ['In Network']}), df[['Field', 'In Network']]])
        out_part = pd.concat([pd.DataFrame({'Field': ['Out of Network']}), df[['Field', 'Out of Network']]])
        result_df = pd.concat([in_part, out_part], ignore_index=True)

    # Fill each row right-to-left so out-of-network values land in
    # 'In Network'. bfill(axis=1) is the direct spelling of the former
    # reverse / fillna(method='ffill') / reverse, which is deprecated.
    result_df_filled = result_df.bfill(axis=1)
    result_df_filled = result_df_filled.drop(columns=['Out of Network'], errors='ignore')
    result_df_filled = result_df_filled.rename(columns={'In Network': 'Value'})

    # Qualify the 'Individual' / 'Family' rows that follow a section heading.
    # Work on positions and check the label, so a missing row or a
    # non-contiguous index cannot relabel the wrong row or append new rows.
    section_suffix = {'Annual Medical Deductible': 'Deductible', 'Annual Out-of-Pocket Limit': 'OOP'}
    fields = result_df_filled['Field'].tolist()
    for pos, heading in enumerate(list(fields)):
        suffix = section_suffix.get(heading)
        if suffix is None:
            continue
        for nxt in range(pos + 1, min(pos + 3, len(fields))):
            if fields[nxt] in ('Individual', 'Family'):
                fields[nxt] = f'{fields[nxt]} {suffix}'
    result_df_filled['Field'] = fields

    # The headings themselves carry no value
    result_df_filled = result_df_filled[~result_df_filled['Field'].isin(list(section_suffix))].copy()

    # Network label per row: starts at an 'In Network' / 'Out of Network'
    # header row and stops at the first service row. A new header always
    # restarts the label, so one section cannot run into the next.
    networks = ('In Network', 'Out of Network')
    services = ('Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care')
    network_types = []
    current = ''
    for field in result_df_filled['Field']:
        if field in services:
            current = ''
        elif field in networks:
            current = field
        network_types.append(current)
    result_df_filled.insert(0, 'Network Type', network_types)

    # Remove rows where a value is repeated within the row; this is what
    # removes the header rows ('In Network' appears in two columns).
    has_repeat = pd.Series(
        [len(set(row.dropna())) != len(row.dropna()) for _, row in result_df_filled.iterrows()],
        index=result_df_filled.index,
        dtype=bool,
    )
    result_df_filled = result_df_filled.loc[~has_repeat].copy()

    # Merge the network label into the field name, then drop the field column
    result_df_filled['Network Type'] = result_df_filled['Network Type'] + ' ' + result_df_filled['Field']
    result_df_filled = result_df_filled.drop(columns=['Field'])

    # Any value containing '%' is reported as coinsurance
    is_percentage = result_df_filled['Value'].map(lambda value: '%' in str(value)).astype(bool)
    result_df_filled.loc[is_percentage, 'Network Type'] = 'Coinsurance'

    # De-duplicated coinsurance rows first, everything else after
    coinsurance = result_df_filled['Network Type'] == 'Coinsurance'
    result_df = pd.concat([result_df_filled[coinsurance].drop_duplicates(), result_df_filled[~coinsurance]])
    result_df = result_df.reset_index(drop=True)

    # Put a ('Name', workbook name) row in front of the benefit rows
    name_row = pd.DataFrame([['Name', workbook_name]], columns=result_df.columns)
    result_df = pd.concat([name_row, result_df], ignore_index=True)

    return result_df

# Example usage: process every workbook found in the folder below.
process_folder(folder_path=r'C:\Users\shres\Downloads')  # Replace with your folder path


In [None]:
import os
import pandas as pd
import re

def get_workbook_names(file_path):
    """Split a workbook's file name into a display name and an MNS identifier.

    The file name without its extension is searched for ``'MNS'``. Everything
    from the last ``'MNS'`` to the end is taken as the identifier and removed
    from the name; both parts are stripped of surrounding whitespace.

    Parameters
    ----------
    file_path : str
        Path (or bare file name) of the workbook.

    Returns
    -------
    tuple of (str, str)
        ``(workbook_name, mns_id)``; ``mns_id`` is ``''`` when the file name
        contains no ``'MNS'``.
    """
    # splitext removes only the final extension. The former split('.')[0] cut
    # the name at its first dot, so 'Acme Corp. MNS123.xlsx' lost its MNS id.
    workbook_name = os.path.splitext(os.path.basename(file_path))[0]
    # The leading '.*' is greedy, so group 1 starts at the LAST 'MNS'.
    mns_match = re.match(r'.*(MNS.*)', workbook_name)
    if not mns_match:
        return workbook_name.strip(), ''
    mns_id = mns_match.group(1)
    workbook_name = workbook_name.replace(mns_id, '')
    return workbook_name.strip(), mns_id.strip()

def process_folder(folder_path):
    """Run ``process_excel`` on every Excel workbook in ``folder_path`` and print each result.

    Only entries directly inside the folder are considered (no recursion).
    Files are visited in sorted name order so repeated runs produce the same
    output order. A workbook for which ``process_excel`` raises ``KeyError``
    is reported on stdout and skipped.

    Parameters
    ----------
    folder_path : str
        Directory to scan for ``.xlsx`` files.
    """
    for file_name in sorted(os.listdir(folder_path)):
        # Files starting with '~$' are the lock files Excel leaves next to an
        # open workbook; they end in .xlsx but are not readable workbooks.
        if file_name.startswith('~$'):
            continue
        # Accept '.XLSX' as well as '.xlsx'.
        if not file_name.lower().endswith('.xlsx'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            result_df = process_excel(file_path)
        except KeyError:
            print(f"Ignoring file: {file_path} - KeyError occurred.")
            continue
        print(result_df)

def process_excel(file_path):
    """Turn the benefit rows of an Excel sheet into a label/value table headed by workbook identifiers.

    The sheet that ``pd.read_excel(file_path)`` returns by default is reduced
    to the rows whose first column is one of a fixed set of benefit labels.
    In-network and out-of-network values are stacked into one ``'Value'``
    column, and each row gets a label in ``'Network Type'``:

    * ``'<network> <field>'`` for deductible / out-of-pocket rows
      (``'In Network Individual Deductible'`` ...),
    * ``' <field>'`` (leading space, no network) for the service rows,
    * ``'Coinsurance'`` for any row whose value contains ``'%'``
      (identical coinsurance rows are collapsed and listed first).

    Parameters
    ----------
    file_path : str
        Path of the workbook to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Network Type'`` and ``'Value'``, index ``0..n-1``. Row 0 is
        ``('MNS ID', mns_id)`` and row 1 is ``('Name', workbook_name)``, both
        taken from ``get_workbook_names(file_path)``; the benefit rows follow.

    Raises
    ------
    KeyError
        If the filtered sheet ends up without a ``'Field'`` or ``'Value'``
        column.
    """
    workbook_name, mns_id = get_workbook_names(file_path)
    # Global display option, kept because the caller prints whole frames.
    pd.set_option('display.max_columns', None)
    df = pd.read_excel(file_path)

    # Drop columns with more than 94% null values
    null_percentage = df.isnull().mean() * 100
    df = df.drop(columns=null_percentage[null_percentage > 94].index)

    # Keep only the rows whose first column is one of these labels
    phrases_to_keep = ['Annual Medical Deductible', 'Individual', 'Family', 'Annual Out-of-Pocket Limit', 'Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care']
    df = df[df.iloc[:, 0].isin(phrases_to_keep)]

    # Drop columns that are empty for the kept rows
    df = df.dropna(axis=1, how='all')

    # Name the first (up to) three surviving columns by position. A single
    # rename call avoids renaming a freshly named column a second time.
    df = df.rename(columns=dict(zip(df.columns, ['Field', 'In Network', 'Out of Network'])))

    # Stack the two value columns under one header row each. Both headers go
    # into 'Field'; the former 'Fields' helper column is not needed because
    # only 'Out of Network' is dropped below.
    if 'Out of Network' not in df.columns:
        result_df = df.copy()
    else:
        in_part = pd.concat([pd.DataFrame({'Field': ['In Network']}), df[['Field', 'In Network']]])
        out_part = pd.concat([pd.DataFrame({'Field': ['Out of Network']}), df[['Field', 'Out of Network']]])
        result_df = pd.concat([in_part, out_part], ignore_index=True)

    # Fill each row right-to-left so out-of-network values land in
    # 'In Network'. bfill(axis=1) is the direct spelling of the former
    # reverse / fillna(method='ffill') / reverse, which is deprecated.
    result_df_filled = result_df.bfill(axis=1)
    result_df_filled = result_df_filled.drop(columns=['Out of Network'], errors='ignore')
    result_df_filled = result_df_filled.rename(columns={'In Network': 'Value'})

    # Qualify the 'Individual' / 'Family' rows that follow a section heading.
    # Work on positions and check the label, so a missing row or a
    # non-contiguous index cannot relabel the wrong row or append new rows.
    section_suffix = {'Annual Medical Deductible': 'Deductible', 'Annual Out-of-Pocket Limit': 'OOP'}
    fields = result_df_filled['Field'].tolist()
    for pos, heading in enumerate(list(fields)):
        suffix = section_suffix.get(heading)
        if suffix is None:
            continue
        for nxt in range(pos + 1, min(pos + 3, len(fields))):
            if fields[nxt] in ('Individual', 'Family'):
                fields[nxt] = f'{fields[nxt]} {suffix}'
    result_df_filled['Field'] = fields

    # The headings themselves carry no value
    result_df_filled = result_df_filled[~result_df_filled['Field'].isin(list(section_suffix))].copy()

    # Network label per row: starts at an 'In Network' / 'Out of Network'
    # header row and stops at the first service row. A new header always
    # restarts the label, so one section cannot run into the next.
    networks = ('In Network', 'Out of Network')
    services = ('Primary Care Physician', 'Specialist', 'Urgent Care Center Services', 'Emergency Care')
    network_types = []
    current = ''
    for field in result_df_filled['Field']:
        if field in services:
            current = ''
        elif field in networks:
            current = field
        network_types.append(current)
    result_df_filled.insert(0, 'Network Type', network_types)

    # Remove rows where a value is repeated within the row; this is what
    # removes the header rows ('In Network' appears in two columns).
    has_repeat = pd.Series(
        [len(set(row.dropna())) != len(row.dropna()) for _, row in result_df_filled.iterrows()],
        index=result_df_filled.index,
        dtype=bool,
    )
    result_df_filled = result_df_filled.loc[~has_repeat].copy()

    # Merge the network label into the field name, then drop the field column
    result_df_filled['Network Type'] = result_df_filled['Network Type'] + ' ' + result_df_filled['Field']
    result_df_filled = result_df_filled.drop(columns=['Field'])

    # Any value containing '%' is reported as coinsurance
    is_percentage = result_df_filled['Value'].map(lambda value: '%' in str(value)).astype(bool)
    result_df_filled.loc[is_percentage, 'Network Type'] = 'Coinsurance'

    # De-duplicated coinsurance rows first, everything else after
    coinsurance = result_df_filled['Network Type'] == 'Coinsurance'
    result_df = pd.concat([result_df_filled[coinsurance].drop_duplicates(), result_df_filled[~coinsurance]])
    result_df = result_df.reset_index(drop=True)

    # Identification rows in front of the benefit rows. 'MNS ID' comes before
    # 'Name', the order the earlier -2 / -1 index trick produced.
    id_rows = pd.DataFrame([['MNS ID', mns_id], ['Name', workbook_name]], columns=result_df.columns)
    result_df = pd.concat([id_rows, result_df], ignore_index=True)

    return result_df

# Example usage: process every workbook found in the folder below.
process_folder(folder_path=r'C:\Users\shres\Downloads')  # Replace with your folder path
