Data gathering

In [None]:
import io
import os
from typing import Optional

import msoffcrypto
import pandas as pd

def load_encrypted_excel(file_path: str, password: Optional[str] = None) -> pd.DataFrame:
    """Load an Excel workbook into a DataFrame, decrypting it in memory when required.

    Args:
        file_path: Path of the workbook to read.
        password: Password protecting the workbook. ``None`` or an empty string
            means the workbook is read directly, without any decryption step.

    Returns:
        Whatever ``pd.read_excel`` produces for the workbook with its default
        arguments.

    Raises:
        Any exception raised by ``open``, ``msoffcrypto`` (e.g. wrong password,
        unsupported file format) or ``pd.read_excel`` is propagated unchanged.
    """
    if not password:
        # No password configured: load the unencrypted file directly.
        return pd.read_excel(file_path)

    decrypted = io.BytesIO()
    with open(file_path, 'rb') as f:
        office_file = msoffcrypto.OfficeFile(f)
        # A password may be configured for a workbook that is not (or no longer)
        # protected; attempting to decrypt such a file fails, so check first.
        encrypted = office_file.is_encrypted()
        if encrypted:
            office_file.load_key(password=password)
            office_file.decrypt(decrypted)

    # The source handle is closed at this point; parsing only needs the
    # in-memory plaintext (or the path, for an unprotected workbook).
    if not encrypted:
        return pd.read_excel(file_path)

    decrypted.seek(0)  # rewind so pandas reads the buffer from its first byte
    return pd.read_excel(decrypted)

# File configurations
#
# Workbook passwords are credentials and must not be stored in the notebook.
# Each password is read from an environment variable named
# EXCEL_PASSWORD_<DATASET NAME IN UPPER CASE>, e.g. EXCEL_PASSWORD_CLIENT or
# EXCEL_PASSWORD_EMFC2ASSETS. A missing or empty variable resolves to None,
# which the loader treats as "no password".
# NOTE(review): passwords that were previously embedded here should be
# considered exposed and rotated.
PASSWORD_ENV_PREFIX = "EXCEL_PASSWORD_"


def password_from_env(dataset_name: str) -> Optional[str]:
    """Return the password configured in the environment for a dataset, or None if unset/empty."""
    return os.environ.get(PASSWORD_ENV_PREFIX + dataset_name.upper()) or None


files = [
    # Core Client & FNA Process Tables
    {"name": "client",                      "path": "client.xlsx",                      "password": password_from_env("client")},
    {"name": "provider",                    "path": "provider.xlsx",                    "password": password_from_env("provider")},
    {"name": "emfc2fna",                    "path": "emfc2fna.xlsx",                    "password": password_from_env("emfc2fna")},
    {"name": "emfc2personalinformation",    "path": "emfc2personalinformation.xlsx",    "password": password_from_env("emfc2personalinformation")},
    {"name": "emfc2",                       "path": "emfc2.xlsx",                       "password": password_from_env("emfc2")},
    {"name": "EMFC2Assets",                 "path": "EMFC2Assets.xlsx",                 "password": password_from_env("EMFC2Assets")},
    {"name": "emfc2portofolioinsurance",    "path": "emfc2portofolioinsurance.xlsx",    "password": password_from_env("emfc2portofolioinsurance")},

    # Product & Solution Workflow
    {"name": "emfc2productsolution",        "path": "emfc2productsolution.xlsx",        "password": password_from_env("emfc2productsolution")},

    # Product & Category Lookup Tables
    {"name": "ProductMainPlan",             "path": "ProductMainPlan.xlsx",             "password": password_from_env("ProductMainPlan")},

    # This workbook is not password protected.
    {"name": "ProductMainPlan_Labeled",     "path": "ProductMainPlan_Labeled.xlsx",     "password": None}
]

# Read every configured workbook and keep the resulting frames keyed by dataset name.
datasets = {}

print("=== LOADING ALL DATASETS ===")
for file in files:
    print(f"{file['name']}...", end=" ")
    try:
        loaded = load_encrypted_excel(file["path"], file["password"])
    except Exception as e:
        # Best effort: report the failure and continue with the remaining files.
        print(f"✗ Error: {e}")
    else:
        datasets[file['name']] = loaded
        shape = loaded.shape
        print(f"({shape[0]:,} rows, {shape[1]} columns)")

# Only workbooks that loaded without error are present in `datasets`.
print(f"\nSuccessfully loaded {len(datasets)} datasets")
print("Available datasets:", list(datasets))


=== LOADING ALL DATASETS ===
client... (45,688 rows, 49 columns)
provider... (128 rows, 21 columns)
emfc2fna... (51,772 rows, 31 columns)
emfc2personalinformation... (52,305 rows, 37 columns)
emfc2... (51,769 rows, 8 columns)
EMFC2Assets... (50,500 rows, 39 columns)
emfc2portofolioinsurance... (27,437 rows, 25 columns)
emfc2productsolution... (43,501 rows, 25 columns)
ProductMainPlan... (1,532 rows, 22 columns)
ProductMainPlan_Labeled... ✗ Error: No key specified

Successfully loaded 9 datasets
Available datasets: ['client', 'provider', 'emfc2fna', 'emfc2personalinformation', 'emfc2', 'EMFC2Assets', 'emfc2portofolioinsurance', 'emfc2productsolution', 'ProductMainPlan']


List the column headers and dtypes of each dataset

In [7]:
# For every loaded dataset, print its name, its column count, and one line per
# column giving the column label and dtype, followed by a separator rule.
print("=== COLUMN HEADERS FOR EACH DATASET ===")
for name, df in datasets.items():
    report = [
        f"📄 Dataset: {name}",
        f"🧾 Columns ({len(df.columns)}):",
    ]
    report.extend(f"  - {col} ({df[col].dtype})" for col in df.columns)
    report.append("-" * 20)
    print("\n".join(report))


=== COLUMN HEADERS FOR EACH DATASET ===
📄 Dataset: client
🧾 Columns (49):
  - # (int64)
  - ClientId (object)
  - ClientName (object)
  - ClientMobileNumber (object)
  - ClientMNVerified (bool)
  - ClientMNVeriCode (float64)
  - ClientMNVeriCodeTime (datetime64[ns])
  - ClientEmail (object)
  - ClientContactPreferences (object)
  - ClientGender (object)
  - ClientDOB (datetime64[ns])
  - ClientCPFContributionCategoryId (object)
  - IDNumber (object)
  - Nationality (object)
  - SpokenLanguage (object)
  - WrittenLanguage (object)
  - Education (object)
  - EmploymentStatus (object)
  - Occupation (object)
  - MaritalStatus (object)
  - PrimaryAddress (object)
  - CorrespondingAddress (object)
  - IncomeRange (object)
  - AccompaniedbyTrustedIndividual (float64)
  - ClientInvitedDate (datetime64[ns])
  - ClientStatus (object)
  - RiskProfile (object)
  - RiskProfileSubmissionDate (datetime64[ns])
  - CKAProfile (object)
  - CARProfile (object)
  - CKACARSubmissionDate (datetime64[ns])
 