# Trial Balance Automation - MVP

**Purpose**: Load, validate, and analyze trial balance data

**Author**: Raiden Velarde Guillergan - Data Scientist 

**Date**: November 4, 2025

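**Data Source**: `data/raw/Trial Balance/<year>/<month>/` — the loader selects the latest year and month folder automatically (e.g. `data/raw/Trial Balance/2025/September/`)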

## Workflow Diagram

```mermaid
flowchart TD
    Start([Start]) --> Init[1. Initialize<br/>Libraries & Logger]
    Init --> LoadFunc[2-3. Define<br/>Loading Functions]
    LoadFunc --> Load[4. Load Data<br/>TB + References]
    Load --> Separate[5. Separate Data]
    Separate --> AddDate[6. Add Date Column]
    AddDate --> Consolidate[7. Consolidate TB]
    Consolidate --> Pivot[8. Create Pivot Table]
    Pivot --> Match[9. Match GL Accounts]
    Match --> CheckNew{New Accounts?}
    CheckNew -->|Yes| Export[Export Updated COA]
    CheckNew -->|No| Done
    Export --> Done([End])
    
    style Start fill:#e1f5e1
    style Done fill:#ffe1e1
    style Pivot fill:#f0e1ff
    style Export fill:#e1f0ff
```

**Note**: Install `Markdown Preview Mermaid Support` extension to view diagrams.  
**Full Documentation**: See `docs/workflow-diagram.md`

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by pandas are hidden too — confirm that is intended.
warnings.filterwarnings(action='ignore')

# Notebook display settings: show all columns, cap rows at 100, two-decimal floats
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

print("‚úì Libraries imported successfully")

In [None]:
# Setup logging configuration
import logging

# Create logs directory if it doesn't exist
log_dir = Path('../logs')
log_dir.mkdir(parents=True, exist_ok=True)

# Create log filename with timestamp (one file per execution of this cell)
log_filename = f"trial_balance_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
log_path = log_dir / log_filename

# logging.basicConfig() does nothing when the root logger already has handlers,
# which is the case whenever this cell is re-run in the same kernel. Without the
# cleanup below, a re-run would announce a new log file while still writing to
# the previous one. Detach and close the stale handlers first.
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII log messages do not depend on the platform default encoding
        logging.FileHandler(log_path, encoding='utf-8'),
        logging.StreamHandler()  # Also print to console
    ]
)

logger = logging.getLogger(__name__)

logger.info("="*60)
logger.info("TRIAL BALANCE AUTOMATION - LOGGING INITIALIZED")
logger.info("="*60)
logger.info(f"Log file: {log_path}")
logger.info(f"Session started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
logger.info(f"Working directory: {Path.cwd()}")

print(f"\n‚úì Logging configured successfully")
print(f"üìù Log file: {log_path}")

## 1. Setup and Configuration

## 2. Data Loading Function

## 3. Reference Data Loading Function

In [None]:
def load_reference_data(base_path='../data/references'):
    """
    Load reference data (COA Mapping and Portfolio Mapping) from the latest files.
    Supports CSV, XLSX and XLS files; extensions are matched case-insensitively.

    For each mapping folder, the file with the most recent modification time is
    loaded. Office lock files (names starting with ``~$``, created next to a
    workbook while it is open) are skipped: they carry a spreadsheet extension
    and the newest modification time, but are not readable workbooks.

    A missing folder, or a folder without supported files, only prints a
    warning; the corresponding entry in the result stays ``None``.

    Args:
        base_path: Folder containing the 'COA Mapping' and 'Portfolio Mapping'
            sub-folders.

    Returns:
        dict: Dictionary containing:
            - 'coa_mapping': DataFrame from COA Mapping folder (latest file) or None
            - 'portfolio_mapping': DataFrame from Portfolio Mapping folder (latest file) or None
            - 'metadata': dict with 'load_timestamp', 'coa_mapping_file' and
              'portfolio_mapping_file' (file names, or None when nothing was loaded)

    Raises:
        Any exception raised by pandas while parsing the selected file.
    """

    base_path = Path(base_path)
    supported_suffixes = ('.csv', '.xlsx', '.xls')

    # Initialize result dictionary
    result = {
        'coa_mapping': None,
        'portfolio_mapping': None,
        'metadata': {
            'load_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'coa_mapping_file': None,
            'portfolio_mapping_file': None
        }
    }

    # Helper function to load file (CSV or XLSX)
    def load_file(file_path):
        suffix = file_path.suffix.lower()
        if suffix == '.csv':
            return pd.read_csv(file_path)
        if suffix in ('.xlsx', '.xls'):
            return pd.read_excel(file_path)
        raise ValueError(f"Unsupported file format: {file_path.suffix}")

    # Helper function to list loadable files in a folder, most recently modified first
    def list_candidates(folder):
        candidates = [
            path for path in folder.iterdir()
            if path.is_file()
            and path.suffix.lower() in supported_suffixes
            and not path.name.startswith('~$')  # Office lock file, not a real workbook
        ]
        return sorted(candidates, key=lambda f: f.stat().st_mtime, reverse=True)

    # (result key, folder name, text printed before the "Loading" line)
    mapping_sources = (
        ('coa_mapping', 'COA Mapping', ''),
        ('portfolio_mapping', 'Portfolio Mapping', '\n'),
    )

    for result_key, folder_name, lead in mapping_sources:
        folder = base_path / folder_name

        if not folder.is_dir():
            print(f"‚ö†Ô∏è  WARNING: {folder_name} folder not found: {folder}")
            continue

        print(f"{lead}üìÇ Loading {folder_name} from: {folder}")

        files = list_candidates(folder)
        if not files:
            print(f"  ‚ö†Ô∏è  WARNING: No CSV or XLSX files found in {folder}")
            continue

        latest_file = files[0]
        result[result_key] = load_file(latest_file)
        result['metadata'][f'{result_key}_file'] = latest_file.name

        print(f"  ‚úì Loaded latest file: {latest_file.name}")
        print(f"    Records: {len(result[result_key])}")

        if len(files) > 1:
            print(f"    Note: {len(files)} files found, loaded the most recent")

    return result

In [None]:
# English month names, in calendar order, used to rank month folders chronologically
_MONTH_NAMES = (
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
)


def _year_sort_key(folder):
    """Rank a year folder: numeric names order by value and outrank non-numeric names."""
    name = folder.name.strip()
    if name.isdecimal():
        return (1, int(name), folder.name)
    return (0, 0, folder.name)


def _month_sort_key(folder):
    """Rank a month folder by calendar position; unrecognized names rank lowest, ordered by name."""
    name = folder.name.strip().casefold()
    # Full names and unambiguous prefixes of three or more letters ("sep", "sept")
    if len(name) >= 3:
        for number, month in enumerate(_MONTH_NAMES, start=1):
            if month.startswith(name):
                return (number, folder.name)
    # Plain month numbers ("9", "09")
    if name.isdecimal() and 1 <= int(name) <= 12:
        return (int(name), folder.name)
    return (0, folder.name)


def load_trial_balance_data(base_path='../data/raw/Trial Balance'):
    """
    Load trial balance data dynamically based on the latest year and month folders.

    The latest year is the numerically largest year-named folder. The latest
    month is chosen by calendar order (month names, three-letter-or-longer
    prefixes, or month numbers), not alphabetically, so 'October' ranks after
    'September'. Folder names that are not recognized fall back to name order.

    Trial Balance files must be named MM-DD-YYYY.csv. Files that break the
    naming convention, or that pandas cannot parse, are skipped with a warning
    and listed in the metadata.

    Args:
        base_path: Folder that contains one sub-folder per year.

    Returns:
        dict: Dictionary containing:
            - 'trial_balance': dict of DataFrames keyed by 'YYYY-MM-DD' (from Trial Balance folder)
            - 'chart_of_accounts': DataFrame (from Chart of Accounts folder) or None
            - 'metadata': dict with loading information ('year', 'month',
              'load_timestamp', 'tb_files', 'coa_file', plus
              'non_compliant_files' / 'unreadable_files' when any were skipped)

    Raises:
        ValueError: If no year folder or no month folder is found.
    """

    base_path = Path(base_path)

    # Find the latest year folder
    year_folders = [f for f in base_path.iterdir() if f.is_dir()]
    if not year_folders:
        raise ValueError(f"No year folders found in {base_path}")

    latest_year = max(year_folders, key=_year_sort_key)
    print(f"üìÖ Latest year folder: {latest_year.name}")

    # Find the latest month folder (calendar order, not alphabetical)
    month_folders = [f for f in latest_year.iterdir() if f.is_dir()]
    if not month_folders:
        raise ValueError(f"No month folders found in {latest_year}")

    latest_month = max(month_folders, key=_month_sort_key)
    print(f"üìÖ Latest month folder: {latest_month.name}")

    # Initialize result dictionary
    result = {
        'trial_balance': {},
        'chart_of_accounts': None,
        'metadata': {
            'year': latest_year.name,
            'month': latest_month.name,
            'load_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'tb_files': [],  # List of loaded Trial Balance files
            'coa_file': None  # Chart of Accounts file
        }
    }

    # Define folder paths
    tb_folder = latest_month / 'Trial Balance'
    coa_folder = latest_month / 'Chart of Accounts'

    # ========== Load Trial Balance Files ==========
    if tb_folder.exists():
        print(f"\nüìÇ Loading Trial Balance files from: {tb_folder}")

        non_compliant_files = []
        unreadable_files = []

        # Sorted so the load order does not depend on directory listing order
        for file in sorted(tb_folder.glob('*.csv')):
            # Only the filename parsing is guarded here, so a CSV parsing
            # problem is never reported as a naming problem.
            try:
                file_date = datetime.strptime(file.stem, '%m-%d-%Y')
            except ValueError:
                non_compliant_files.append(file.name)
                print(f"  ‚ö†Ô∏è  WARNING: File does not follow naming convention (MM-DD-YYYY.csv): {file.name}")
                continue

            date_key = file_date.strftime('%Y-%m-%d')

            # pandas parser and empty-file errors are ValueError subclasses;
            # the file is still skipped, but with an accurate message.
            try:
                frame = pd.read_csv(file)
            except ValueError as err:
                unreadable_files.append(file.name)
                print(f"  ‚ö†Ô∏è  WARNING: Could not read {file.name}: {err}")
                continue

            result['trial_balance'][date_key] = frame

            # Store file info in metadata
            result['metadata']['tb_files'].append({
                'filename': file.name,
                'date': date_key,
                'records': len(frame)
            })

            print(f"  ‚úì Loaded: {file.name} -> {date_key} ({len(frame)} records)")

        # Store skipped files in metadata if any
        if non_compliant_files:
            result['metadata']['non_compliant_files'] = non_compliant_files
        if unreadable_files:
            result['metadata']['unreadable_files'] = unreadable_files

        print(f"\nüìä Total Trial Balance files loaded: {len(result['trial_balance'])}")

    else:
        print(f"‚ö†Ô∏è  WARNING: Trial Balance folder not found: {tb_folder}")

    # ========== Load Chart of Accounts ==========
    if coa_folder.exists():
        print(f"\nüìÇ Loading Chart of Accounts from: {coa_folder}")

        # Sorted so "the first file" is the same file on every run
        csv_files = sorted(coa_folder.glob('*.csv'))

        # Validate number of files
        if not csv_files:
            print(f"  ‚ö†Ô∏è  WARNING: No CSV files found in {coa_folder}")
        elif len(csv_files) > 1:
            print(f"  ‚ö†Ô∏è  WARNING: Multiple files found in Chart of Accounts folder!")
            print(f"              Expected only 1 file, found {len(csv_files)}:")
            for f in csv_files:
                print(f"              - {f.name}")
            print(f"              Loading the first file: {csv_files[0].name}")

        # Load first CSV file if available
        if csv_files:
            coa_file = csv_files[0]
            result['chart_of_accounts'] = pd.read_csv(coa_file)
            result['metadata']['coa_file'] = coa_file.name
            print(f"  ‚úì Loaded: {coa_file.name} ({len(result['chart_of_accounts'])} accounts)")
    else:
        print(f"‚ö†Ô∏è  WARNING: Chart of Accounts folder not found: {coa_folder}")

    return result

## 4. Load Data

In [None]:
# Load all data
# Calls the loader with its default base path; the returned dict has the keys
# 'trial_balance', 'chart_of_accounts' and 'metadata'.
data = load_trial_balance_data()

# The summary below is commented out, so this cell shows only what the loader itself prints.
# print("\n" + "="*60)
# print("üìã DATA LOADING SUMMARY")
# print("="*60)
# print(f"Year: {data['metadata']['year']}")
# print(f"Month: {data['metadata']['month']}")
# print(f"Load Time: {data['metadata']['load_timestamp']}")
# print(f"\nTrial Balance DataFrames: {len(data['trial_balance'])}")
# print(f"Chart of Accounts: {'Loaded' if data['chart_of_accounts'] is not None else 'Not Loaded'}")

# if 'non_compliant_files' in data['metadata']:
#     print(f"\n‚ö†Ô∏è  Non-compliant files: {len(data['metadata']['non_compliant_files'])}")
    
# print("\n" + "="*60)

In [None]:
# Load the reference mappings, then print a short report of what was found
reference_data = load_reference_data()

summary_rule = "=" * 60
reference_meta = reference_data['metadata']

print("\n" + summary_rule)
print("üìã REFERENCE DATA LOADING SUMMARY")
print(summary_rule)
print(f"Load Time: {reference_meta['load_timestamp']}")

# One status line per mapping, followed by its file name when one was loaded
for mapping_label, frame_key, file_key in (
    ('COA Mapping', 'coa_mapping', 'coa_mapping_file'),
    ('Portfolio Mapping', 'portfolio_mapping', 'portfolio_mapping_file'),
):
    load_state = 'Not Loaded' if reference_data[frame_key] is None else 'Loaded'
    print(f"\n{mapping_label}: {load_state}")
    if reference_meta[file_key]:
        print(f"  File: {reference_meta[file_key]}")

print("\n" + summary_rule)

## 5. Separate Data by Source

In [None]:
# Unpack the two loader results into one notebook-level variable per data source

# From the trial balance loader: DataFrames keyed by date, the Chart of Accounts, and metadata
trial_balance_data = data['trial_balance']
chart_of_accounts = data['chart_of_accounts']
metadata = data['metadata']

# From the reference loader: the two mapping DataFrames (None when not loaded)
coa_mapping, portfolio_mapping = (
    reference_data[mapping_key] for mapping_key in ('coa_mapping', 'portfolio_mapping')
)


def _row_count(frame):
    """Return the number of rows in frame, or 0 when the frame was not loaded."""
    return 0 if frame is None else len(frame)


print("‚úì Data separated successfully")
print(f"\nüìä Trial Balance: {len(trial_balance_data)} date(s)")
print(f"üìä Chart of Accounts: {_row_count(chart_of_accounts)} account(s)")
print(f"üìä COA Mapping: {_row_count(coa_mapping)} mapping(s)")
print(f"üìä Portfolio Mapping: {_row_count(portfolio_mapping)} mapping(s)")
print(f"üìä Metadata: {list(metadata)}")

## 6. Add Date Column to Trial Balance Data

In [None]:
# Add 'Date' column to each Trial Balance DataFrame
# Each DataFrame is modified in place. The value written is the dictionary key itself,
# i.e. the 'YYYY-MM-DD' string built by load_trial_balance_data, not a datetime object.
for date_key, df in trial_balance_data.items():
    df['Date'] = date_key  # scalar assignment: every row of this DataFrame gets the same date

print("‚úì Date column added to all Trial Balance DataFrames")
print(f"\nProcessed {len(trial_balance_data)} date(s)")

## 7. Consolidate Trial Balance Data

In [None]:
# Consolidate all Trial Balance DataFrames into a single DataFrame

# The loader only warns when the Trial Balance folder is missing or no file
# follows the naming convention, so the dictionary can be empty here. Stop with
# an actionable message instead of pandas' generic "No objects to concatenate".
if not trial_balance_data:
    raise ValueError(
        "No Trial Balance DataFrames were loaded - check the loader warnings "
        "(missing folder or non-compliant file names) before consolidating."
    )

# Concatenate in sorted date-key order so the row order does not depend on the
# order in which files were loaded.
trial_balance_consolidated = pd.concat(
    [trial_balance_data[key] for key in sorted(trial_balance_data)],
    ignore_index=True,
)

print("‚úì Trial Balance data consolidated")
print(f"\nTotal records: {len(trial_balance_consolidated):,}")
print(f"Date range: {trial_balance_consolidated['Date'].min()} to {trial_balance_consolidated['Date'].max()}")
print(f"Unique dates: {trial_balance_consolidated['Date'].nunique()}")
print(f"\nColumns: {trial_balance_consolidated.columns.tolist()}")

In [None]:
# len(trial_balance_consolidated['Date'].unique())

# Bare expression as the last line of the cell: the notebook renders the consolidated DataFrame
trial_balance_consolidated

## 8. Create Pivot Table

In [None]:
# Build the GL Account x Fund matrix of summed net amounts.
# NOTE(review): the consolidated frame contains every loaded date, so amounts are
# summed across dates as well — confirm that this is the intended balance.
trial_balance_pivot_table = pd.pivot_table(
    trial_balance_consolidated,
    index='accountname',           # one row per GL Account
    columns='level1accountname',   # one column per Fund Name
    values='netamt',               # aggregated amount
    aggfunc='sum',
    fill_value=0,                  # account/fund pairs without rows show 0
)

# Label both axes for readability
trial_balance_pivot_table = trial_balance_pivot_table.rename_axis(
    index='GL Account', columns='Fund Name'
)

account_count, fund_count = trial_balance_pivot_table.shape

print("‚úì Pivot table created")
print(f"\nShape: {account_count} GL Accounts √ó {fund_count} Funds")
print(f"Total Balance: {trial_balance_pivot_table.sum().sum():,.2f}")

# Display pivot table
trial_balance_pivot_table

## 9. Match GL Accounts with COA Mapping

In [None]:
# The reference loader leaves the COA Mapping as None when its folder or file is
# missing; fail here with a clear message instead of an opaque TypeError below.
if coa_mapping is None:
    raise ValueError(
        "COA Mapping was not loaded - cannot match GL Accounts. "
        "Check the reference data loading output."
    )
if 'GL Account' not in coa_mapping.columns:
    raise KeyError(
        f"COA Mapping has no 'GL Account' column; found columns: {coa_mapping.columns.tolist()}"
    )

# Get GL Accounts from pivot table (index)
pivot_gl_accounts = set(trial_balance_pivot_table.index)

# Get GL Accounts from COA Mapping (blank cells are not accounts, so NaN is dropped)
coa_gl_accounts = set(coa_mapping['GL Account'].dropna())

# Find accounts in pivot table that are NOT in COA Mapping
missing_in_coa = pivot_gl_accounts - coa_gl_accounts

# Find accounts in COA Mapping that are NOT in pivot table
missing_in_pivot = coa_gl_accounts - pivot_gl_accounts

print("="*60)
print("GL ACCOUNT MATCHING ANALYSIS")
print("="*60)
print(f"\nüìä Total GL Accounts in Pivot Table: {len(pivot_gl_accounts)}")
print(f"üìä Total GL Accounts in COA Mapping: {len(coa_gl_accounts)}")
print(f"\n‚úì Matching Accounts: {len(pivot_gl_accounts & coa_gl_accounts)}")
print(f"‚ö†Ô∏è  Accounts in Pivot but NOT in COA Mapping: {len(missing_in_coa)}")
print(f"‚ÑπÔ∏è  Accounts in COA Mapping but NOT in Pivot: {len(missing_in_pivot)}")

if missing_in_coa:
    new_account_names = sorted(missing_in_coa)

    # Display missing accounts
    print("\n" + "="*60)
    print("‚ö†Ô∏è  NEW ACCOUNTS FOUND (Need to be added to COA Mapping):")
    print("="*60)
    for i, account in enumerate(new_account_names, 1):
        print(f"{i:3}. {account}")

    # Create indicator DataFrame for new accounts; the classification columns
    # are left blank.
    new_accounts_df = pd.DataFrame({
        'GL Account': new_account_names,
        'Status': 'NEW - Not in COA Mapping',
        'TB Account Name': '',
        'Account Type': '',
        'FS Classification': ''
    })

    print(f"\nüìù Created DataFrame with {len(new_accounts_df)} new account(s) to be added")
    print("    Variable: new_accounts_df")
else:
    print("\n‚úì All accounts in pivot table exist in COA Mapping!")

    # None (rather than an empty DataFrame) signals "nothing to add" to later cells
    new_accounts_df = None
    print("\n‚úì No new accounts to add")

In [None]:
# Show the accounts that still need a COA Mapping entry, if there are any
if new_accounts_df is None:
    print("‚úì No new accounts found")
else:
    print(f"üìã New Accounts to Add to COA Mapping ({len(new_accounts_df)} accounts):\n")
    display(new_accounts_df)

In [None]:
# Build updated_coa_mapping: the original mapping plus any new accounts,
# with an 'Is_New_Account' flag column in both cases
if new_accounts_df is None:
    # Nothing to add: work on a copy so the original mapping stays untouched
    updated_coa_mapping = coa_mapping.copy()
    updated_coa_mapping['Is_New_Account'] = False
    print("‚úì No new accounts to add - using original COA Mapping")
else:
    # Append the new accounts, then order by GL Account with a fresh index
    updated_coa_mapping = (
        pd.concat([coa_mapping, new_accounts_df], ignore_index=True)
        .sort_values('GL Account')
        .reset_index(drop=True)
    )

    print(
        "‚úì Updated COA Mapping created with new accounts",
        f"\nüìä Original COA Mapping: {len(coa_mapping)} accounts",
        f"üìä New Accounts Added: {len(new_accounts_df)} accounts",
        f"üìä Updated COA Mapping: {len(updated_coa_mapping)} accounts",
        f"\nüíæ Variable: updated_coa_mapping",
        sep="\n",
    )

    # Flag the rows whose GL Account was missing from the original mapping
    updated_coa_mapping['Is_New_Account'] = updated_coa_mapping['GL Account'].isin(missing_in_coa)

    print(
        f"\n‚úì Added 'Is_New_Account' indicator column",
        f"   - True: Account is newly found (not in original COA Mapping)",
        f"   - False: Account existed in original COA Mapping",
        sep="\n",
    )

In [None]:
# Show only the rows of the updated mapping that are flagged as new
print("üìã Updated COA Mapping - New Accounts Only:\n")
new_rows_mask = updated_coa_mapping['Is_New_Account'] == True
display(updated_coa_mapping.loc[new_rows_mask])

In [None]:
# Write the updated COA Mapping to Excel, but only when new accounts were found
has_new_accounts = new_accounts_df is not None and len(new_accounts_df) > 0

if not has_new_accounts:
    print("‚ÑπÔ∏è  No new accounts to export - COA Mapping unchanged")
else:
    # NOTE(review): this is the folder load_reference_data reads its "latest" COA
    # Mapping file from, so the export (including the blank classification columns)
    # looks like it becomes the mapping used by the next run — confirm this is intended.
    export_folder = Path('../data/references/COA Mapping')
    export_folder.mkdir(parents=True, exist_ok=True)

    # File name carries the current date in MM.DD.YYYY format
    current_date = datetime.now().strftime('%m.%d.%Y')
    export_filename = f'Chart of Accounts Mapping as of {current_date}.xlsx'
    export_path = export_folder / export_filename

    updated_coa_mapping.to_excel(export_path, index=False, engine='openpyxl')

    banner = "=" * 60
    print(banner)
    print("üì§ EXPORT SUCCESSFUL")
    print(banner)
    print(f"‚úì File exported to: {export_path}")
    print(f"‚úì Filename: {export_filename}")
    print(f"‚úì Total records: {len(updated_coa_mapping)}")
    print(f"‚úì New accounts added: {len(new_accounts_df)}")
    print(f"‚úì Export timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("\nüí° Note: The 'Is_New_Account' column indicates which accounts are newly added (True)")

In [None]:
# Display the Chart of Accounts taken from data['chart_of_accounts'] (None when the loader found no file)
chart_of_accounts

In [None]:
# Display the COA Mapping taken from reference_data['coa_mapping'] (None when it was not loaded)
coa_mapping

In [None]:
# Display the Portfolio Mapping taken from reference_data['portfolio_mapping'] (None when it was not loaded)
portfolio_mapping

## 10. Automation Workflow - [Next Steps]

In [None]:
# TODO: Add automation logic here
# - Validation
# - Reconciliation
# - Report generation
# - Export processed data

# Placeholder cell: nothing is implemented yet, it only prints a status line
print("Ready for automation workflow implementation")