In [1]:
# python 3.9.7

import pandas as pd 
import tabula # pip install tabula-py --user # pip install JPype1 --user
import numpy as np 
from pathlib import Path
from datetime import datetime
import re

In [2]:
%run ./common_utils.ipynb

Importing following functions: normalize_date_col(), normalize_str_col(), get_all_files()


##### Notebook Functions

In [3]:
def process_df(df):
#   Function name: process_df
#   Description: Inspects the values of every column to locate the invoice_id, date, shipment_type,
#                client_id and total_amount columns, passes the date column through normalize_date_col(),
#                renames the detected columns to the standard names and drops duplicate rows.
#   Parameters: df
#        df(pandas dataframe): The input df that will be transformed
#   Return values: df
#        df(pandas dataframe): The output df with renamed columns and duplicate rows removed
#   Raises:
#        ValueError: If the invoice_id, date or shipment_type column could not be identified

    # Initialize every result up front so a failed detection can never leave a name unbound.
    invoice_id_col = None
    date_col = None
    shpmnt_col = None
    client_id_col = None
    total_col = ''

    for col in df.columns:
        # The id checks may raise on values they cannot parse (e.g. non-string columns);
        # such a column is simply treated as "not a match".
        try:
            if check_invoice_id(df, col) is True:
                invoice_id_col = col # Last matching column wins
        except Exception:
            pass

        if check_dt_col(df, col) is True:
            date_col = col # Last matching column wins

        if check_shpmnt_col(df, col) is True:
            shpmnt_col = col # Last matching column wins

        try:
            if client_id_col is None and check_client_id(df, col) is True:
                client_id_col = col # First matching column wins
        except Exception:
            pass

    # The total column is optional: fall back to "not found" if detection fails.
    try:
        total_col = check_total_col(df)
    except Exception:
        total_col = ''

    print(f"invoice_id_col: {invoice_id_col} / date_col: {date_col} / shpmnt_col: {shpmnt_col} / total_col: {total_col}")

    # Fail with an explicit message instead of an unbound-variable error further down.
    missing = [
        name
        for name, found in (("invoice_id", invoice_id_col), ("date", date_col), ("shipment_type", shpmnt_col))
        if found is None
    ]
    if missing:
        raise ValueError(f"Could not identify required column(s): {', '.join(missing)}")

    # Column transformations
    df = normalize_date_col(df, date_col)

    # Only rename columns that were actually detected; later entries override earlier ones
    # if the same source column was detected for two roles (same as a dict literal would).
    rename_map = {}
    for source, target in (
        (invoice_id_col, "invoice_id"),
        (date_col, "invoice_date"),
        (shpmnt_col, "shipment_type"),
        (client_id_col, "client_id"),
        (total_col, "total_amt"),
    ):
        if source is not None and source != '':
            rename_map[source] = target

    df = df.rename(columns=rename_map).drop_duplicates()

    return df

In [4]:
def check_invoice_id(df, col):
#   Function name: check_invoice_id
#   Description: This function is used to find the 'invoice_id' column. A column qualifies when at least
#                one non-null value splits on "-" into the prefix 'INV' followed by a segment that
#                contains exactly 7 alphanumeric characters (e.g. INV-ABC1234).
#   Parameters: df, col
#        df(pandas dataframe): The input df that is being checked
#        col(str): The column that is being checked
#   Return values: is_inv_id
#        is_inv_id(Boolean): The result of whether input column was a invoice_id column or not

    values = df[col].dropna().unique().tolist() # Distinct non-null values

    for value in values:
        # Non-string values (numbers, timestamps, ...) can never be an invoice id
        if not isinstance(value, str):
            continue

        parts = value.split("-")
        # A value without a "-" has no second segment to inspect
        if len(parts) < 2:
            continue

        chk_chars = re.findall(r'[0-9A-Za-z]', parts[1])
        if parts[0] == 'INV' and len(chk_chars) == 7:
            return True # One matching value is enough

    return False

In [5]:
def check_shpmnt_col(df, col):
#   Function name: check_shpmnt_col
#   Description: Decides whether a column holds shipment types, i.e. whether at least one of its
#                non-null values is one of GROUND, 2DAY, EXPRESS or FREIGHT
#   Parameters: df, col
#        df(pandas dataframe): The input df that is being checked
#        col(str): The column that is being checked
#   Return values: is_shpmnt_col
#        is_shpmnt_col(Boolean): The result of whether input column was a shipment_type column or not

    shipment_types = ['GROUND', '2DAY', 'EXPRESS', 'FREIGHT']

    # Keep only the rows where the column is populated, then look at its distinct values
    populated_rows = df[df[col].notnull()]
    distinct_values = populated_rows[col].unique().tolist()

    return any(value in shipment_types for value in distinct_values)

In [6]:
def check_dt_col(df, col):
#   Function name: check_dt_col
#   Description: This function is used to find the date type column. A column qualifies when at least
#                one non-null value parses with one of the supported date formats.
#   Parameters: df, col
#        df(pandas dataframe): The input df that is being checked
#        col(str): The column that is being checked
#   Return values: is_dt_col
#        is_dt_col(Boolean): The result of whether input column was a date type or not

    values = df[col].dropna().unique().tolist() # Distinct non-null values
    date_formats = ['%d-%b-%Y', '%m/%d/%Y', '%Y-%m-%d', '%b %d, %Y', '%Y/%m/%d', '%Y/%m/%d %H:%M:%S']

    for dt in values:
        for fmt in date_formats:
            try:
                datetime.strptime(dt, fmt)
            except (TypeError, ValueError):
                # TypeError: value is not a string; ValueError: value does not match this format
                continue
            return True # One parsable value is enough, no need to scan the rest

    return False

In [7]:
def check_total_col(df):
#   Function name: check_total_col
#   Description: This function is used to find the total_amount column. Every numeric column whose
#                non-null values all have at most two decimal places (cent precision) is a candidate;
#                when several candidates exist, the one with the largest sum is returned.
#   Parameters: df
#        df(pandas dataframe): The input df that is being checked
#   Return values: col
#        col(str): The total_amount column, or '' when no numeric column qualifies

    df_numeric = df.select_dtypes(include=["number"])

    # Test each column on its own. Working on the numbers directly (instead of their string form)
    # avoids rejecting values such as 260.30, whose string representation is '260.3'.
    candidates = []
    for col in df_numeric.columns:
        values = df_numeric[col].dropna()
        if values.empty:
            continue

        has_cent_precision = ((values - values.round(2)).abs() < 1e-9).all()
        if has_cent_precision:
            candidates.append(col)

    if not candidates:
        return ''
    if len(candidates) == 1:
        return candidates[0]

    # Several amount-like columns (e.g. subtotal, tax, total): the largest sum is taken as the total
    return df_numeric[candidates].sum().idxmax()

In [8]:
def check_client_id(df, col):
#   Function name: check_client_id
#   Description: This function is used to find the 'client_id' column. A column qualifies when at least
#                one non-null value is the letter 'C' followed by exactly five digits (e.g. C10456).
#   Parameters: df, col
#        df(pandas dataframe): The input df that is being checked
#        col(str): The column that is being checked
#   Return values: is_cli_id
#        is_cli_id(Boolean): The result of whether input column was a client_id column or not

    values = df[col].dropna().unique().tolist() # Distinct non-null values

    for value in values:
        # Non-string values cannot be a client id. The explicit digit class rejects inputs
        # such as 'C+1234' or 'C 1234' that int() would silently accept.
        if isinstance(value, str) and re.fullmatch(r'C[0-9]{5}', value):
            return True # One matching value is enough

    return False

In [9]:
def validate_save_df(df, file):
#   Function name: validate_save_df
#   Description: This function is used to validate the df and save it if validations pass. Every distinct
#                invoice_id must be a string of the form INV-<segment> whose segment contains exactly
#                7 alphanumeric characters; each offending value is printed. When all checks pass the df
#                is displayed and written to ../output_tables/<file name without extension>.csv.
#   Parameters: df, file
#        df(pandas dataframe): The input df that is being validated
#        file(str): Name of the file

    vldtn_passed = True

    # invoice_id Validation check
    for i in df.invoice_id.unique().tolist():
        # Missing or non-string ids and ids without a "-" are reported as failures rather than
        # raising, so every offending value gets listed.
        parts = i.split("-") if isinstance(i, str) else []
        is_valid = (
            len(parts) >= 2
            and parts[0] == 'INV'
            and len(re.findall(r'[0-9A-Za-z]', parts[1])) == 7
        )

        if not is_valid:
            vldtn_passed = False
            print(f"invoice_id doesn't follow INV-ABC1234 formatting! invoice_id Value: {i}")

    # ===================
    # Add more test cases
    # ===================

    # Handle results
    if vldtn_passed:
        # Strip only the final extension so names such as "invoices.2025.csv" keep their full stem
        output_name = file.rsplit(".", 1)[0]
        display(df)
        df.to_csv(f"""../output_tables/{output_name}.csv""", index=False)
        print(f'Validation tests passed. Saved {output_name}.csv to ../output_tables/{output_name}.csv!')
    else:
        print(f"{file} failed validation check.")

##### Main

In [10]:
def main():
#   Function name: main
#   Description: The entry function of the notebook. Reads every file returned by get_all_files('invoices')
#                from the invoices folder (pdf via tabula, csv via pandas), then processes, validates
#                and saves it. Files with any other extension are reported and skipped.

    folder_location = "../src/invoices"

    file_names = get_all_files('invoices')
    for file in file_names:
        extension = file.split(".")[-1].lower() # Accept upper-case extensions such as .PDF / .CSV

        if extension == 'pdf':
            # Create pandas table from pdf source
            tabula_read = tabula.read_pdf(f"{folder_location}/{file}", pages='all', multiple_tables=True, force_subprocess=True)
            print(f"\nfile name: {file}")

            if not tabula_read:
                # Nothing to concatenate; processing an empty frame would only fail further down
                print(f"No tables found in {file}. Skipping.")
                continue

            # Stack all extracted tables in one call instead of growing a dataframe inside a loop
            df_raw = pd.concat(tabula_read, ignore_index=True)

        elif extension == 'csv':
            df_raw = pd.read_csv(f"{folder_location}/{file}")
            print(f"\nfile name: {file}")

        else:
            print('Invalid file type.')
            continue

        df = process_df(df_raw)
        validate_save_df(df, file)
    
main()


file name: invoices_MOCK.csv
invoice_id_col: invoice_id / date_col: invoice_date / shpmnt_col: shipment_type / total_col: amount
invoice_id doesn't follow INV-ABC1234 formatting! invoice_id Value: INV-N9Z2HI7fsd
invoices_MOCK.csv failed validation check.

file name: invoices_v1.csv
invoice_id_col: invoice_id / date_col: invoice_date / shpmnt_col: shipment_type / total_col: amount


Unnamed: 0,invoice_id,client_id,invoice_date,total_amt,currency,shipment_type
0,INV-ITBHR2P,C10456,2025-05-22,5235.33,USD,2DAY
1,INV-LWC0YM6,C16655,2024-04-19,3015.04,USD,GROUND
2,INV-7TDN2MF,C63096,2025-07-28,3023.94,USD,GROUND
3,INV-N9Z2HI7,C15499,2025-05-30,3253.71,USD,GROUND
4,INV-M8ZSF57,C93089,2024-07-14,3526.16,USD,GROUND
...,...,...,...,...,...,...
11995,INV-5LPB2GE,C63096,2025-12-06,3191.81,USD,GROUND
11996,INV-CE1RRD5,C86029,2025-09-05,14820.41,USD,EXPRESS
11997,INV-SU0IUIL,C57382,2025-01-26,2571.51,USD,GROUND
11998,INV-FH9UUJM,C17172,2025-05-19,40913.22,USD,FREIGHT


Validation tests passed. Saved invoices_v1.csv to ../output_tables/invoices_v1.csv!

file name: invoices_v2.csv
invoice_id_col: inv_no / date_col: inv_dt / shpmnt_col: ship_type / total_col: total


Unnamed: 0,invoice_id,client_id,invoice_date,subtotal,tax,total_amt,curr,shipment_type
0,INV-ITBHR2P,C10456,2025-05-22,5235.33,418.83,5654.16,USD,2DAY
1,INV-LWC0YM6,C16655,2024-04-19,3015.04,150.75,3165.79,USD,GROUND
2,INV-7TDN2MF,C63096,2025-07-28,3023.94,302.39,3326.33,USD,GROUND
3,INV-N9Z2HI7,C15499,2025-05-30,3253.71,260.30,3514.01,USD,GROUND
4,INV-M8ZSF57,C93089,2024-07-14,3526.16,176.31,3702.47,USD,GROUND
...,...,...,...,...,...,...,...,...
11995,INV-5LPB2GE,C63096,2025-12-06,3191.81,319.18,3510.99,USD,GROUND
11996,INV-CE1RRD5,C86029,2025-09-05,14820.41,741.02,15561.43,USD,EXPRESS
11997,INV-SU0IUIL,C57382,2025-01-26,2571.51,128.58,2700.09,USD,GROUND
11998,INV-FH9UUJM,C17172,2025-05-19,40913.22,2045.66,42958.88,USD,FREIGHT


Validation tests passed. Saved invoices_v2.csv to ../output_tables/invoices_v2.csv!

file name: invoices_v3.csv
invoice_id_col: invoice_uid / date_col: issued_on / shpmnt_col: shipment_category / total_col: amount_usd


Unnamed: 0,invoice_id,client_ref,invoice_date,total_amt,shipment_type
0,INV-ITBHR2P,BLUE PARTNERS,2025-05-22,5654.16,2DAY
1,INV-LWC0YM6,HOOLI CO,2024-04-19,3165.79,GROUND
2,INV-7TDN2MF,INITECH FREIGHT,2025-07-28,3326.33,GROUND
3,INV-N9Z2HI7,ZENITH HOLDINGS,2025-05-30,3514.01,GROUND
4,INV-M8ZSF57,BLUE SUPPLY,2024-07-14,3702.47,GROUND
...,...,...,...,...,...
11995,INV-5LPB2GE,INITECH FREIGHT,2025-12-06,3510.99,GROUND
11996,INV-CE1RRD5,ACME INDUSTRIES,2025-09-05,15561.43,EXPRESS
11997,INV-SU0IUIL,VECTOR LOGISTICS,2025-01-26,2700.09,GROUND
11998,INV-FH9UUJM,APEX FREIGHT,2025-05-19,42958.88,FREIGHT


Validation tests passed. Saved invoices_v3.csv to ../output_tables/invoices_v3.csv!
