In [1]:
# python 3.9.7

import pandas as pd 
import tabula # pip install tabula-py --user # pip install JPype1 --user
import numpy as np 
from pathlib import Path
from datetime import datetime

In [2]:
%run ./common_utils.ipynb

Importing following functions: normalize_date_col(), normalize_str_col(), get_all_files()


##### Notebook Functions

In [3]:
def process_df(df):
    """Transform a raw client dataframe into the common client layout.

    Description: Locates the client-id, company-name and date columns, runs the
    shared normalizers over the date column and every remaining column, renames
    the three located columns to "client_id", "company_name" and "cli_join_dt",
    and drops duplicate rows.

    Parameters: df
         df(pandas dataframe): The input df that will be transformed
    Return values: df
         df(pandas dataframe): The output df with transformed client information
    Raises:
         ValueError: If the client-id, company-name or date column cannot be located
    """
    # Start from None so a column that is never detected is reported clearly
    # instead of surfacing later as an UnboundLocalError.
    client_id_col = name_col = date_col = None

    for col in df.columns:
        if check_client_id(df, col) is True:
            client_id_col = col  # Find client_id column

        # Column labels are not guaranteed to be strings; only strings can contain 'name'.
        if isinstance(col, str) and 'name' in col:
            name_col = col  # Find company name column

        if check_dt_col(df, col) is True:
            date_col = col  # Find date column

    missing = [
        label
        for label, found in (("client_id", client_id_col), ("company name", name_col), ("date", date_col))
        if found is None
    ]
    if missing:
        raise ValueError(
            f"Could not identify required column(s): {', '.join(missing)}. "
            f"Available columns: {list(df.columns)}"
        )

    # Everything that is not one of the three key columns
    other_cols = [col for col in df.columns if col not in [client_id_col, name_col, date_col]]

    print(f"client_id_col: {client_id_col} / name_col: {name_col} / date_col: {date_col} / other_columns: {other_cols}")

    # Column transformations (helpers come from common_utils.ipynb)
    df = normalize_date_col(df, date_col)

    for col in other_cols:
        # NOTE(review): meaning of the third argument is defined in common_utils -- confirm there
        df = normalize_str_col(df, col, True)

    df = df.rename(columns={client_id_col: "client_id", name_col: "company_name", date_col: "cli_join_dt"}).drop_duplicates()

    return df

In [4]:
def check_client_id(df, col):
    """Report whether a column looks like the client_id column.

    Description: A column qualifies when at least one of its non-null values is a
    string made of the letter 'C' followed by exactly five ASCII digits
    (e.g. "C01234"). Values that are not strings are ignored.

    Parameters: df, col
         df(pandas dataframe): The input df that is being checked
         col(str): The column that is being checked
    Return values: is_cli_id
         is_cli_id(Boolean): The result of whether input column was a client_id column or not
    """
    values = df[col].dropna().unique().tolist()  # Remove nulls, then de-duplicate

    for value in values:
        # Numeric or other non-string values cannot be sliced like an id; skip them.
        if not isinstance(value, str):
            continue

        char = value[:1]
        numbers = value[1:]

        # isascii() + isdigit() accepts only 0-9, unlike int(), which also
        # tolerates signs, surrounding whitespace and underscores.
        if char == 'C' and len(numbers) == 5 and numbers.isascii() and numbers.isdigit():
            return True  # One match is enough; stop scanning

    return False

In [5]:
def check_dt_col(df, col):
    """Report whether a column looks like a date column.

    Description: A column qualifies when at least one of its non-null string
    values parses with one of the known date formats. Values that are not
    strings are ignored.

    Parameters: df, col
         df(pandas dataframe): The input df that is being checked
         col(str): The column that is being checked
    Return values: is_dt_col
         is_dt_col(Boolean): The result of whether input column was a date type or not
    """
    values = df[col].dropna().unique().tolist()  # Remove nulls, then de-duplicate
    date_formats = ('%d-%b-%Y', '%m/%d/%Y', '%Y-%m-%d', '%b %d, %Y', '%Y/%m/%d', '%Y/%m/%d %H:%M:%S')

    for value in values:
        # strptime only accepts strings; skip anything else explicitly
        # rather than relying on a catch-all handler.
        if not isinstance(value, str):
            continue

        for fmt in date_formats:
            try:
                datetime.strptime(value, fmt)
            except ValueError:
                continue  # Value does not match this format; try the next one
            return True  # One parsable value is enough; stop scanning

    return False

In [6]:
def validate_save_df(df, file):
    """Validate the df and save it as a csv if validations pass.

    Description: Every distinct client_id must be a string made of the letter 'C'
    followed by exactly five ASCII digits. Each offending value is printed. When
    all values pass, the df is displayed and written to
    ../output_tables/<file name without extension>.csv; otherwise nothing is
    written and a failure message is printed.

    Parameters: df, file
         df(pandas dataframe): The input df that is being validated
         file(str): Name of the file
    """
    vldtn_passed = True

    # Client_id Validation check
    for client_id in df.client_id.unique().tolist():
        # Null / non-string ids are reported as invalid instead of crashing on slicing.
        # isascii() + isdigit() accepts only 0-9, unlike int(), which also
        # tolerates signs, surrounding whitespace and underscores.
        is_valid = (
            isinstance(client_id, str)
            and client_id[:1] == 'C'
            and len(client_id) == 6
            and client_id[1:].isascii()
            and client_id[1:].isdigit()
        )
        if not is_valid:
            vldtn_passed = False
            print(f"Client_id doesn't follow C01234 formatting! Client_id Value: {client_id}")

    # ===================
    # Add more test cases
    # ===================

    # Handle results
    if vldtn_passed:
        # stem drops only the final extension, so "clients.2024.csv" keeps "clients.2024"
        stem = Path(file).stem
        output_dir = Path("../output_tables")
        output_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders

        display(df)
        df.to_csv(output_dir / f"{stem}.csv", index=False)
        print(f'Validation tests passed. Saved {stem}.csv to ../output_tables/{stem}.csv!')
    else:
        print(f"{file} failed validation check.")

##### Main

In [7]:
def main():
    """Entry function of the notebook.

    Description: Reads every file name returned by get_all_files('clients') from
    ../src/clients, loads pdf files through tabula and csv files through pandas,
    then processes and validates/saves each resulting df. Files with any other
    extension are reported and skipped.
    """
    folder_location = "../src/clients"

    # NOTE(review): get_all_files comes from common_utils; presumably it lists the
    # file names under the 'clients' source folder -- confirm there.
    for file in get_all_files('clients'):
        extension = Path(file).suffix.lower()  # Case-insensitive so ".PDF" / ".CSV" are accepted too

        if extension == '.pdf':
            # Create pandas table from pdf source
            tables = tabula.read_pdf(f"{folder_location}/{file}", pages='all', multiple_tables=True, force_subprocess=True)
            if not tables:
                # Nothing was extracted; there is no frame to process.
                print(f"\nfile name: {file}")
                print('No tables found in pdf.')
                continue
            # Single concat over all extracted tables instead of growing a frame in a loop
            df_raw = pd.concat(tables, ignore_index=True)

        elif extension == '.csv':
            df_raw = pd.read_csv(f"{folder_location}/{file}")

        else:
            print('Invalid file type.')
            continue

        print(f"\nfile name: {file}")
        df = process_df(df_raw)
        validate_save_df(df, file)
    
main()  # Run the pipeline defined in this cell; the printed report follows as cell output


file name: clients_MOCK.csv
client_id_col: customer_key / name_col: display_name / date_col: signup_ts / other_columns: ['active_flag', 'currency']
Client_id doesn't follow C01234 formatting! Client_id Value: Cdjao
clients_MOCK.csv failed validation check.

file name: clients_v1.pdf
client_id_col: client_id / name_col: client_name / date_col: created_at / other_columns: ['status']


Unnamed: 0,client_id,company_name,status,cli_join_dt
0,C15499,Zenith Holdings,ACTIVE,2020-05-17
1,C64279,Green Logistics,ACTIVE,2022-11-06
2,C94929,Green Co,ACTIVE,2021-11-27
3,C43677,Apex Group,INACTIVE,2019-05-24
4,C97125,Blue Industries,ACTIVE,2021-09-14
5,C10456,Blue Partners,INACTIVE,2023-11-03
6,C77726,Nimbus Holdings,ACTIVE,2021-09-25
7,C63250,Vertex Supply,ACTIVE,2020-08-24
8,C16655,hooli co,ACTIVE,2021-05-06
9,C98672,Stark Freight,ACTIVE,2019-04-22


Validation tests passed. Saved clients_v1.csv to ../output_tables/clients_v1.csv!

file name: clients_v2.csv
client_id_col: id / name_col: name / date_col: acct_open_date / other_columns: ['tier']


Unnamed: 0,client_id,company_name,tier,cli_join_dt
0,C15499,Zenith Holdings,BRONZE,2020-05-17
1,C64279,Green Logistics,SILVER,2022-11-06
2,C94929,Green Co,BRONZE,2021-11-27
3,C43677,Apex Group,GOLD,2019-05-24
4,C97125,Blue Industries,BRONZE,2021-09-14
...,...,...,...,...
56,C28306,Tyrell LLC,BRONZE,2021-05-23
57,C21531,Green LLC,GOLD,2021-04-07
58,C86029,Acme Industries,BRONZE,2020-12-02
59,C34523,Stark Enterprises,BRONZE,2019-09-26


Validation tests passed. Saved clients_v2.csv to ../output_tables/clients_v2.csv!

file name: clients_v3.csv
client_id_col: customer_key / name_col: display_name / date_col: signup_ts / other_columns: ['active_flag', 'currency']


Unnamed: 0,client_id,company_name,active_flag,cli_join_dt,currency
0,C15499,ZENITH HOLDINGS,Y,2020-05-17 17:07:28,USD
1,C64279,GREEN LOGISTICS,Y,2022-11-06 01:10:55,USD
2,C94929,GREEN CO,Y,2021-11-27 18:47:49,USD
3,C43677,APEX GROUP,N,2019-05-24 20:16:09,USD
4,C97125,BLUE INDUSTRIES,Y,2021-09-14 12:16:12,USD
5,C10456,BLUE PARTNERS,N,2023-11-03 02:44:17,USD
6,C77726,NIMBUS HOLDINGS,Y,2021-09-25 22:05:52,USD
7,C63250,VERTEX SUPPLY,Y,2020-08-24 08:02:07,USD
8,C16655,HOOLI CO,Y,2021-05-06 15:30:30,USD
9,C98672,STARK FREIGHT,Y,2019-04-22 06:46:53,USD


Validation tests passed. Saved clients_v3.csv to ../output_tables/clients_v3.csv!
