In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import sys, os

# add relevant folders to sys path (only needed for jupyter)
proj_path = Path().cwd().parent
for folder in next(os.walk(proj_path))[1]:
    if '.' not in folder: # ignore hidden folders like .git
        path_to_add = os.path.join(proj_path,folder)
        sys.path.append(path_to_add)

from augment_transaction_data import AugmentTransactionData

In [42]:
def remove_sensitive_data(transaction_data:pd.DataFrame,specific_columns:list=None):
    """Drop the sensitive columns from a transaction table.

    A fixed set of default columns is always dropped; ``specific_columns``
    names extra columns to drop on top of that set.

    Note:
        ``transaction_data`` is modified in place and the very same object is
        returned, so the caller's frame loses the columns as well.

    Args:
        transaction_data (pd.DataFrame): transactions; must contain every
            column that is going to be dropped.
        specific_columns (list, optional): additional column names to drop.
            ``None`` or an empty sequence drops only the default columns. A
            single column name may also be given as a plain string.
            Defaults to None.

    Returns:
        pd.DataFrame: ``transaction_data`` itself, without the dropped columns.

    Raises:
        KeyError: if a column that should be dropped is not present.
    """
    default_cols = ['accountNumber', 'mutationcode', 'transactiondate', 'valuedate', 'startsaldo', 'endsaldo']

    # Normalise the optional argument *before* concatenating: `list + None`
    # (or `list + tuple`) would raise a TypeError.
    if specific_columns is None:
        extra_cols = []
    elif isinstance(specific_columns, str):
        # a bare string would otherwise be split into single characters
        extra_cols = [specific_columns]
    else:
        extra_cols = list(specific_columns)

    # dict.fromkeys removes duplicates while keeping the original order
    cols_to_drop = list(dict.fromkeys(default_cols + extra_cols))

    transaction_data.drop(columns=cols_to_drop,inplace=True)

    return transaction_data

In [43]:
# Build the loader, fetch the transactions tagged with identifiers (SEPA, BEA, ideal etc..)
# and strip the sensitive columns (the default set plus 'amount') before any further use
atd = AugmentTransactionData(proj_path)
transactions = remove_sensitive_data(
    atd.get_data_with_identifiers(),
    specific_columns=['amount'],
)

In [44]:
def get_most_freq_transaction_types(transaction_data:pd.DataFrame,misc_company_threshold=0.05):
    """Select the transaction types that make up more than a given share of the transactions.

    Args:
        transaction_data (pd.DataFrame): transactions; the 'identifier' column holds the transaction type.
        misc_company_threshold (float, optional): minimum share a transaction type must exceed to be kept.
                                                Types at or below it are left out (companies of such
                                                transactions are labelled "misc."). Defaults to 0.05.

    Returns:
        pd.Index: transaction types whose relative frequency is strictly greater than
                  ``misc_company_threshold``, ordered from most to least frequent.
    """
    # occurrences per transaction type (value_counts sorts by descending count)
    type_counts = transaction_data['identifier'].value_counts()

    # share of each type among all counted transactions
    counts = type_counts.to_numpy()
    shares = counts / counts.sum()

    # keep only the types above the threshold
    is_frequent = shares > misc_company_threshold
    return type_counts.index[is_frequent]

# Transaction types whose share of all transactions exceeds the default threshold (0.05)
most_freq_transaction_types = get_most_freq_transaction_types(transaction_data=transactions)

In [45]:
def _first_token_after(tokens:list,marker:str):
    """Return the token right after the first ``marker`` that has a successor, or None if there is none."""
    # tokens[:-1]: a marker in the very last position has nothing after it
    for position, token in enumerate(tokens[:-1]):
        if token == marker:
            return tokens[position + 1]
    return None


def get_company_labels(transaction_data:pd.DataFrame,most_frequent_tr_type:list):
    """Derive exactly one company label per transaction from its description.

    For a transaction whose identifier is in ``most_frequent_tr_type`` the
    label is a single whitespace-delimited token of the description:

    * first token contains 'SEPA' -> the token following 'Naam:'
    * first token contains 'BEA'  -> the token following 'Pay'

    Every other transaction is labelled 'misc.': identifiers outside
    ``most_frequent_tr_type``, descriptions that are empty or not a string,
    descriptions starting with neither 'SEPA' nor 'BEA', and descriptions in
    which the marker is missing or is the last token.

    One label is always produced per row (the first marker wins when it is
    repeated), so the result lines up with ``transaction_data`` and can be
    assigned as a new column.

    Args:
        transaction_data (pd.DataFrame): transactions with the columns
            'identifier' (transaction type) and 'description' (free text).
        most_frequent_tr_type (list): identifiers for which a company name is
            extracted; any container supporting ``in`` works.

    Returns:
        list: company labels, one per row of ``transaction_data``, in row order.
    """
    misc_label = 'misc.'
    identifiers = transaction_data['identifier']
    descriptions = transaction_data['description']
    companies = []

    for identifier, description in zip(identifiers,descriptions):
        company = None

        # non-string descriptions (e.g. NaN) cannot be parsed
        if identifier in most_frequent_tr_type and isinstance(description, str):
            tokens = description.split()
            if tokens:
                if 'SEPA' in tokens[0]:
                    company = _first_token_after(tokens, 'Naam:')
                elif 'BEA' in tokens[0]:
                    company = _first_token_after(tokens, 'Pay')

        # always append once so the labels stay aligned with the rows
        companies.append(misc_label if company is None else company)

    return companies

# Label every transaction with a company name ('misc.' for the infrequent transaction types)
companies = get_company_labels(transaction_data=transactions,most_frequent_tr_type=most_freq_transaction_types)
# Assigning a plain list requires len(companies) == len(transactions); pandas raises a ValueError otherwise
transactions['company'] = companies
# Last expression of the cell -> rendered as the cell output
transactions.head(10)

Unnamed: 0,description,identifier,company
0,SEPA iDEAL IBAN: NL31ABN...,SEPA_iDEAL,Thuisbezorgd.nl
1,"BEA, Apple Pay Zettle_*Vief ...",BEA_Apple,Zettle_*Vief
2,"BEA, Apple Pay CCV*World Net...",BEA_Apple,CCV*World
3,"BEA, Apple Pay CCV*World Net...",BEA_Apple,CCV*World
4,"BEA, Apple Pay Kronkel BV,PA...",BEA_Apple,Kronkel
5,SEPA iDEAL IBAN: NL31ABN...,SEPA_iDEAL,Thuisbezorgd.nl
6,SEPA Overboeking IBAN: NL18RAB...,SEPA_Overboeking,SSHn
7,"BEA, Apple Pay Donders,PAS01...",BEA_Apple,"Donders,PAS011"
8,SEPA iDEAL IBAN: NL31ABN...,SEPA_iDEAL,Thuisbezorgd.nl
9,"BEA, Apple Pay Spar Universi...",BEA_Apple,Spar
