##### Imports and dataframe configs

In [15]:
from pathlib import Path
import polars as pl

# Display limits for printed frames: up to 50 rows and 20 columns.
# Each setter hands back the Config class, so the two calls are chained.
pl.Config.set_tbl_rows(50).set_tbl_cols(20)

# All input CSVs are resolved relative to the directory the notebook runs from.
CWD = Path.cwd()
_INPUT_DIR = CWD / 'input'

account_holders = _INPUT_DIR / 'Account Holders.csv'
account_info = _INPUT_DIR / 'Account Information.csv'
# NOTE: the variable name is misspelt ("transaciton"); it is kept as-is because
# the loading cell refers to it under this exact name.
transaciton_detail = _INPUT_DIR / 'Transaction Detail.csv'
transaction_path = _INPUT_DIR / 'Transaction Path.csv'


##### Loading the data sets

In [100]:
# Transaction path table.
# Headers are lower-cased and their underscores replaced by spaces while the
# file is scanned (the join further down refers to the result as 'account from').
df_trans_path = pl.scan_csv(
    transaction_path,
    with_column_names=lambda names: [name.replace('_', ' ').lower() for name in names],
).collect()


# Account holders table.
df_acnt_holders = (
    pl.scan_csv(
        account_holders,
        with_column_names=lambda names: [name.lower() for name in names],
        # read the number as text so the prefix below is plain string concatenation
        dtypes={'contact number': pl.Utf8},
    )
    .with_columns([
        # every contact number gets '07' placed in front of it
        # NOTE(review): the prefix is applied unconditionally -- confirm the raw
        # values never already begin with '0' or '7'.
        (pl.lit('07') + pl.col('contact number')).alias('contact number'),
        # dates of birth are parsed from day/month/year text into a Date column
        pl.col('date of birth').str.strptime(pl.Date, '%d/%m/%Y'),
    ])
    .collect()
)


# Transactions detail table.
# Headers are lower-cased on scan; 'transaction date' is read as a Date.
df_trans_detail = (
    pl.scan_csv(
        transaciton_detail,
        dtypes={'transaction date': pl.Date},
        with_column_names=lambda cols: [col.lower() for col in cols],
    )
    # Keep only transactions that were not cancelled and whose value is
    # strictly above 1000. The earlier `>= 1000` comparison also let through
    # transactions of exactly 1000, contradicting the "above 1000" intent.
    .filter((pl.col('value') > 1000) & (pl.col('cancelled?') == 'N'))
    .collect()
)


# Account info table.
df_acnt_info = (
    pl.scan_csv(
        account_info,
        with_column_names=lambda names: [name.lower() for name in names],
        dtypes={'account type': pl.Categorical, 'balance date': pl.Date},
    )
    # Platinum accounts are left out of the result
    .filter(pl.col('account type') != 'Platinum')
    # 'account holder id' can hold several comma-separated ids:
    # split into a list, then emit one row per id
    .with_columns(pl.col('account holder id').str.split(','))
    .explode('account holder id')
    # trim the whitespace left around each id and cast it to Int64
    .with_columns(pl.col('account holder id').str.strip().cast(pl.Int64))
    .collect()
)

# Ad-hoc sanity checks (uncomment to run): account-type counts, and the rows for one joint account number
#df_acnt_info.select('account type').to_series().value_counts()
#test_joint_number = 10034341
#df_acnt_info.filter(pl.col('account number') == test_joint_number)

# Bring the four tables together with inner joins:
#   account info -> account holders      (shared 'account holder id')
#   ...          -> transaction path     ('account number' matched to 'account from')
#   ...          -> transaction detail   (shared 'transaction id')
# The filter flag and the holder id are dropped from the combined frame.
_joined = (
    df_acnt_info
    .join(df_acnt_holders, on='account holder id', how='inner')
    .join(df_trans_path, left_on='account number', right_on='account from', how='inner')
    .join(df_trans_detail, on='transaction id', how='inner')
    .drop(['cancelled?', 'account holder id'])
)

# Rename columns to follow snake format (spaces become underscores).
df_all = _joined.rename(
    {name: name.replace(' ', '_') for name in _joined.columns if ' ' in name}
)

#df_all



##### Output the data set to a CSV file

In [105]:
# Directory that receives the result set, located under the working directory.
output_dir = CWD.joinpath('output')

# Create the directory on first run; exist_ok keeps re-runs from failing.
output_dir.mkdir(parents=False, exist_ok=True)

# Build the target path with pathlib, as is done for the inputs, rather than
# an f-string with a hard-coded '/' separator.
df_all.write_csv(output_dir.joinpath('py-solution.csv'), sep=',', has_header=True)
