# In [180]:  (notebook cell marker left over from the export)
import argparse
from datetime import datetime, timedelta
import ipynbname
from pathlib import Path
import polars as pl
import sys
from re import compile

pl.Config.set_tbl_rows(100)

# -----------------------------------------------------------------------------------------
# load dataset into a dataframe
# -----------------------------------------------------------------------------------------

# the csv is read from (and the output written to) the directory of the running notebook
py_cwd = ipynbname.path().parent
input_filepath = py_cwd.joinpath('Account Statements.csv')

# load the data using the lazy method
# group by account_number and balance_date to get one summed balance and
# one summed transaction_value per account per day
df = (
    pl.scan_csv(
        input_filepath,
        sep=';',
        # normalise headers to snake_case, e.g. 'Account Number' -> 'account_number'
        with_column_names=lambda cols: [col.lower().replace(' ', '_') for col in cols],
        # per the original author's observation, missing dates show up as the literal
        # '1900-01-00' when read, so that literal is treated as null
        null_values='1900-01-00',
    )
    .groupby(['account_number', 'balance_date'])
    .agg([
        pl.sum('balance'),
        pl.sum('transaction_value'),
    ])
    .collect()
)

# cast the balance date from string to a date column first, then number the rows per
# account in date order (the rank must run on real dates, not on 'dd/mm/yyyy' strings,
# which is why these are two separate steps)
df = (
    df.with_columns(pl.col('balance_date').str.strptime(pl.Date, '%d/%m/%Y'))
      .with_columns(
          pl.col('balance_date')
            .rank('ordinal', reverse=False)
            .over(pl.col('account_number'))
            .alias('row_number')
      )
)

# small sample of two accounts, kept for manual inspection
test = df.filter(pl.col('account_number').is_in([57886631, 35712712]))

# get the max and min date in the data frame
max_date = df.select('balance_date').to_series().max()
min_date = df.select('balance_date').to_series().min()


# -----------------------------------------------------------------------------------------
# creating the functionality for choosing a date when running the program
# -----------------------------------------------------------------------------------------

# NOTE: the prompt has to come after the data is loaded, because the valid range shown
# to the user (and checked below) is derived from the data itself
prompt_date = input(
    f'Enter a date in this format "yyyy-mm-dd" (range: {min_date} - {max_date}): '
).strip()

# create the required pattern to check against the user prompt
# (strptime alone would also accept non-padded input such as '2023-1-5')
date_pattern = compile(r'^20\d{2}-[01]{1}[0-9]{1}-[0-3]{1}\d{1}$')

if not date_pattern.match(prompt_date):
    print("Enter a date in the right format")
    sys.exit(1)

try:
    prompt_date = datetime.strptime(prompt_date, "%Y-%m-%d").date()
except ValueError:
    # the pattern lets impossible dates such as '2023-19-39' through
    print("Enter a date in the right format")
    sys.exit(1)

if prompt_date < min_date or prompt_date > max_date:
    print('Selected date is out of range')
    sys.exit(1)


# -----------------------------------------------------------------------------------------
# create a date range where min - max dates (inclusive) and convert this to a df
# -----------------------------------------------------------------------------------------

date_range = pl.DataFrame({'date': pl.date_range(min_date, max_date, '1d')})
df_accounts = df.select(pl.col('account_number').unique())

# cross join so that every account gets one row for every date in the range
df_crossed = date_range.join(df_accounts, how='cross')

# scaffold the dates with the main dataframe; dates without a statement for an
# account end up with null balance / transaction_value
df_all = df_crossed.join(
    df,
    left_on=['date', 'account_number'],
    right_on=['balance_date', 'account_number'],
    how='left',
)

# fill the gaps left by the scaffold
df_all = (
    df_all.sort([pl.col('account_number'), pl.col('date')], reverse=[False, False])
          .with_columns([
              # carry the last known balance forward *within each account*; without the
              # window an account whose first statement is later than min_date would
              # inherit the last balance of the previous account in the sort order
              pl.col('balance').forward_fill().over('account_number'),
              # no statement on a date means no transactions on that date
              pl.col('transaction_value').fill_null(0),
          ])
          .drop('row_number')
)


# -----------------------------------------------------------------------------------------
# output the file as a csv file
# -----------------------------------------------------------------------------------------

df_output = df_all.filter(pl.col('date') == prompt_date)
df_output.write_csv(py_cwd.joinpath(f'output-{prompt_date}-py-sol.csv'))
