#### Single file csv processing

In [4]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
banksdirectory = r'C:\Projects\Finances\Bank statements'

In [49]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import shutil

def hdfc_processing(hdfcdf, numeric_columns=None):
    """Convert the amount and date columns of an HDFC statement frame.

    The amount columns are parsed as numbers and 'Date' as a datetime.
    Empty 'Debit'/'Credit' cells become 0.0. The frame is modified in
    place and also returned.

    Args:
        hdfcdf: DataFrame with 'Date', 'Debit', 'Credit' and 'Balance'
            columns; dates are parsed with the format dd/mm/yy.
        numeric_columns: Columns to parse as numbers. Defaults to
            'Debit', 'Credit' and 'Balance'.

    Returns:
        The same DataFrame with converted columns.

    Raises:
        Exception: If 'Credit', 'Debit' or 'Balance' still contain NaN
            after conversion.
    """
    if numeric_columns is None:
        numeric_columns = ('Debit', 'Credit', 'Balance')

    # Unparsable cells become NaN instead of raising.
    for column in numeric_columns:
        hdfcdf[column] = pd.to_numeric(hdfcdf[column], errors='coerce')

    # Fill on the frame that was passed in, not on a notebook-level `df`,
    # so the function works regardless of the caller's variable name.
    hdfcdf['Debit'] = hdfcdf['Debit'].fillna(0.0)
    hdfcdf['Credit'] = hdfcdf['Credit'].fillna(0.0)

    # After the fills above only 'Balance' can still hold NaN.
    total_nan_count = (
        hdfcdf['Credit'].isna().sum()
        + hdfcdf['Debit'].isna().sum()
        + hdfcdf['Balance'].isna().sum()
    )
    if total_nan_count > 0:
        raise Exception("Issue with value processing! Check for NaN values in 'Credit', 'Debit', or 'Balance' columns.")

    # Convert the Date column to datetime
    hdfcdf['Date'] = pd.to_datetime(hdfcdf['Date'], format='%d/%m/%y')

    return hdfcdf


def au_processing(audf, numeric_columns=None):
    """Convert the amount and date columns of an AU statement frame.

    'Date' is parsed as a datetime and the amount columns as numbers.
    A 'Credit'/'Debit' cell that is just '-' is treated as zero. The frame
    is modified in place and also returned.

    Args:
        audf: DataFrame with string 'Date', 'Debit', 'Credit' and 'Balance'
            columns; dates are parsed with the format dd-Mon-yy.
        numeric_columns: Columns to parse as numbers. Defaults to
            'Debit', 'Credit' and 'Balance'.

    Returns:
        The same DataFrame with converted columns.

    Raises:
        Exception: If 'Credit', 'Debit' or 'Balance' contain NaN after
            conversion.
    """
    if numeric_columns is None:
        numeric_columns = ('Debit', 'Credit', 'Balance')

    # Convert the Date column to datetime
    audf['Date'] = pd.to_datetime(audf['Date'], format='%d-%b-%y')

    # Replace only cells that are exactly '-'. A character-level replace
    # would also rewrite the minus sign of a real value ('-50' -> '050').
    for column in ('Credit', 'Debit'):
        audf[column] = audf[column].str.strip().replace('-', '0')

    # Unparsable cells become NaN and are reported below.
    for column in numeric_columns:
        audf[column] = pd.to_numeric(audf[column], errors='coerce')

    total_nan_count = (
        audf['Credit'].isna().sum()
        + audf['Debit'].isna().sum()
        + audf['Balance'].isna().sum()
    )
    if total_nan_count > 0:
        raise Exception("Issue with value processing! Check for NaN values in 'Credit', 'Debit', or 'Balance' columns.")

    return audf

def sbi_csv_processing(sbidf, numeric_columns=None, file_path=None):
    """Convert the amount and date columns of an SBI statement frame.

    Thousands separators are removed from the amount columns before they
    are parsed as numbers. Empty or unparsable cells become 0.0 in every
    amount column except 'Balance', where they count as errors. The frame
    is modified in place and also returned.

    Args:
        sbidf: DataFrame with string 'Date', 'Debit', 'Credit' and
            'Balance' columns; dates are parsed with the format dd-Mon-yy.
        numeric_columns: Columns to parse as numbers. Defaults to
            'Debit', 'Credit' and 'Balance'.
        file_path: Optional name of the source file, included in the
            error message when given.

    Returns:
        The same DataFrame with converted columns.

    Raises:
        Exception: If any date fails to parse or 'Balance' contains NaN
            after conversion.
    """
    if numeric_columns is None:
        numeric_columns = ('Debit', 'Credit', 'Balance')

    # Unparsable dates become NaT so they can be counted and reported.
    sbidf['Date'] = pd.to_datetime(sbidf['Date'], format="%d-%b-%y", errors='coerce')
    date_na_count = sbidf['Date'].isna().sum()

    total_numeric_na_count = 0
    for column in numeric_columns:
        without_commas = sbidf[column].str.replace(",", "", regex=False)
        values = pd.to_numeric(without_commas, errors='coerce')
        if column != 'Balance':
            values = values.fillna(0.0)
        sbidf[column] = values
        total_numeric_na_count += values.isna().sum()

    if total_numeric_na_count > 0 or date_na_count > 0:
        # The file name is optional: reading a notebook-level `file_path`
        # here would raise NameError and hide the real problem.
        source = f" {file_path}" if file_path is not None else ""
        raise Exception(
            f"error in processing{source} !!!! \n NaN count breached limit.\n"
            f" Total na count = numeric : {total_numeric_na_count} \n"
            f" date : {date_na_count}"
        )

    return sbidf

def file_backup(banksdirectory):
    """Move every ``.parquet`` file in *banksdirectory* into ``backup_parquets``.

    The ``backup_parquets`` sub-folder is created inside *banksdirectory*
    when it does not exist yet. One line is printed per moved file.
    """
    target_dir = os.path.join(banksdirectory, 'backup_parquets')
    os.makedirs(target_dir, exist_ok=True)

    for name in os.listdir(banksdirectory):
        if not name.endswith(".parquet"):
            continue
        shutil.move(
            os.path.join(banksdirectory, name),
            os.path.join(target_dir, name),
        )
        print(f"Moved {name} to backup directory")


# Date stamp (YYYY-MM-DD) used in the names of the parquet files written below.
today_date = datetime.today()
today_date_str = today_date.strftime("%Y-%m-%d")

banksdirectory = r'C:\Projects\Finances\Bank statements'

# Every sub-folder except the backup folder is treated as one bank.
# The former `entry not in banksdirectory` test was a substring check
# against the path string and dropped any folder whose name happened to
# occur in the path, so it has been removed.
directories = [
    entry for entry in os.listdir(banksdirectory)
    if os.path.isdir(os.path.join(banksdirectory, entry))
    and entry != 'backup_parquets'
]

# Move the existing parquet files to the backup directory
file_backup(banksdirectory)

# Columns kept from every statement, in output order
# ('Value_date' and 'Ref_No' are not carried over).
column_order = ['Date', 'Description', 'Debit', 'Credit', 'Balance']
# Amount columns that the per-bank processing functions convert to numbers.
columns_to_convert_to_numeric = ['Debit', 'Credit', 'Balance']

combined_dfs = {}
for folder in directories:
    folder_path = os.path.join(banksdirectory, folder)
    # Match the extension case-insensitively so '.CSV' exports are included.
    files = [
        file for file in os.listdir(folder_path)
        if file.lower().endswith(".csv")
    ]

    # Placeholder; replaced below once this folder's files are combined.
    combined_dfs[f'{folder}_combined_df'] = pd.DataFrame(columns=column_order)
    dfs = []

    for file in files:
        file_path = os.path.join(folder_path, file)

        try:
            df = pd.read_csv(file_path, header=0, dtype=str)
            # Work on an independent copy of the selected columns.
            df = df[column_order].copy()
            # Remove leading and trailing whitespace from all string columns
            for column in df.select_dtypes(include=['object']).columns:
                df[column] = df[column].str.strip()

            if folder == 'sbi':
                df = sbi_csv_processing(df)

            if folder == 'hdfc':
                df = hdfc_processing(df)

            if folder == 'au':
                df = au_processing(df)

            dfs.append(df)

        except Exception as e:
            # Best effort: report the file and carry on with the others.
            print (f'error occured in {file_path} :: \n {str(e)}')

    # Concatenate DataFrames and store the result back in the dictionary
    try:
        combined_df = pd.concat(dfs, ignore_index=True)
    except Exception as e:
        print(f'Failed to create to df ::: \n Folder :\t {folder} \n {str(e)}')
        # Skip the write: `combined_df` would otherwise still hold the
        # previous folder's data and be saved under this folder's name.
        continue
    combined_dfs[f'{folder}_combined_df'] = combined_df

    output_file_path = f"{banksdirectory}/{folder}_consolidated_pd_{today_date_str}.parquet"

    # Write the DataFrame to a Parquet file with the specified file name
    try:
        combined_df.to_parquet(output_file_path)
        print(f'\n Parquet written ::: {folder} \n')
    except Exception as e:
        print(f'Failed to write to parquet ::: {output_file_path} \n {str(e)}')


#print(df)


Moved au_consolidated_pd_2024-05-19.parquet to backup directory
Moved hdfc_consolidated_pd_2024-05-19.parquet to backup directory
Moved sbi_consolidated_pd_2024-05-19.parquet to backup directory


In [20]:
combined_df = pd.concat(dfs, ignore_index=True)

##### Failure checks

In [40]:
file = r'C:\Projects\Finances\Bank statements\hdfc\hdfc 01-Apr-2024 To 18-May-2024.csv'
df = pd.read_csv(file)

In [42]:
# Coerce the amount columns to numbers; cells that cannot be parsed become NaN.
for column in columns_to_convert_to_numeric:
    coerced = pd.to_numeric(df[column], errors='coerce')
    df[column] = coerced
# Count the NaN cells across the three amount columns.
total_nan_count = sum(df[name].isna().sum() for name in ('Credit', 'Debit', 'Balance'))

In [48]:
df['Debit'].isnull().replace('0.0')

  df.isnull().replace('0.0')


Unnamed: 0,Date,Description,Value_date,Ref_No,Debit,Credit,Balance
0,False,False,False,False,False,True,False
1,False,False,False,False,True,False,False
2,False,False,False,False,False,True,False
3,False,False,False,False,False,True,False
4,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...
122,False,False,False,False,False,True,False
123,False,False,False,False,False,True,False
124,False,False,False,False,False,True,False
125,False,False,False,False,False,True,False


In [41]:
df =hdfc_processing(df)

Exception: Issue with value processing! Check for NaN values in 'Credit', 'Debit', or 'Balance' columns.

##### SBI 

In [210]:
#df

#### Using consolidated parquet file

In [50]:
from datetime import datetime

import os
import numpy as np 
import pandas as pd
banksdirectory = r'C:\Projects\Finances\Bank statements'
# os.listdir returns names in arbitrary order; sort them so that positions
# in `files` are the same on every run and platform.
files = sorted(file for file in os.listdir(banksdirectory) if file.endswith(".parquet"))

In [51]:
# Pick the AU file by its name prefix ('<folder>_consolidated_pd_<date>.parquet')
# rather than by its position in the directory listing.
au_file = next(name for name in files if name.startswith('au_'))
audf = pd.read_parquet(os.path.join(banksdirectory, au_file))

##### SBI

In [52]:
# Pick the SBI file by its name prefix ('<folder>_consolidated_pd_<date>.parquet')
# rather than by its position in the directory listing.
sbi_file = next(name for name in files if name.startswith('sbi_'))
sbidf = pd.read_parquet(os.path.join(banksdirectory, sbi_file))

##### HDFC


In [53]:
# Pick the HDFC file by its name prefix ('<folder>_consolidated_pd_<date>.parquet')
# rather than by its position in the directory listing.
hdfc_file = next(name for name in files if name.startswith('hdfc_'))
hdfcdf = pd.read_parquet(os.path.join(banksdirectory, hdfc_file))

In [212]:
hdfcdf

#### SQLite

In [54]:
import sqlite3
from datetime import datetime
db_path = r'C:\Projects\Finances\database\Transactions.db'

In [55]:
def data_load_to_db(df, bank_name, database=None):
    """Replace the ``<bank_name>_bank`` table in the SQLite database with *df*.

    Args:
        df: DataFrame whose rows become the new table contents.
        bank_name: Prefix of the target table, e.g. ``"sbi"`` writes to
            ``sbi_bank``.
        database: Path of the SQLite database file. Defaults to the
            notebook-level ``db_path``.
    """
    if database is None:
        database = db_path

    conn = sqlite3.connect(database)
    try:
        # if_exists='replace' drops and recreates the table, so a separate
        # DELETE is unnecessary, and it would fail on the first load when
        # the table does not exist yet.
        df.to_sql(f'{bank_name}_bank', conn, if_exists='replace', index=False)
        conn.commit()
    finally:
        # Release the database file even if the write fails.
        conn.close()


In [56]:
data_load_to_db(sbidf,"sbi")
data_load_to_db(hdfcdf,"hdfc")
data_load_to_db(audf,"au")

#### Date table

In [12]:
conn = sqlite3.connect(db_path)
minimun_date = pd.read_sql_query("SELECT min(date) FROM sbi_bank", conn)
# Start date: earliest date found in sbi_bank
min_date = minimun_date['min(date)'][0]
if pd.isna(min_date):
    conn.close()
    raise ValueError("sbi_bank contains no dates; cannot build the date table")
# End date
end_date = pd.Timestamp('2030-12-31')

# Generate one row per day. `start` must be the scalar date, not the
# one-cell DataFrame returned by the query.
dates = pd.date_range(start=min_date, end=end_date, freq="D")

# Extract date attributes
date_df = pd.DataFrame(data=dates, columns=['Date'])

date_df['Day'] = date_df['Date'].dt.day
date_df['Month'] = date_df['Date'].dt.month
date_df['Year'] = date_df['Date'].dt.year

# Determine Indian financial year
def indian_fy(date):
    """Return the April-to-March financial-year label for *date*.

    Dates in January-March belong to the year that started the previous
    April, e.g. 2024-03-31 -> 'FY 2023 - 2024'.
    """
    start_year = date.year - 1 if date.month < 4 else date.year
    return f"FY {start_year} - {start_year + 1}"
date_df['Financial_Year'] = date_df['Date'].apply(indian_fy)

try:
    # Upload the date table. if_exists='replace' drops and recreates the
    # table, so no prior DELETE is needed (a DELETE would fail when
    # date_table does not exist yet).
    date_df.to_sql('date_table', conn, if_exists='replace', index=False)

    # Idea for automating updates (append only the new dates):
    # new_dates_query = "SELECT DISTINCT Date FROM sbi_bank WHERE Date > (SELECT max(Date) FROM date_table)"
    # new_dates = pd.read_sql_query(new_dates_query, conn)
    # new_dates.to_sql('date_table', conn, if_exists='append', index=False)

    conn.commit()
finally:
    # Close the connection even if the upload fails.
    conn.close()

In [17]:
date_df.dtypes

Date              datetime64[ns]
Day                        int32
Month                      int32
Year                       int32
Financial_Year            object
dtype: object