#### Single file csv processing

In [148]:
import pandas as pd
import numpy as np
import os

def hdfc_processing(hdfcdf):
    # Convert each column to numeric
    for column in columns_to_convert_to_numeric:
        hdfcdf[column] = pd.to_numeric(hdfcdf[column], errors= 'coerce')
    #check for NAN values
    total_nan_count = hdfcdf['Credit'].isna().sum() + hdfcdf['Debit'].isna().sum() + hdfcdf['Balance'].isna().sum()
    if total_nan_count > 0 :
        raise Exception("Issue with value processing! Check for NaN values in 'Credit', 'Debit', or 'Balance' columns.")
    #Convert Date Colunmn to date format
    hdfcdf['Date'] = pd.to_datetime(hdfcdf['Date'], format='%d/%m/%y')
    
    return hdfcdf


def au_processing(audf):
    #Convert Date Colunmn to date format
    audf['Date'] = pd.to_datetime(audf['Date'], format='%d-%b-%y')

    audf['Credit'] = audf['Credit'].str.replace('-','0')
    audf['Debit'] = audf['Debit'].str.replace('-','0')

    # Convert each column to numeric
    for column in columns_to_convert_to_numeric:
        audf[column] = pd.to_numeric(audf[column], errors= 'coerce')

    #check for NAN values
    total_nan_count = audf['Credit'].isna().sum() + audf['Debit'].isna().sum() + audf['Balance'].isna().sum()

    if total_nan_count > 0 :
        raise Exception("Issue with value processing! Check for NaN values in 'Credit', 'Debit', or 'Balance' columns.")
    
    return audf

def sbi_csv_processing(sbidf):
    total_numeric_na_count = 0
    # Date conversion
    sbidf['Date'] = pd.to_datetime(sbidf['Date'],format="%d-%b-%y", errors='coerce', dayfirst=True)
    date_na_count = sbidf['Date'].isna().sum()

    
    for column in columns_to_convert_to_numeric:
        sbidf[column] = sbidf[column].str.replace(",","")
        #df[column] = df[column].str.replace("","0")   this will add values cannot be done
        sbidf[column] = pd.to_numeric(sbidf[column],errors='coerce')
        if column == 'Balance':
            pass
        else:
            # Identify rows where conversion resulted in NaN
            sbidf.loc[:, column] = sbidf[column].fillna(0.0)
        nacount = pd.to_numeric(sbidf[column],errors='coerce').isna().sum()
        total_numeric_na_count += nacount

    if total_numeric_na_count > 0 or date_na_count > 0 :
        raise Exception(f"error in prossesing {file_path} !!!! \n NaN count breached limit.\n \
                        Total na count = numeric : {total_numeric_na_count} \n \
                                        date : {date_na_count}\
                        ")
    
    return sbidf 
    

banksdirectory = r'C:\Projects\Finances\Bank statements'
directories = [
        entry for entry in os.listdir(banksdirectory)
        if os.path.isdir(os.path.join(banksdirectory, entry))
        and entry not in banksdirectory
    ]

combined_dfs = {}
for folder in directories: 
    files = [
            file for file in os.listdir(os.path.join(banksdirectory, folder))
            if file.endswith(".csv") or file.endswith(".csv")
        ]
    
    column_order = ['Date', 'Description', 'Debit', 'Credit', 'Balance'] #'Value_date',  'Ref_No', 

    global columns_to_convert_to_numeric
    # List of column names to convert to numeric
    columns_to_convert_to_numeric = ['Debit', 'Credit', 'Balance']


    # Create an empty DataFrame with specified column names
    combined_dfs[f'{folder}_combined_df'] = pd.DataFrame(columns=column_order)
    ####  f'{folder}_combined_df' = pd.DataFrame(columns=column_order)

    for file in files:
        file_path = os.path.join(os.path.join(banksdirectory, folder,file))

        try:

            df = pd.read_csv(file_path, header=0, dtype =str)
            df = df[column_order]
            # Remove leading and trailing whitespace from all string columns
            for column in df.select_dtypes(include=['object']).columns:
                    df[column] = df[column].str.strip()
            #print(f"file {file_path}: \n {df[pd.to_datetime(df['Date'],format="%d-%b-%y", errors='coerce', dayfirst=True).isna()]}")
            if folder == 'sbi':
                df = sbi_csv_processing(df)
            
            if folder == 'hdfc':
                df = hdfc_processing(df)
            
            if folder == 'au':
                df = au_processing(df)
            
            # Concatenate DataFrames and store the result back in the dictionary
            combined_dfs[f'{folder}_combined_df'] = pd.concat([combined_dfs[f'{folder}_combined_df'], df], ignore_index=True)

        # combined_df = pd.concat([combined_df,df], ignore_index=True)
            
        except Exception as e:
            print (f'error occured in {file_path} :: \n {str(e)}')


#print(df)


  combined_dfs[f'{folder}_combined_df'] = pd.concat([combined_dfs[f'{folder}_combined_df'], df], ignore_index=True)
  combined_dfs[f'{folder}_combined_df'] = pd.concat([combined_dfs[f'{folder}_combined_df'], df], ignore_index=True)
  combined_dfs[f'{folder}_combined_df'] = pd.concat([combined_dfs[f'{folder}_combined_df'], df], ignore_index=True)


##### SBI 

In [141]:
directories

['au', 'hdfc', 'sbi']

In [151]:
combined_dfs['sbi_combined_df']

Unnamed: 0,Date,Description,Debit,Credit,Balance
0,2020-04-01,TO TRANSFER-UPI/DR/009203370221/Paytm Vo/PYTM/...,599.0,0.0,43064.41
1,2020-04-04,TO TRANSFER-UPI/DR/009523309725/MD ZEESHAN/PYT...,590.0,0.0,42474.41
2,2020-04-17,WITHDRAWAL TRANSFER---,2000.0,0.0,40474.41
3,2020-04-17,TO TRANSFER-UPI/DR/010828155387/Compass /BDBL/...,1600.0,0.0,38874.41
4,2020-04-17,TO TRANSFER-UPI/DR/010831396121/Paytm Vo/PYTM/...,45.0,0.0,38829.41
...,...,...,...,...,...
602,2024-03-05,BY TRANSFER-UPI/CR/443102979486/SK AMIRU/HDFC/...,0.0,8000.0,14471.86
603,2024-03-05,ATM WDL-ATM CASH 40650 MUKAI NAGAR HINJAWADI ...,8900.0,0.0,5571.86
604,2024-03-12,TO TRANSFER-UPI/DR/407237239807/BENAGIR /INDB/...,1000.0,0.0,4571.86
605,2024-03-22,BY TRANSFER-UPI/CR/408238491225/SK AMIRU/HDFC/...,0.0,500.0,5071.86


In [147]:
df

Unnamed: 0,Date,Description,Debit,Credit,Balance
0,2023-04-01,TO TRANSFER-UPI/DR/345727007000/UPILITE--,200.0,0.0,32971.45
1,2023-04-03,TO TRANSFER-UPI/DR/309367057391/UPILITE--,200.0,0.0,32771.45
2,2023-04-07,CSH DEP (CDM)-CDM3040106+NEW MARKET THANA 622 ...,0.0,9500.0,42271.45
3,2023-04-21,ATM WDL-ATM CASH 5745 +DALHOUSIE SQUARE 622 K...,2000.0,0.0,40271.45
4,2023-04-21,TO TRANSFER-UPI/DR/311124884846/UPILITE--,200.0,0.0,40071.45
...,...,...,...,...,...
83,2024-03-05,BY TRANSFER-UPI/CR/443102979486/SK AMIRU/HDFC/...,0.0,8000.0,14471.86
84,2024-03-05,ATM WDL-ATM CASH 40650 MUKAI NAGAR HINJAWADI ...,8900.0,0.0,5571.86
85,2024-03-12,TO TRANSFER-UPI/DR/407237239807/BENAGIR /INDB/...,1000.0,0.0,4571.86
86,2024-03-22,BY TRANSFER-UPI/CR/408238491225/SK AMIRU/HDFC/...,0.0,500.0,5071.86


In [131]:
sbidf = sbi_csv_processing(df)

In [132]:
sbidf

Unnamed: 0,Date,Description,Debit,Credit,Balance
0,2023-04-01,TO TRANSFER-UPI/DR/345727007000/UPILITE--,200.0,0.0,32971.45
1,2023-04-03,TO TRANSFER-UPI/DR/309367057391/UPILITE--,200.0,0.0,32771.45
2,2023-04-07,CSH DEP (CDM)-CDM3040106+NEW MARKET THANA 622 ...,0.0,9500.0,42271.45
3,2023-04-21,ATM WDL-ATM CASH 5745 +DALHOUSIE SQUARE 622 K...,2000.0,0.0,40271.45
4,2023-04-21,TO TRANSFER-UPI/DR/311124884846/UPILITE--,200.0,0.0,40071.45
...,...,...,...,...,...
83,2024-03-05,BY TRANSFER-UPI/CR/443102979486/SK AMIRU/HDFC/...,0.0,8000.0,14471.86
84,2024-03-05,ATM WDL-ATM CASH 40650 MUKAI NAGAR HINJAWADI ...,8900.0,0.0,5571.86
85,2024-03-12,TO TRANSFER-UPI/DR/407237239807/BENAGIR /INDB/...,1000.0,0.0,4571.86
86,2024-03-22,BY TRANSFER-UPI/CR/408238491225/SK AMIRU/HDFC/...,0.0,500.0,5071.86


In [122]:
df['Debit'] = df['Debit'].str.replace(",","")
#df['Debit'] = df[column].str.replace("","0")
df['Debit'] = pd.to_numeric(df['Debit'],errors='coerce')



# Replace empty strings with NaN in rows where the conversion resulted in NaN
#df.loc[nan_rows & (df['Debit'].astype(str) == ''), 'Date'] = 0.0

In [123]:
df

Unnamed: 0,Date,Description,Debit,Credit,Balance
0,01-Apr-23,TO TRANSFER-UPI/DR/345727007000/UPILITE--,200.0,,32971.45
1,03-Apr-23,TO TRANSFER-UPI/DR/309367057391/UPILITE--,200.0,,32771.45
2,07-Apr-23,CSH DEP (CDM)-CDM3040106+NEW MARKET THANA 622 ...,0.0,9500.00,42271.45
3,21-Apr-23,ATM WDL-ATM CASH 5745 +DALHOUSIE SQUARE 622 K...,2000.0,,40271.45
4,21-Apr-23,TO TRANSFER-UPI/DR/311124884846/UPILITE--,200.0,,40071.45
...,...,...,...,...,...
83,05-Mar-24,BY TRANSFER-UPI/CR/443102979486/SK AMIRU/HDFC/...,0.0,8000.00,14471.86
84,05-Mar-24,ATM WDL-ATM CASH 40650 MUKAI NAGAR HINJAWADI ...,8900.0,,5571.86
85,12-Mar-24,TO TRANSFER-UPI/DR/407237239807/BENAGIR /INDB/...,1000.0,,4571.86
86,22-Mar-24,BY TRANSFER-UPI/CR/408238491225/SK AMIRU/HDFC/...,0.0,500,5071.86


#### Using consolidated parquet file

In [13]:
import os
import numpy as np 
import pandas as pd
banksdirectory = r'C:\Projects\Finances\Bank statements'
files = [ file for file in os.listdir(banksdirectory) if file.endswith(".parquet")]

In [4]:
files

['au_consolidated_2024-05-14.parquet',
 'hdfc_consolidated_2024-05-14.parquet',
 'sbi_consolidated_2024-05-14.parquet']

In [5]:
audf = pd.read_parquet(os.path.join(banksdirectory,files[0]))

In [6]:
audf 

Unnamed: 0,Date,Description,Debit,Credit,Balance
0,06-Jun-23,UPI/CR/352327922572/SK AMIRUL ISLAM/HDFC/50100...,-,1,1.00
1,06-Jun-23,UPI/CR/352327958541/SK AMIRUL ISLAM/HDFC/50100...,-,44164,44165.00
2,06-Jun-23,INITIAL PAYIN FD2303262025011831/1 SK. AMIRUL ...,42843,-,1322.00
3,07-Jun-23,UPI/CR/315807326453/SK AMIRUL ISLAM/HDFC/50100...,-,160,1482.00
4,08-Jun-23,UPI/CR/352516772665/SK AMIRUL ISLAM/HDFC/50100...,-,1129,2611.00
...,...,...,...,...,...
160,27-Mar-24,UPI/DR/408715163616/FARHEEN HALDER/PUNB/071801...,15000,-,518331.96
161,27-Mar-24,UPI/DR/408715351666/BENAGIR KHATUN/INDB/100081...,15000,-,503331.96
162,27-Mar-24,UPI/DR/408715845500/PUJA MAITY/HDFC/5010034853...,15000,-,488331.96
163,27-Mar-24,UPI/DR/408715168548/MOHAMMED HAMZA ZAKARIA/AUB...,15000,-,473331.96


In [9]:
audf = au_processing(audf)

In [11]:
audf.dtypes

Date           datetime64[ns]
Description            object
Debit                 float64
Credit                float64
Balance               float64
dtype: object

##### SBI

In [14]:
df = pd.read_parquet(os.path.join(banksdirectory,files[2]))

In [23]:
df[pd.to_datetime(df['Date'],format="%d-%b-%y", errors='coerce', dayfirst=True).isna()]

Unnamed: 0,Date,Description,Debit,Credit,Balance
224,4 Apr 2021\t4 Apr 2021\t ATM WDL-ATM CASH 85...,858.59,,,
225,4 Apr 2021\t4 Apr 2021\t BY TRANSFER-INB IMP...,858.59,,,
226,5 Apr 2021\t5 Apr 2021\t DEBIT-REVR_RFND ...,648.83,,,
227,6 Apr 2021\t6 Apr 2021\t TO TRANSFER-UPI/DR/...,,,,
228,7 Apr 2021\t7 Apr 2021\t BY TRANSFER-UPI/CR/...,,,,
...,...,...,...,...,...
515,21 Mar 2023\t21 Mar 2023\t TO TRANSFER-UPI/D...,344.45,,,
516,25 Mar 2023\t25 Mar 2023\t CREDIT INTEREST--...,,,,
517,26 Mar 2023\t26 Mar 2023\t ATM WDL-ATM CASH ...,,,,
518,31 Mar 2023\t31 Mar 2023\t ATM WDL-ATM CASH ...,171.45,,,
