# HDFC parser

### Layer 1

In [None]:
import camelot
import pandas as pd
from IPython.display import display
import json
import numpy as np


# Input statement plus the fixed page geometry passed to camelot's stream
# parser: one table region and the x-positions of the column separators
# (camelot `table_areas` / `columns` options, given as comma-joined strings).
pdf_path = '/content/drive/MyDrive/SampleBankPdfs/AC/Hdfc-AC-Statement-p3.pdf'
table_region = ['20,615,640,65']
column_separators = ['60,281.1025000000001,351.1025,421.1025,500,571.446']

tables = camelot.read_pdf(
    pdf_path,
    flavor='stream',
    pages="all",
    table_areas=table_region,
    columns=column_separators,
)

print(f"Number of tables found: {len(tables)}")


def fix_narration(fixed_tables_len, data, previous_tables=None):
    """Merge wrapped narration lines into the transaction row they belong to.

    A row whose column 0 (the date) is NaN or an empty string is treated as a
    continuation line: its column-1 text is appended, with no separator, to
    the narration of the most recent dated row.  Dateless rows that appear
    before the first dated row of ``data`` are taken to continue the last row
    of the previously processed table whenever ``fixed_tables_len > 0``.

    Args:
        fixed_tables_len: Number of tables already processed.  When it is
            greater than zero, the table at position ``fixed_tables_len - 1``
            receives the leading continuation text of ``data``.
        data: Raw table (anything ``pd.DataFrame`` accepts) with integer
            column labels; 0 is read as the date and 1 as the narration.
        previous_tables: List of already processed DataFrames.  Defaults to
            the module-level ``fixed_tables`` list, which is looked up only
            when a cross-page continuation actually has to be written.

    Returns:
        A new DataFrame.  Its first row is always the accumulator record
        (only column 1 set, usually ``''``), followed by one row per dated
        input row.  Because that first record only carries key 1, column 1
        tends to come first in the result, so address columns by label, not
        by position.

    Side effects:
        May modify, in place, the narration cell of the last row of the
        previous table.
    """
    df = pd.DataFrame(data)

    # Accumulator for the row being assembled.  It starts as a narration-only
    # record so that dateless rows seen before any dated row on the very
    # first table still have somewhere to go.
    current_row = {1: ''}
    fixed_rows = []
    seen_dated_row = False

    for _, row in df.iterrows():
        is_continuation = pd.isna(row[0]) or row[0] == ''

        if not is_continuation:
            # A dated row closes the record being assembled and opens a new one.
            fixed_rows.append(current_row)
            current_row = row.to_dict()
            seen_dated_row = True
        elif not seen_dated_row and fixed_tables_len > 0:
            # Every leading dateless row (not just the first one) belongs to
            # the transaction that ended the previous table.  Write by column
            # label so the narration is hit regardless of column order.
            prior = fixed_tables if previous_tables is None else previous_tables
            target = prior[fixed_tables_len - 1]
            target.at[target.index[-1], 1] += row[1]
        else:
            current_row[1] += row[1]

    # The record still being assembled is always non-empty; flush it.
    fixed_rows.append(current_row)

    return pd.DataFrame(fixed_rows)


# Per-page results, in page order.  fix_narration looks this list up by name
# to reach the previous page, so it has to be filled one page at a time
# (a comprehension would only bind the name after all pages were processed).
fixed_tables = []

for page_number, table in enumerate(tables):
    # For visual debugging: camelot.plot(table, kind='contour').show()
    #                       camelot.plot(table, kind='grid').show()
    repaired = fix_narration(page_number, table.df)
    fixed_tables.append(repaired)

# Stack every page into one frame with a fresh 0..n-1 index
df = pd.concat(fixed_tables, ignore_index=True)


# The final narration can have the statement footer glued onto it: keep only
# the text in front of the 'STATEMENT SUMMARY' marker.
summary_marker = 'STATEMENT SUMMARY'
final_narration = df.iloc[-1][1]
if summary_marker in final_narration:
    text_before_marker = final_narration.partition(summary_marker)[0]
    df.at[df.index[-1], 1] = text_before_marker.strip()


# Blank cells become NaN so that rows with no content at all can be dropped
df = df.replace('', np.nan)
df = df.dropna(how='all')
df = df.reset_index(drop=True)

# The first surviving row holds the column titles: promote it to the header
# and remove it from the data.
header_row = df.iloc[0]
df.columns = header_row
df = df.iloc[1:].reset_index(drop=True)

display(df)


# Persist the cleaned table as CSV, without the row index
csv_output_path = '/content/drive/MyDrive/SampleBankPdfs/outputs/hdfc_camelot_stream.csv'
df.to_csv(csv_output_path, index=False)

### Layer 2

In [None]:
# Layer 2: reshape the Layer 1 table into transaction records and print them
# as a JSON array.

# NOTE: df1 is an alias of df, not a copy, so every step below also changes
# df.  Re-run Layer 1 before re-running this cell, because the source columns
# are renamed/dropped here.
df1 = df

# Statement dates are dd/mm/yy; emit them as ISO yyyy-mm-dd strings
df1['Date'] = pd.to_datetime(df1['Date'], format='%d/%m/%y').dt.strftime('%Y-%m-%d')

# 0. A row is a debit exactly when it carries a withdrawal amount
df1['isDebit'] = df1['Withdrawal Amt.'].notna()

# 1. "transactionType" mirrors isDebit: debit -> expense, otherwise income
df1['transactionType'] = df1['isDebit'].map({True: 'expense', False: 'income'})

# 2. "amount" is the withdrawal for debits and the deposit for everything else
df1['amount'] = df1['Withdrawal Amt.'].where(df1['isDebit'], df1['Deposit Amt.'])

# Amounts arrive as text with thousands separators: strip the commas and
# convert to float
df1['amount'] = df1['amount'].replace(',', '', regex=True).astype(float)
df1['Closing Balance'] = df1['Closing Balance'].replace(',', '', regex=True).astype(float)

# 3. Drop the source columns that are not part of the output record
df1.drop(columns=['Withdrawal Amt.', 'Deposit Amt.', 'Value Dt', 'Chq./Ref.No.'], inplace=True)

# 4. Rename the remaining columns to the output field names
df1.rename(columns={'Date': 'date', 'Narration': 'description', 'Closing Balance':'balance'}, inplace=True)


# TODO: how to identify notes in the description, e.g.
#   "Paid to ABC"
#   "Received from ABC"


# Map missing values to None before serialising so json.dumps writes a real
# `null`.  Patching the serialised text instead (replacing 'NaN' with 'null',
# or '\/' with '/') would also rewrite those character sequences when they
# occur inside a description.
data_dict = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in df1.to_dict(orient='records')
]
json_data = json.dumps(data_dict, ensure_ascii=False)

print(json_data)