In [1]:
import sys
import os
import pandas as pd
import pdfplumber
from src.exception import CustomException
from src.ETL_manager import DataIngestor
from src.ETL_manager import DataProcessor
from src.inputs import mapping
from src.inputs.column_structure import get_cols
import warnings
# from src.logger import logging
import logging
# Hide pandas' SettingWithCopyWarning for the rest of the session.
# NOTE(review): presumably triggered inside the src.ETL_manager helpers -- confirm, and
# prefer fixing the chained assignment there over silencing the warning globally.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

class DataTransformer():
    """Run one statement file through the ingestion and processing steps.

    The steps, in order, are ``DataIngestor.read_statement``,
    ``DataIngestor.clean_features``, ``DataProcessor.remap_transactions``,
    ``DataProcessor.map_expense_group`` and ``DataProcessor.__rearrange_cols__``.
    """

    def __init__(self, input_file:str, col_name:list, data_name:str, 
                 transaction_col_name:str):
        """Store the pipeline configuration.

        Args:
            input_file: Path of the statement file relative to the current
                working directory, e.g. ``'/raw_file/statement.csv'``. A leading
                path separator is optional.
            col_name: Column names forwarded to ``DataIngestor.read_statement``.
            data_name: Dataset name forwarded to ``DataIngestor.clean_features``.
            transaction_col_name: Name of the transaction column forwarded to
                ``DataProcessor.remap_transactions``.
        """
        # Leading separators are stripped so os.path.join does not treat the
        # argument as absolute and discard the working directory; this also makes
        # 'raw_file/x.csv' and '/raw_file/x.csv' resolve to the same file.
        self.input_file = os.path.join(os.getcwd(), input_file.lstrip('/\\'))
        self.col_name = col_name
        self.data_name = data_name
        self.transaction_col_name = transaction_col_name

    def transform_data_to_df(self):
        """Execute every pipeline step and return the final result.

        Returns:
            The object returned by ``DataProcessor.__rearrange_cols__``
            (a DataFrame, judging by how the notebook uses it).
        """
        df = DataIngestor.read_statement(file_path=self.input_file,
                                         col_names=self.col_name)
        logging.info('Data Ingestion Completed..................')

        df = DataIngestor.clean_features(data=df, data_name=self.data_name)
        logging.info('Data Cleaning Completed...................')

        df = DataProcessor.remap_transactions(
            df, transaction_col_name=self.transaction_col_name)
        logging.info('Data Remap Transaction Completed..........')

        df = DataProcessor.map_expense_group(df)
        logging.info('Mapping Expense Group.....................')

        # The name both starts and ends with double underscores, so Python does
        # not apply private-name mangling to it inside this class body.
        df = DataProcessor.__rearrange_cols__(df)
        logging.info('Rearrange Columns.......')

        return df

# Expense statement (CSV). DataTransformer prefixes the path with the current
# working directory, so the file is expected under <cwd>/raw_file/.
data_trasformer = DataTransformer(
    input_file='/raw_file/915010018577756.csv',
    col_name=get_cols.col1(),
    data_name='Expense',
    transaction_col_name='Particulars',
)

# Run the whole pipeline; the result is inspected in the following cells.
transform_df = data_trasformer.transform_data_to_df()

[ 2024-10-19 18:48:07,581 ] 17 root - INFO - Determining the file extension
[ 2024-10-19 18:48:07,582 ] 19 root - INFO - File extension found: .csv
[ 2024-10-19 18:48:07,582 ] 23 root - INFO - Reading file.................
[ 2024-10-19 18:48:07,587 ] 35 root - INFO - Finding the footer index...............
[ 2024-10-19 18:48:07,588 ] 28 root - INFO - Data Ingestion Completed..................
[ 2024-10-19 18:48:07,588 ] 89 root - INFO - Dataset Name is - Expense, dropping the unmatched columns accordingly
[ 2024-10-19 18:48:07,589 ] 67 root - INFO - Dropping the column CHQ No
  data[['Debit', 'Credit', 'Balance']] = data[['Debit', 'Credit', 'Balance']].apply(pd.to_numeric, errors='ignore')
[ 2024-10-19 18:48:07,597 ] 33 root - INFO - Data Cleaning Completed...................
[ 2024-10-19 18:48:07,598 ] 126 root - INFO - Remapping Transactions............
[ 2024-10-19 18:48:07,601 ] 38 root - INFO - Data Remap Transaction Completed..........
[ 2024-10-19 18:48:07,602 ] 159 root - INFO 

In [2]:
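# HDFC Bank -- manual ingestion only (disabled). A later cell runs the same
# file through DataTransformer, which performs this read_statement step itself.
# input_file = r'/raw_file/hdfc_statement.xlsx'
# input_file = os.getcwd()+input_file
# col_names_2 = get_cols.col2()
# df_1  = DataIngestor.read_statement(input_file, col_names=col_names_2)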

In [3]:
# Show the frame returned by the Expense pipeline run above (bare last
# expression, so the notebook renders the rich DataFrame repr).
transform_df

Unnamed: 0,Date,Particulars,exp_name,exp_maps,Debit,Credit,Balance
0,19-09-2024,Mr S Chandran,,FoodRegular,40.00,,443.64
1,19-09-2024,MANIGANDAN,,Flower,20.00,,423.64
2,19-09-2024,ASHRAF U K,,Grocery Regular,30.00,,393.64
3,19-09-2024,HungerBox Hunger,Hunger,Office Food,82.00,,311.64
4,19-09-2024,RAMBABU YADAV,,Others,10.00,,301.64
...,...,...,...,...,...,...,...
59,24-09-2024,HungerBox Hunger,Hunger,Office Food,53.00,,178.14
60,24-09-2024,RAMBABU YADAV,,Others,10.00,,168.14
61,24-09-2024,RAM MEDICAL medici,medici,Medical,25.00,,143.14
62,24-09-2024,sri annapoorneshwary tea,tea,Tea & Others,30.00,,113.14


In [4]:
# Check the column dtypes of the Expense result.
# NOTE(review): Debit and Credit are reported as object here rather than float64 --
# possibly values that pd.to_numeric(errors='ignore') could not convert in the
# cleaning step; TODO confirm before doing arithmetic on these columns.
transform_df.dtypes

Date            object
Particulars     object
exp_name        object
exp_maps        object
Debit           object
Credit          object
Balance        float64
dtype: object

In [5]:
# Salary statement (xlsx). This rebinds both data_trasformer and transform_df,
# so the Expense results from the earlier run are no longer reachable by name.
data_trasformer = DataTransformer(
    input_file='/raw_file/hdfc_statement.xlsx',
    col_name=get_cols.col2(),
    data_name='Salary',
    transaction_col_name='Particulars',
)

# Run the whole pipeline on the xlsx statement.
transform_df = data_trasformer.transform_data_to_df()

[ 2024-10-19 18:48:15,600 ] 17 root - INFO - Determining the file extension
[ 2024-10-19 18:48:15,600 ] 19 root - INFO - File extension found: .xlsx
[ 2024-10-19 18:48:15,601 ] 27 root - INFO - Reading file.................
[ 2024-10-19 18:48:15,713 ] 35 root - INFO - Finding the footer index...............
[ 2024-10-19 18:48:15,713 ] 28 root - INFO - Data Ingestion Completed..................
[ 2024-10-19 18:48:15,714 ] 93 root - INFO - Dataset Name is - Salary, dropping the unmatched columns accordingly
[ 2024-10-19 18:48:15,715 ] 61 root - INFO - Null values found......Dropping the Null values......
[ 2024-10-19 18:48:15,716 ] 64 root - INFO - * found in Date column... Dropping rows with *.
[ 2024-10-19 18:48:15,717 ] 67 root - INFO - Dropping the column CHQ No
  data[['Debit', 'Credit', 'Balance']] = data[['Debit', 'Credit', 'Balance']].apply(pd.to_numeric, errors='ignore')
[ 2024-10-19 18:48:15,718 ] 33 root - INFO - Data Cleaning Completed...................
[ 2024-10-19 18:48:15

In [6]:
# Check the column dtypes of the most recent transform_df, i.e. the Salary
# (hdfc_statement.xlsx) run in the previous code cell.
transform_df.dtypes

Date            object
Particulars     object
exp_name        object
exp_maps        object
Debit          float64
Credit         float64
Balance        float64
dtype: object