## SOURCE TO BRONZE LAYER

> This notebook reads the raw Excel trade-history files from the source layer, harmonizes them, and saves the results as CSV files in the bronze layer.

In [20]:
# Import necessary libraries and utility functions
import pandas as pd
import pathlib
from common_utilities import (
    check_files_availability,
    find_correct_headers,
    find_correct_sheetname,
    fix_duplicate_column_names,
    global_path,
    replace_punctuation_from_columns,
    logger
)

In [21]:
# Define a function to read and process an Excel file
def read_file(file_path: pathlib.Path) -> None:
    """
    Read one Excel workbook, harmonize it, and save it to the bronze layer as CSV.

    Every sheet of the workbook is loaded without a header row and with the
    last row of each sheet skipped (``skipfooter=1``). The helpers imported
    from ``common_utilities`` then select the sheet and header row and tidy
    the column names. Rows that are entirely NaN are dropped, and the result
    is written under ``global_path.tradehistory_bronze_layer_path`` using the
    input file's name with its extension swapped for ``.csv``.

    Any exception raised along the way is logged and not re-raised, so the
    function always returns normally.

    Args:
        file_path (pathlib.Path): The path to the Excel file to be processed.

    Returns:
        None. The outcome is a CSV file on disk plus log messages.
    """
    # Log the start of processing for the file
    logger.info(f"Processing => {file_path}")

    try:
        # sheet_name=None loads every sheet into a dict keyed by sheet name;
        # header=None keeps the raw rows so the real header can be located later.
        sheets = pd.read_excel(
            file_path, engine="openpyxl", sheet_name=None, header=None, skipfooter=1
        )

        # NOTE(review): the helper behaviour described below is taken from the
        # helper names and arguments; the implementations live in common_utilities.

        # Select the sheet whose name matches "trade"
        df = find_correct_sheetname(sheets, sheet_name_regex="trade")

        # Locate the header row matching "date" and apply it
        df = find_correct_headers(df, global_header_regex="date")

        # Replace punctuation in column names for consistency
        df = replace_punctuation_from_columns(df)

        # Make duplicate column names unique
        df = fix_duplicate_column_names(df)

        # Drop rows where all elements are NaN
        df = df.dropna(how="all")

        # Swap only the extension for ".csv". A plain str.replace("xlsx", "csv")
        # would also rewrite "xlsx" appearing elsewhere in the name and would
        # miss an upper-case ".XLSX" extension.
        output_file = global_path.tradehistory_bronze_layer_path.joinpath(
            file_path.with_suffix(".csv").name
        )
        df.to_csv(output_file, index=False)

        # Log successful processing of the file
        logger.info(f"Processed => {output_file}")

    except Exception as e:
        # Best effort: report the failure and return instead of propagating it
        logger.error(f"Failed to process {file_path} due to error: {e}")

In [22]:
# Look up the trade-history workbooks ("trade_*.xlsx") in the source layer
source_layer_dir = global_path.tradehistory_source_layer_path
file_paths = check_files_availability(source_layer_dir, file_pattern="trade_*.xlsx")

# Harmonize each workbook in turn; read_file logs its own failures
for file_path in file_paths:
    read_file(file_path)

2024-08-01T11:38:42Z - INFO - Number of Files Detected: 5
2024-08-01T11:38:42Z - INFO - Processing => C:\Users\prashant.tripathi\Code\Upstox\DATA\SOURCE\TradeHistory\trade_2021.xlsx
2024-08-01T11:38:42Z - INFO - Sheet name => TRADE
2024-08-01T11:38:42Z - INFO - Processed => C:\Users\prashant.tripathi\Code\Upstox\DATA\BRONZE\TradeHistory\trade_2021.csv
2024-08-01T11:38:42Z - INFO - Processing => C:\Users\prashant.tripathi\Code\Upstox\DATA\SOURCE\TradeHistory\trade_2122.xlsx
2024-08-01T11:38:42Z - INFO - Sheet name => TRADE
2024-08-01T11:38:42Z - INFO - Processed => C:\Users\prashant.tripathi\Code\Upstox\DATA\BRONZE\TradeHistory\trade_2122.csv
2024-08-01T11:38:42Z - INFO - Processing => C:\Users\prashant.tripathi\Code\Upstox\DATA\SOURCE\TradeHistory\trade_2223.xlsx
2024-08-01T11:38:42Z - INFO - Sheet name => TRADE
2024-08-01T11:38:42Z - INFO - Processed => C:\Users\prashant.tripathi\Code\Upstox\DATA\BRONZE\TradeHistory\trade_2223.csv
2024-08-01T11:38:42Z - INFO - Processing => C:\Users\p

<class 'pandas.core.frame.DataFrame'>
Index: 18 entries, 9 to 26
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             18 non-null     object
 1   company          18 non-null     object
 2   amount           18 non-null     object
 3   exchange         18 non-null     object
 4   segment          18 non-null     object
 5   scrip_code       18 non-null     object
 6   instrument_type  18 non-null     object
 7   strike_price     18 non-null     object
 8   expiry           0 non-null      object
 9   trade_num        17 non-null     object
 10  trade_time       17 non-null     object
 11  side             18 non-null     object
 12  quantity         18 non-null     object
 13  price            18 non-null     object
dtypes: object(14)
memory usage: 2.1+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 9 to 22
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype