## SOURCE TO BRONZE LAYER

> This notebook reads the raw TradeHistory Excel files from the source layer, harmonizes them, and writes the result to the bronze layer as CSV files.


In [None]:
import json

# Third-party and project imports
import pandas as pd

from StockETL.globalpath import GlobalPath

In [None]:
# Import necessary libraries and utility functions
%run ../COMMON/common_utility.ipynb

In [None]:
# Instantiate the GlobalPath objects used throughout this notebook.
# Source-layer folder that is searched for the raw TradeHistory Excel files.
tradehistory_source_layer_path = GlobalPath("DATA/SOURCE/TradeHistory")
# Bronze-layer data contract that read_file() aligns each DataFrame with.
tradehistory_bronze_schema_file_path = GlobalPath(
    "DATA/CONFIG/DATA_CONTRACTS/BRONZE/TradeHistory.json"
)
# JSON file that is loaded into the CORPORATE_NAME_CHANGE lookup.
corporate_name_changes_config_path = GlobalPath(
    "DATA/CONFIG/CONSTANTS/corporate_name_changes.json"
)

### Define a function to read and process an Excel file


In [None]:
def read_file(file_path: GlobalPath) -> pd.DataFrame:
    """
    Load one trade-history Excel workbook and harmonize it into a DataFrame.

    All sheets of the workbook are read without a header row and with the
    last row of each sheet skipped. The shared notebook helpers are then
    applied in order: sheet selection (name pattern "trade"), header
    detection (pattern "date"), column-name clean-up, de-duplication of
    column names, removal of fully empty rows, and alignment with the
    bronze TradeHistory data contract.

    Args:
        file_path (GlobalPath): Location of the Excel workbook to process.

    Returns:
        pd.DataFrame: The harmonized data. Nothing is written to disk here;
        saving is left to the caller.
    """
    print(f"\nProcessing => {file_path}")

    # sheet_name=None asks pandas for every sheet, keyed by sheet name
    workbook = pd.read_excel(
        file_path,
        engine="openpyxl",
        sheet_name=None,
        header=None,
        skipfooter=1,
    )

    # Narrow the workbook down to the sheet whose name matches "trade"
    trades = find_correct_sheetname(workbook, sheet_name_regex="trade")

    # Promote the row matching "date" to be the header row
    trades = find_correct_headers(trades, global_header_regex="date")

    # Normalize the column labels: strip punctuation, then de-duplicate
    trades = replace_punctuation_from_columns(trades)
    trades = fix_duplicate_column_names(trades)

    # Discard rows in which every cell is NaN
    trades = trades.dropna(how="all")

    # Align the DataFrame with the bronze data contract
    return align_with_datacontract(trades, tradehistory_bronze_schema_file_path)

In [None]:
# Lookup of corporate name overrides, loaded from the JSON config file.
# Keys are normalized (lower-cased, surrounding whitespace stripped) so that
# lookups can be case- and whitespace-insensitive.
CORPORATE_NAME_CHANGE = {}
with open(corporate_name_changes_config_path, encoding="utf-8") as config_file:
    CORPORATE_NAME_CHANGE = dict(
        (str(key).lower().strip(), value)
        for key, value in json.load(config_file).items()
    )


def corporate_name_change_fixer(name: str) -> str:
    """
    Return the configured override for a company name, if one exists.

    The lookup lower-cases ``name`` and strips surrounding whitespace, the
    same normalization that is applied to the keys of
    ``CORPORATE_NAME_CHANGE``.

    Args:
        name: Company name as it appears in the trade data.

    Returns:
        The override configured for ``name``, or ``name`` unchanged when no
        override exists or when ``name`` is not a string (for example a
        missing value such as NaN or None).
    """
    # Missing cells arrive as float NaN / None when this is applied to a
    # DataFrame column; they have no .lower(), so pass them through untouched.
    if not isinstance(name, str):
        return name
    return CORPORATE_NAME_CHANGE.get(name.lower().strip(), name)

In [None]:
# Generate file paths for available Excel files in the source layer
file_paths = check_files_availability(
    tradehistory_source_layer_path, file_pattern="trade_*.xlsx"
)

# Process each file path
for file_path in file_paths:
    df = read_file(file_path)

    # The name of the file's parent folder is used as the username
    username = file_path.parent.name
    df["username"] = username

    # Apply the configured corporate name overrides to the company column
    df["company"] = df["company"].apply(corporate_name_change_fixer)

    # Swap only the trailing extension for ".csv". A plain
    # str.replace("xlsx", "csv") would also rewrite "xlsx" occurring
    # elsewhere in the file name. Building the name outside the f-string
    # also avoids nested double quotes, which are a SyntaxError before
    # Python 3.12.
    csv_name = file_path.name.rsplit(".", 1)[0] + ".csv"

    # Save the result as a CSV file in the bronze layer path
    output_filepath = GlobalPath(f"DATA/BRONZE/TradeHistory/{username}/{csv_name}")
    df.to_csv(output_filepath, index=False)
    # Log successful processing of the file
    print(f"Processed to => {output_filepath}")