## SOURCE TO BRONZE LAYER

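> This notebook fetches daily historical price data for a fixed set of symbols from Yahoo Finance (via `yfinance`), harmonizes the column names, and saves one CSV file per symbol to the bronze layer.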


In [1]:
# Import necessary libraries and utility functions
import yfinance as yf
from common.utilities import (
    fix_duplicate_column_names,
    global_path,
    replace_punctuation_from_columns,
    logger,
)

In [2]:
def process_file(
    symbol: str,
    filename: str,
    start: str = "2020-01-01",
    interval: str = "1d",
) -> None:
    """
    Fetches historical data for a symbol from Yahoo Finance and saves it as a
    CSV file in the bronze layer.

    The index of the fetched DataFrame is turned into a regular column, the
    column names are passed through the project's clean-up helpers, and the
    result is written without the DataFrame index.

    Parameters:
    - symbol (str): The Yahoo Finance ticker symbol to fetch data for.
    - filename (str): The filename to save the processed data as. It is joined
      onto `global_path.stockdata_bronze_layer_path`.
    - start (str): Start date of the history window, forwarded unchanged to
      `Ticker.history`. Defaults to "2020-01-01".
    - interval (str): Bar interval, forwarded unchanged to `Ticker.history`.
      Defaults to "1d".

    Returns:
    - None. If no data is returned for the symbol, a warning is logged and no
      file is written.
    """

    # Log the start of processing for the given symbol
    logger.info(f"Starting data processing for symbol: {symbol}")

    # Fetch historical data from Yahoo Finance for the requested window
    stock = yf.Ticker(symbol)
    df = stock.history(start=start, interval=interval)

    # Nothing to save: warn and bail out before touching the filesystem
    if df.empty:
        logger.warning(f"No data fetched for symbol: {symbol}")
        return

    # Move the index into a regular column so it is kept when the CSV is
    # written with index=False below
    df = df.reset_index()

    # Project helper; presumably normalises punctuation in column names
    # (behaviour defined in common.utilities - TODO confirm)
    df = replace_punctuation_from_columns(df)

    # Project helper; presumably disambiguates repeated column names
    # (behaviour defined in common.utilities - TODO confirm)
    df = fix_duplicate_column_names(df)

    # Determine the output file path in the bronze layer
    output_file = global_path.stockdata_bronze_layer_path.joinpath(filename)

    # Save the processed DataFrame to a CSV file
    df.to_csv(output_file, index=False)

    # Log successful processing and saving of data
    logger.info(
        f"Data for {symbol} successfully processed and saved to {output_file}"
    )

In [3]:
# Mapping of Yahoo Finance ticker symbols to the CSV filename written to the
# bronze layer for each symbol. The two ".BO" entries are stored under
# descriptive ".MF.csv" names; every ".NS" entry is stored as "<symbol>.csv".
symbols_to_files = {
    "0P00017844.BO": "MIRAE-ASSET-TAX-SAVER-DIRECT-GROWTH.MF.csv",
    "0P0000XVL9.BO": "SBI-MAGNUM-TAXGAIN-SCHEME-DIR-GR.MF.csv",
    "BHAGERIA.NS": "BHAGERIA.NS.csv",
    "BPCL.NS": "BPCL.NS.csv",
    "GOLDBEES.NS": "GOLDBEES.NS.csv",
    "HERANBA.NS": "HERANBA.NS.csv",
    "IDEA.NS": "IDEA.NS.csv",
    "INFY.NS": "INFY.NS.csv",
    "IRCTC.NS": "IRCTC.NS.csv",
    "KPITTECH.NS": "KPITTECH.NS.csv",
    "LICI.NS": "LICI.NS.csv",
    "NIFTYBEES.NS": "NIFTYBEES.NS.csv",
    "PNB.NS": "PNB.NS.csv",
    "SBIN.NS": "SBIN.NS.csv",
    "TATACHEM.NS": "TATACHEM.NS.csv",
    "TATAMOTORS.NS": "TATAMOTORS.NS.csv",
    "TATAPOWER.NS": "TATAPOWER.NS.csv",
    "VOLTAS.NS": "VOLTAS.NS.csv",
    "YESBANK.NS": "YESBANK.NS.csv",
}

# Process every symbol independently: a failure for one symbol is logged and
# the loop carries on with the remaining symbols.
for symbol, filename in symbols_to_files.items():
    try:
        process_file(symbol, filename)
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback, so
        # the cause of the failure can be diagnosed from the log output.
        logger.exception(f"Error processing data for {symbol}: {e}")

2024-08-07T11:43:14Z - INFO - Starting data processing for symbol: 0P00017844.BO
2024-08-07T11:43:14Z - DEBUG - Entering history()
2024-08-07T11:43:15Z - DEBUG - 0P00017844.BO: Yahoo GET parameters: {'period1': '2020-01-01 00:00:00+05:30', 'period2': '2024-08-07 11:43:15+05:30', 'interval': '1d', 'includePrePost': False, 'events': 'div,splits,capitalGains'}
2024-08-07T11:43:15Z - DEBUG - Starting new HTTPS connection (1): query2.finance.yahoo.com:443
2024-08-07T11:43:15Z - DEBUG - https://query2.finance.yahoo.com:443 "GET /v8/finance/chart/0P00017844.BO?period1=1577817000&period2=1723011195&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains HTTP/1.1" 200 None
2024-08-07T11:43:15Z - DEBUG - 0P00017844.BO: yfinance received OHLC data: 2020-01-01 03:45:00 -> 2024-08-07 03:45:00
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
2024-08-07T11:43:15Z - DEBUG - 0P00017844.BO: OHLC after cleaning: 2020-01-01 09:15:00+05:30 -> 2024-08-07 09:15:00+05:30
2024-08-07T11:43:15Z