## SOURCE TO BRONZE LAYER

### Process

1. **Determine Current Year**: The function calculates the current year to use as the end date for data fetching.
2. **Generate Year List**: Creates a list of years from the specified start year (`from_year`) to the current year.
3. **Data Fetching and Processing**:
   - For each year in the list, the function fetches daily stock data using the Yahoo Finance API (`yfinance`).
   - If data is retrieved successfully, it is processed by:
     - Resetting the index to include the date as a column.
     - Replacing punctuation in column names for consistency.
     - Fixing duplicate column names by appending numerical suffixes.
   - If no data is fetched for a year, a warning is logged.
4. **Directory and File Handling**:
   - Ensures that a directory for each year exists or creates it if necessary.
   - Saves the processed data to a CSV file in the appropriate year-specific directory.
5. **Error Handling**: Logs any errors encountered during the fetching, processing, or saving of data.


In [1]:
# Import necessary libraries and utility functions
from datetime import datetime
import yfinance as yf
from common.utilities import (
    fix_duplicate_column_names,
    global_path,
    replace_punctuation_from_columns,
    logger,
)

In [2]:
def process_file(symbol: str, filename: str, from_year: int) -> None:
    """
    Processes historical stock data for a given symbol and saves it to a CSV file.

    Parameters:
    - symbol (str): The stock symbol to fetch data for.
    - filename (str): The filename to save the processed data as.
    - from_year (int): The start year for which to fetch data.
    """
    # Determine the current year
    current_year = datetime.now().year

    # Generate a list of years from the given year to the current year
    years = list(range(from_year, current_year + 1))

    # Log the start of processing for the given symbol
    logger.info(f"Starting data processing for symbol: {symbol}, Years: {years}")

    # Fetch historical stock data for each year
    for yr in years:
        start_date = f"{yr}-01-01"
        end_date = f"{yr}-12-31" if yr < current_year else datetime.now().strftime("%Y-%m-%d")

        try:
            # Fetch historical data from Yahoo Finance for the specified year
            stock = yf.Ticker(symbol)
            df = stock.history(start=start_date, end=end_date, interval="1d")

            # Check if the DataFrame is empty
            if df.empty:
                logger.warning(f"No data fetched for symbol: {symbol} for year: {yr}")
                continue

            # Reset the index to ensure date is a column
            df = df.reset_index()

            # Replace punctuation in column names for consistency
            df = replace_punctuation_from_columns(df)

            # Fix duplicate column names by appending numerical suffixes
            df = fix_duplicate_column_names(df)

            # Ensure the directory exists
            year_dir = global_path.stockdata_bronze_layer_path.joinpath(str(yr))
            year_dir.mkdir(parents=True, exist_ok=True)

            # Determine the output file path in the bronze layer
            output_file = year_dir.joinpath(filename)

            # Save the processed DataFrame to a CSV file
            df.to_csv(output_file, index=False)

            # Log successful processing and saving of data
            logger.info(f"Data for {symbol} successfully processed and saved to {output_file}")

        except Exception as e:
            # Log any errors encountered during processing
            logger.error(f"Error processing data for {symbol} for year {yr}: {e}")

In [3]:
# Dictionary mapping stock symbols to output filenames
symbols_to_files = {
    "0P00017844.BO": "MIRAE-ASSET-TAX-SAVER-DIRECT-GROWTH.MF.csv",
    "0P0000XVL9.BO": "SBI-MAGNUM-TAXGAIN-SCHEME-DIR-GR.MF.csv",
    "BHAGERIA.NS": "BHAGERIA.NS.csv",
    "BPCL.NS": "BPCL.NS.csv",
    "GOLDBEES.NS": "GOLDBEES.NS.csv",
    "HERANBA.NS": "HERANBA.NS.csv",
    "IDEA.NS": "IDEA.NS.csv",
    "INFY.NS": "INFY.NS.csv",
    "IRCTC.NS": "IRCTC.NS.csv",
    "KPITTECH.NS": "KPITTECH.NS.csv",
    "LICI.NS": "LICI.NS.csv",
    "NIFTYBEES.NS": "NIFTYBEES.NS.csv",
    "PNB.NS": "PNB.NS.csv",
    "SBIN.NS": "SBIN.NS.csv",
    "TATACHEM.NS": "TATACHEM.NS.csv",
    "TATAMOTORS.NS": "TATAMOTORS.NS.csv",
    "TATAPOWER.NS": "TATAPOWER.NS.csv",
    "VOLTAS.NS": "VOLTAS.NS.csv",
    "YESBANK.NS": "YESBANK.NS.csv",
}

# Iterate over each symbol and corresponding filename
for symbol, filename in symbols_to_files.items():
    process_file(symbol, filename, from_year = datetime.now().year)

2024-08-07T12:44:13Z - INFO - Starting data processing for symbol: 0P00017844.BO, Years: [2020, 2021, 2022, 2023, 2024]
2024-08-07T12:44:13Z - DEBUG - Entering history()
2024-08-07T12:44:13Z - DEBUG - 0P00017844.BO: Yahoo GET parameters: {'period1': '2020-01-01 00:00:00+05:30', 'period2': '2020-12-31 00:00:00+05:30', 'interval': '1d', 'includePrePost': False, 'events': 'div,splits,capitalGains'}
2024-08-07T12:44:13Z - DEBUG - Starting new HTTPS connection (1): query2.finance.yahoo.com:443
2024-08-07T12:44:13Z - DEBUG - https://query2.finance.yahoo.com:443 "GET /v8/finance/chart/0P00017844.BO?period1=1577817000&period2=1609353000&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains HTTP/1.1" 200 None
2024-08-07T12:44:13Z - DEBUG - 0P00017844.BO: yfinance received OHLC data: 2020-01-01 03:45:00 -> 2020-12-30 03:45:00
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
2024-08-07T12:44:13Z - DEBUG - 0P00017844.BO: OHLC after cleaning: 2020-01-01 09:15:00+05:30 -> 2020-12