## SOURCE TO BRONZE LAYER

### Process

1. **Determine Date Range**: For each holding, the start and end dates (`min_date`, `max_date`) are read from the stock holding records file; the end of each month's range is capped at today's date.
2. **Generate Month List**: Creates a list of `(year, month)` pairs from the holding's start date to its end date.
3. **Data Fetching and Processing**:
   - For each month in the list, the function fetches daily stock data using the Yahoo Finance API (`yfinance`).
   - If data is retrieved successfully, it is processed by:
     - Resetting the index to include the date as a column.
     - Replacing punctuation in column names for consistency.
     - Fixing duplicate column names by appending numerical suffixes.
   - If no data is fetched for a month, a warning is logged.
4. **Directory and File Handling**:
   - Ensures that a directory for each year and month exists or creates it if necessary.
   - Saves the processed data to a CSV file in the appropriate year- and month-specific directory (`BRONZE/StockData/{year}/{month}/{stock_name}.csv`).
5. **Error Handling**: Logs any errors encountered during the fetching, processing, or saving of data.


In [101]:
# Import necessary libraries and utility functions
from datetime import datetime, timedelta

import pandas as pd
import yfinance as yf

from PortfolioTracker.globalpath import global_path
from PortfolioTracker.logger import logger
from PortfolioTracker.utilities import (
    fix_duplicate_column_names,
    replace_punctuation_from_columns,
)

In [102]:
def generate_month_list(start_date, end_date):
    """
    List every (year, month) pair touched by the range start_date..end_date.

    Both arguments are datetime-like objects (they must support comparison,
    `.year`, `.month` and `.replace`). The month containing `start_date` is
    always the first entry when `start_date <= end_date`; if `start_date`
    is later than `end_date` the result is an empty list.

    Returns:
        list of (year, month) tuples in chronological order.
    """
    logger.info(f"{start_date}, {end_date}")

    months = []
    cursor = start_date
    while cursor <= end_date:
        months.append((cursor.year, cursor.month))

        # Step to the first day of the following month; December wraps
        # around to January of the next year
        wraps_year = cursor.month == 12
        following_year = cursor.year + 1 if wraps_year else cursor.year
        following_month = 1 if wraps_year else cursor.month + 1
        cursor = cursor.replace(
            year=following_year, month=following_month, day=1
        )

    logger.info(months)
    return months


def get_first_and_last_date(year, month):
    """
    Return the first and last day of the given month as datetime objects.

    The last day is found by taking the first day of the following month
    and stepping back one day. If that day lies in the future, the current
    date and time (`datetime.today()`) is returned in its place, so the
    second value never exceeds "now".

    Returns:
        tuple of (first_date, last_date).
    """
    month_start = datetime(year, month, 1)

    # Work out which month follows; December rolls over into next January
    following_year, following_month = (
        (year + 1, 1) if month == 12 else (year, month + 1)
    )
    month_end = datetime(following_year, following_month, 1) - timedelta(
        days=1
    )

    # Never report an end date later than the present moment
    return month_start, min(month_end, datetime.today())


# start_date, end_date = get_first_and_last_date(2024,8)
# end_date < datetime.today()

In [103]:
def process_file(isin_no: str, stock_name: str, month_list: list) -> None:
    """
    Fetch daily price history month by month and save each month to a CSV file.

    Args:
        isin_no: Identifier handed to `yf.Ticker` to look up the instrument.
        stock_name: Symbol used in log messages and as the CSV file name.
        month_list: (year, month) tuples to fetch, e.g. the output of
            `generate_month_list`.

    For every month the data is written to
    `BRONZE/StockData/{year}/{month}/{stock_name}.csv`. A month that is
    already over and whose file exists is skipped; the current month is
    always fetched again so its file picks up newly available days.
    Any exception raised while handling a month is logged and processing
    continues with the next month.
    """
    # Log the start of processing for the given symbol
    logger.info(f"\n\nStarting data processing for symbol: {stock_name}")

    # Created on first use and then reused, so months that are skipped cost
    # nothing and the ticker is not rebuilt for every month
    stock = None

    # Fetch historical stock data for each month
    for year, month in month_list:
        try:
            start_date, end_date = get_first_and_last_date(year, month)

            # Determine the output file path in the bronze layer
            output_file = global_path.make_path(
                f"BRONZE/StockData/{year}/{month}/{stock_name}.csv"
            )

            # A month is final only once the calendar has moved past it.
            # Comparing end_date with datetime.today() cannot tell: for the
            # current month end_date is itself an earlier "now", so that
            # comparison is true and the file would never be refreshed.
            today = datetime.today()
            month_is_over = (year, month) < (today.year, today.month)
            if month_is_over and output_file.exists():
                continue

            if stock is None:
                stock = yf.Ticker(isin_no)

            # yfinance treats `end` as exclusive, so extend it by one day to
            # keep the last day of the month in the result
            df = stock.history(
                start=start_date,
                end=end_date + timedelta(days=1),
                interval="1d",
            )

            # Check if the DataFrame is empty
            if df.empty:
                logger.warning(
                    f"No data fetched for symbol: {stock_name} [{isin_no}] for year: {year} and month : {month}"
                )
                continue

            # Reset the index to ensure date is a column
            df = df.reset_index()

            # Replace punctuation in column names for consistency
            df = replace_punctuation_from_columns(df)

            # Fix duplicate column names by appending numerical suffixes
            df = fix_duplicate_column_names(df)

            # Save the processed DataFrame to a CSV file
            df.to_csv(output_file, index=False)

            # Log successful processing and saving of data
            logger.info(
                f"Data for {stock_name} successfully processed and saved to {output_file}"
            )

        except Exception as e:
            # Log the failure and carry on with the remaining months
            logger.error(
                f"Error processing data for {stock_name} for year {year} and month {month}: {e}"
            )

In [104]:
# Load the holding records; the columns used below are isin_no, symbol,
# min_date and max_date
df_stock_holding_records = pd.read_csv(
    global_path.stock_holding_records_file_path
)

# Parse both date columns so they can be compared and stepped month by month
for date_column in ("min_date", "max_date"):
    df_stock_holding_records[date_column] = pd.to_datetime(
        df_stock_holding_records[date_column]
    )

# Fetch and store the price history of every holding, one month at a time
for index, row in df_stock_holding_records.iterrows():
    isin, symbol = row["isin_no"], row["symbol"]
    holding_months = generate_month_list(row["min_date"], row["max_date"])
    process_file(isin_no=isin, stock_name=symbol, month_list=holding_months)

2024-08-10T03:31:58Z - INFO - 2020-05-05 00:00:00, 2020-06-14 00:00:00
2024-08-10T03:31:58Z - INFO - [(2020, 5), (2020, 6)]
2024-08-10T03:31:58Z - INFO - 

Starting data processing for symbol: BHAGERIA
2024-08-10T03:31:58Z - INFO - 2021-08-06 00:00:00, 2022-08-09 00:00:00
2024-08-10T03:31:58Z - INFO - [(2021, 8), (2021, 9), (2021, 10), (2021, 11), (2021, 12), (2022, 1), (2022, 2), (2022, 3), (2022, 4), (2022, 5), (2022, 6), (2022, 7), (2022, 8)]
2024-08-10T03:31:58Z - INFO - 

Starting data processing for symbol: BPCL
2024-08-10T03:31:58Z - INFO - 2021-02-09 00:00:00, 2021-06-03 00:00:00
2024-08-10T03:31:58Z - INFO - [(2021, 2), (2021, 3), (2021, 4), (2021, 5), (2021, 6)]
2024-08-10T03:31:58Z - INFO - 

Starting data processing for symbol: GOLDBEES
2024-08-10T03:31:58Z - DEBUG - Starting new HTTPS connection (1): query2.finance.yahoo.com:443
2024-08-10T03:31:58Z - DEBUG - https://query2.finance.yahoo.com:443 "GET /v1/finance/search?q=INF204KB17I5 HTTP/1.1" 200 2223
2024-08-10T03:31:58Z

In [107]:
# Show every CSV file found anywhere below the bronze stock-data directory
[*global_path.stockdata_bronze_layer_path.glob("**/*.csv")]

[WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/10/IDEA.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/10/PNB.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/10/TATAMOTORS.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/10/YESBANK.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/11/IDEA.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/11/PNB.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/11/TATAMOTORS.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/11/YESBANK.csv'),
 WindowsPath('C:/Users/prashant.tripathi/Code/PortfolioTracker/DATA/BRONZE/StockData/2020/12/IDEA.csv'),
 WindowsPath('C:/Users/prashant.tripath