## SOURCE TO BRONZE LAYER

### Process

1. **Determine Date Range**: For each holding, the date range to fetch is read from the `min_date` and `max_date` columns of the holdings records file.
2. **Generate Month List**: Creates a list of `(year, month)` pairs covering every month from `min_date` to `max_date`.
3. **Data Fetching and Processing**:
   - For each month in the list, the function fetches daily stock data using the Yahoo Finance API (`yfinance`). Past months whose output file already exists are skipped.
   - If data is retrieved successfully, it is processed by:
     - Resetting the index to include the date as a column.
     - Replacing punctuation in column names for consistency.
     - Fixing duplicate column names by appending numerical suffixes.
     - Rounding numerical values to 2 decimal places.
   - If no data is fetched for a month, a message is printed and that month is skipped.
4. **Directory and File Handling**:
   - Ensures that a directory for each year exists or creates it if necessary.
   - Saves the processed data to a CSV file at `<year>/<month>/<symbol>.csv` inside the bronze-layer stock data directory.
5. **Error Handling**: Logs any errors encountered during the fetching, processing, or saving of data.


In [1]:
# Import necessary libraries and utility functions

import os
from datetime import datetime, timedelta

import pandas as pd
import yfinance as yf
from dotenv import load_dotenv

from PortfolioTracker.globalpath import GlobalPath
from PortfolioTracker.utilities import (
    fix_duplicate_column_names,
    replace_punctuation_from_columns,
)

In [2]:
# Instantiate GlobalPath, the project helper used to build the data paths below
global_path = GlobalPath()
# Holdings records CSV; run() reads its isin, symbol, min_date and max_date columns
stock_holding_records_file_path = global_path.joinpath(
    "DATA/GOLD/Holdings/HoldingsRecords_data.csv"
)
# Root directory under which process_file() writes <YYYY>/<MM>/<symbol>.csv
stockdata_bronze_layer_path = global_path.joinpath("DATA/BRONZE/StockData")

In [3]:
def generate_month_list(start_date, end_date):
    """
    Build the list of ``(year, month)`` tuples spanning two dates.

    The first entry comes from ``start_date`` itself; every later entry comes
    from the first day of the following month (other fields of the date are
    carried over unchanged by ``replace``). Stepping stops as soon as the
    cursor is later than ``end_date``, so an inverted range yields ``[]``.
    The resulting list is printed before being returned.
    """
    months = []
    cursor = start_date
    while cursor <= end_date:
        months.append((cursor.year, cursor.month))
        # divmod(12, 12) == (1, 0): December carries into January of next year;
        # any other month m gives (0, m), i.e. simply month m + 1.
        year_carry, month_index = divmod(cursor.month, 12)
        cursor = cursor.replace(
            year=cursor.year + year_carry,
            month=month_index + 1,
            day=1,
        )

    print(months)
    return months


def get_first_and_last_date(year, month):
    """
    Return the first and last day of a month as ``datetime`` objects.

    The last day is capped at the current moment (``datetime.today()``), so
    for the current or a future month the second value is "now" rather than
    the calendar month end.
    """
    month_start = datetime(year, month, 1)

    # The day before the first of the following month is this month's last
    # day; divmod rolls December over into January of the next year.
    year_carry, month_index = divmod(month, 12)
    next_month_start = datetime(year + year_carry, month_index + 1, 1)
    month_end = next_month_start - timedelta(days=1)

    # Never report a date later than now
    return month_start, min(month_end, datetime.today())

In [4]:
def process_file(isin: str, symbol: str, month_list: list) -> None:
    """
    Fetch daily price history for one holding, month by month, and save it as CSV.

    For every ``(year, month)`` pair the history is requested from Yahoo
    Finance, column names are cleaned up, numeric values are rounded to two
    decimals and the result is written to
    ``<bronze layer>/<YYYY>/<MM>/<symbol>.csv``.

    Args:
        isin: Identifier passed to ``yf.Ticker`` to look up the instrument.
        symbol: Name used for the output file and in the printed messages.
        month_list: ``(year, month)`` tuples to process, in order.

    Returns:
        None. Any exception raised while handling a month is printed and the
        loop moves on to the next month.
    """
    # Log the start of processing for the given symbol
    print(f"\n\nStarting data processing for symbol: {symbol}")

    # Fetch historical stock data for each (year, month)
    for year, month in month_list:
        try:
            start_date, end_date = get_first_and_last_date(year, month)

            # Determine the output file path in the bronze layer
            output_file = stockdata_bronze_layer_path.joinpath(
                f"{year:04d}/{month:02d}/{symbol}.csv"
            )

            # Skip only months that have fully ended and are already saved.
            # Comparing (year, month) with today's month avoids relying on
            # end_date, which get_first_and_last_date clamps to an *earlier*
            # "now" and which would therefore always look like the past,
            # so the current month would never be refreshed.
            today = datetime.today()
            month_is_complete = (year, month) < (today.year, today.month)
            if month_is_complete and output_file.exists():
                continue

            # Fetch historical data from Yahoo Finance for the month.
            # yfinance treats `end` as exclusive, so extend it by one day to
            # include the month's last day.
            stock = yf.Ticker(isin)
            df = stock.history(
                start=start_date,
                end=end_date + timedelta(days=1),
                interval="1d",
            )

            # Check if the DataFrame is empty
            if df.empty:
                print(
                    f"No data fetched for symbol: {symbol} for year: {year} and month : {month}"
                )
                continue

            # Reset the index to ensure date is a column
            df = df.reset_index()

            # Replace punctuation in column names for consistency
            df = replace_punctuation_from_columns(df)

            # Fix duplicate column names by appending numerical suffixes
            df = fix_duplicate_column_names(df)

            # Round numerical values to 2 decimal places
            df = df.round(2)

            # Make sure the <YYYY>/<MM>/ directory exists before writing;
            # to_csv does not create missing parent directories.
            # NOTE(review): assumes output_file is a pathlib-style path (it
            # is already used with .exists()) - confirm against GlobalPath.
            output_file.parent.mkdir(parents=True, exist_ok=True)

            # Save the processed DataFrame to a CSV file
            df.to_csv(output_file, index=False)

            # Log successful processing and saving of data
            print(
                f"Data for {symbol} successfully processed and saved to {output_file}"
            )

        except Exception as e:
            # Best-effort pipeline: report the failure and continue with the
            # remaining months instead of aborting the whole symbol.
            print(f"Error processing data for {symbol} for year {year}: {e}")
In [5]:
def run():
    """
    Run the pipeline for every row of the holdings records file.

    Reads the holdings CSV, parses its ``min_date`` / ``max_date`` columns as
    datetimes, and calls ``process_file`` once per row with the month list
    spanning that row's date range.
    """
    holdings = pd.read_csv(stock_holding_records_file_path)

    # Both boundary columns arrive as text and must be parsed before use
    for date_column in ("min_date", "max_date"):
        holdings[date_column] = pd.to_datetime(holdings[date_column])

    row_values = zip(
        holdings["isin"],
        holdings["symbol"],
        holdings["min_date"],
        holdings["max_date"],
    )
    for isin, symbol, first_seen, last_seen in row_values:
        months = generate_month_list(first_seen, last_seen)
        process_file(isin=isin, symbol=symbol, month_list=months)

In [6]:
# Pull settings from a .env file (if any) into the process environment
load_dotenv()

# Three outcomes: skip on request, run when the holdings file is present,
# otherwise report that the input file is missing.
if os.environ.get("SKIP_STOCK_DATA", "FALSE").upper() == "TRUE":
    print("Skipping data processing as per environment configuration.")
elif stock_holding_records_file_path.is_file():
    print("Running data processing pipeline...")
    run()
else:
    print(
        f"Stock holding records file does not exist: {stock_holding_records_file_path}"
    )

Skipping data processing as per environment configuration.
