## SOURCE TO BRONZE LAYER

### Process

1. **Determine Current Year**: The function calculates the current year to use as the end date for data fetching.
2. **Generate Year List**: Creates a list of years from the specified start year (`from_year`) to the current year.
3. **Data Fetching and Processing**:
   - For each year in the list, the function fetches daily stock data using the Yahoo Finance API (`yfinance`).
   - If data is retrieved successfully, it is processed by:
     - Resetting the index to include the date as a column.
     - Replacing punctuation in column names for consistency.
     - Fixing duplicate column names by appending numerical suffixes.
   - If no data is fetched for a year, a warning is logged.
4. **Directory and File Handling**:
   - Ensures that a directory for each year exists or creates it if necessary.
   - Saves the processed data to a CSV file in the appropriate year-specific directory.
5. **Error Handling**: Logs any errors encountered during the fetching, processing, or saving of data.


In [2]:
# Import necessary libraries and utility functions
import os
import pandas as pd
import yfinance as yf
from dotenv import load_dotenv
from MyModules.globalpath import GlobalPath
from MyModules.portfolio.utilities import (
    fix_duplicate_column_names,
    replace_punctuation_from_columns,
)

In [3]:
# Instantiate GlobalPath
global_path = GlobalPath()
# GLOBAL PATH
stock_holding_records_file_path = global_path.joinpath(
    "DATA/GOLD/Holdings/HoldingsRecords_data.csv"
)
stockdata_bronze_layer_path = global_path.joinpath("DATA/BRONZE/StockData")

In [5]:
def generate_month_list(start_date, end_date):
    month_list = []
    current_date = min(start_date, datetime.today())
    end_date = min(end_date, datetime.today())
    while current_date <= end_date:
        month_list.append(DateTimeUtil(current_date))
        if current_date.month == 12:
            current_date = current_date.replace(
                year=current_date.year + 1, month=1, day=1
            )
        else:
            current_date = current_date.replace(
                month=current_date.month + 1, day=1
            )
    return month_list

In [6]:
def process_file(isin: str, symbol: str, month_list: List[DateTimeUtil]):
    """
    Processes historical stock data for a given symbol and saves it to a CSV file.
    """
    # Log the start of processing for the given symbol
    print(f"\n\nStarting data processing for symbol: {symbol}")

    # Fetch historical stock data for each year
    for each in month_list:
        try:
            print(each)
            # Determine the output file path in the bronze layer
            output_file = stockdata_bronze_layer_path.joinpath(
                f"{each.year:04d}/{each.month:02d}/{symbol}.csv"
            )

            # Check if the file exists
            if output_file.exists():
                if each.month_difference > 1:
                    continue
                elif each.month_difference == 1:
                    continue
                else:
                    pass

            continue
            # Fetch historical data from Yahoo Finance for the specified year
            stock = yf.Ticker(isin)
            df = stock.history(
                start=each.start_date,
                end=min(each.end_date, datetime.today()),
                interval="1d",
            )

            # Check if the DataFrame is empty
            if df.empty:
                print(
                    f"No data fetched for symbol: {symbol} for year: {each.year} and month : {each.month}"
                )
                continue

            # Reset the index to ensure date is a column
            df = df.reset_index()

            # Replace punctuation in column names for consistency
            df = replace_punctuation_from_columns(df)

            # Fix duplicate column names by appending numerical suffixes
            df = fix_duplicate_column_names(df)

            # Round numerical values to 2 decimal places
            df = df.round(2)

            # Save the processed DataFrame to a CSV file
            df.to_csv(output_file, index=False)

            # Log successful processing and saving of data
            print(
                f"Data for {symbol} successfully processed and saved to {output_file}"
            )

        except Exception as e:
            # Log any errors encountered during processing
            print(
                f"Error processing data for {symbol} year: {each.year} and month : {each.month}"
            )

NameError: name 'List' is not defined

In [None]:
def run():
    """
    Main function to run the data processing pipeline.
    """
    df_stock_holding_records = pd.read_csv(stock_holding_records_file_path)
    df_stock_holding_records["min_date"] = pd.to_datetime(
        df_stock_holding_records["min_date"]
    )
    df_stock_holding_records["max_date"] = pd.to_datetime(
        df_stock_holding_records["max_date"]
    )

    for _, row in df_stock_holding_records.iterrows():
        process_file(
            isin=row["isin"],
            symbol=row["symbol"],
            month_list=generate_month_list(row["min_date"], row["max_date"]),
        )

In [None]:
# Load environment variables
load_dotenv()

# # Early exit if the SKIP_STOCK_DATA environment variable is set
# if os.getenv("SKIP_STOCK_DATA", "FALSE").upper() == "TRUE":
#     print("Skipping data processing as per environment configuration.")
# elif not stock_holding_records_file_path.is_file():
#     print(
#         f"Stock holding records file does not exist: {stock_holding_records_file_path}"
#     )
# else:
#     print("Running data processing pipeline...")
run()



Starting data processing for symbol: BHAGERIA
DateUtil(year=2020, month=5)
DateUtil(year=2020, month=6)


Starting data processing for symbol: BPCL
DateUtil(year=2021, month=8)
DateUtil(year=2021, month=9)
DateUtil(year=2021, month=10)
DateUtil(year=2021, month=11)
DateUtil(year=2021, month=12)
DateUtil(year=2022, month=1)
DateUtil(year=2022, month=2)
DateUtil(year=2022, month=3)
DateUtil(year=2022, month=4)
DateUtil(year=2022, month=5)
DateUtil(year=2022, month=6)
DateUtil(year=2022, month=7)
DateUtil(year=2022, month=8)


Starting data processing for symbol: GOLDBEES
DateUtil(year=2021, month=2)
DateUtil(year=2021, month=3)
DateUtil(year=2021, month=4)
DateUtil(year=2021, month=5)
DateUtil(year=2021, month=6)


Starting data processing for symbol: HERANBA
DateUtil(year=2021, month=3)


Starting data processing for symbol: IDEA
DateUtil(year=2020, month=7)
DateUtil(year=2020, month=8)
DateUtil(year=2020, month=9)
DateUtil(year=2020, month=10)
DateUtil(year=2020, month=11)
DateUtil(y