## SOURCE TO BRONZE LAYER

### Process:

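> For each holding, daily stock data is stored as one CSV file per symbol and month in the bronze layer. Each file is first looked up in the project's GitHub repository and, if it is not available there, fetched from the Yahoo Finance API (`yfinance`).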


In [None]:
# Import necessary libraries
import json
import pandas as pd
import requests
import yfinance as yf
from datetime import timedelta, datetime
from pyrate_limiter import Duration, Limiter, RequestRate
from requests_ratelimiter import LimiterMixin, MemoryQueueBucket
from requests_cache import CacheMixin, SQLiteCache
from concurrent.futures import ThreadPoolExecutor
from StockETL import DateTimeUtil, GlobalPath

In [None]:
# Importing Common Utility Function
%run ../COMMON/common_utility.ipynb

In [None]:
# Define file paths
# JSON config read below to build the OVERWRITE_TICKERS mapping
stock_tickers_config_path = GlobalPath("CONFIG/CONSTANTS/stock_tickers.json")
# CSV of holdings; read with pandas and expected to provide min_date / max_date columns
holding_history_path = GlobalPath("DATA/SOURCE/Holding/Holding_data.csv")
# Directory that receives one "<symbol>_<YYYY>_<MM>.csv" file per symbol and month
stockdata_bronze_layer_path = GlobalPath("DATA/BRONZE/StockData")
# Data-contract file passed to align_with_datacontract before each CSV is written
stockdata_bronze_schema_file_path = GlobalPath(
    "CONFIG/DATA_CONTRACTS/BRONZE/StockData.json"
)
# NOTE(review): intended destination for failed records; not referenced by the
# cells visible in this notebook section - confirm where it is written
failed_records_path = GlobalPath("DATA/FAILED/failed_records.json")

In [None]:
# Set up rate limiter
class CachedLimiterSession(CacheMixin, LimiterMixin, requests.Session):
    """A `requests.Session` that mixes in response caching and rate limiting."""


# Request rate handed to the limiter: 1 request per (2 * Duration.SECOND)
history_rate = RequestRate(1, Duration.SECOND * 2)
limiter = Limiter(history_rate)

# Shared session passed to yfinance; cached responses expire after one hour
session = CachedLimiterSession(
    limiter=limiter,
    bucket_class=MemoryQueueBucket,
    backend=SQLiteCache(".cache/session", expire_after=timedelta(hours=1)),
)

In [None]:
def download_file_from_github(output_file, timeout=30):
    """
    Try to download a bronze-layer stock data file from the GitHub repository.

    The remote file is looked up by ``output_file.name`` under
    ``DATA/BRONZE/StockData`` on the ``main`` branch and, when found, its
    content is written to ``output_file``.

    Args:
        output_file: Path-like destination file; its ``name`` attribute is
            used as the remote file name.
        timeout: Seconds to wait for GitHub before giving up, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        bool: True if the file was downloaded (HTTP 200) and written to
        ``output_file``; False if the response status was not 200 or the
        request failed (connection error, timeout, ...).
    """
    github_data_url = (
        "https://raw.githubusercontent.com/PtPrashantTripathi/PortfolioTracker"
        f"/main/DATA/BRONZE/StockData/{output_file.name}"
    )
    try:
        response = requests.get(github_data_url, timeout=timeout)
    except requests.RequestException:
        # A network problem means "not available from GitHub"; the caller
        # can then fall back to another data source.
        return False

    if response.status_code != 200:
        return False

    with open(output_file, "wb") as file:
        file.write(response.content)
    return True

In [None]:
# Dictionary for stock ticker overrides
# Start with an empty mapping, then replace it with the content of the
# ticker configuration file (looked up by symbol when a ticker is created)
OVERWRITE_TICKERS = {}
with open(stock_tickers_config_path, encoding="utf-8") as tickers_file:
    OVERWRITE_TICKERS = json.load(tickers_file)

In [None]:
# Function to download data
def process_stock_data(row):
    """
    Make sure the monthly stock data CSV files exist for one holding.

    For every entry returned by ``generate_date_list`` for the row's date
    range, a file named ``<symbol>_<YYYY>_<MM>.csv`` is produced in the
    bronze layer directory. An existing file is kept when its month lies at
    least one month back (per ``date.month_difference``). Otherwise the file
    is downloaded from GitHub and, if that is not possible, the daily history
    is fetched through ``yfinance``, cleaned, aligned with the bronze data
    contract and written as CSV.

    Failures are collected instead of raised, so one bad month (including a
    failing GitHub download) does not stop the remaining months.

    Args:
        row: Mapping with the keys ``symbol``, ``isin``, ``min_date`` and
            ``max_date``; both dates must provide ``to_pydatetime()``.

    Returns:
        list[dict]: One dict per failure. Per-month failures carry
        ``symbol``, ``date_range`` and ``error``; a failure outside the
        per-month handling carries only ``symbol`` and ``error``.
    """
    failed_records = []
    try:
        # Use the configured ticker override for the symbol, else the ISIN
        stock_ticker = yf.Ticker(
            OVERWRITE_TICKERS.get(row["symbol"], row["isin"]),
            session=session,
        )
        date_list = generate_date_list(
            row["min_date"].to_pydatetime(), row["max_date"].to_pydatetime()
        )

        for date in date_list:
            output_file = stockdata_bronze_layer_path.joinpath(
                f"{row['symbol']}_{date.year:04d}_{date.month:02d}.csv"
            )
            # Keep files that already exist for months at least one month
            # back; anything else is (re)fetched below
            if (
                output_file.exists()
                and date.month_difference(DateTimeUtil.today()) >= 1
            ):
                continue

            try:
                # The GitHub download is part of the per-month handling so
                # that an error here is recorded with its date range and the
                # remaining months are still processed
                if download_file_from_github(output_file):
                    continue

                df = stock_ticker.history(
                    start=date.start_date,
                    end=min(date.end_date, DateTimeUtil.today()),
                    interval="1d",
                    actions=True,
                    rounding=True,
                )

                if df.empty:
                    raise ValueError("No data returned")

                df = df.reset_index()

                # Replace punctuation from column names for consistency
                df = replace_punctuation_from_columns(df)

                # Fix duplicate column names by appending numerical suffixes
                df = fix_duplicate_column_names(df)

                # Drop rows where all elements are NaN
                df = df.dropna(how="all")

                # Align DataFrame with DataContract
                df = align_with_datacontract(df, stockdata_bronze_schema_file_path)

                df.to_csv(output_file, index=False)
                print(f"Saved: {output_file}")

            except Exception as e:
                # Best effort: record the failed month and go on with the next
                failed_records.append(
                    {
                        "symbol": row["symbol"],
                        "date_range": f"{date.start_date} to {date.end_date}",
                        "error": str(e),
                    }
                )

    except Exception as e:
        # Failure before or outside the per-month handling (e.g. ticker
        # creation or date list generation)
        failed_records.append({"symbol": row["symbol"], "error": str(e)})

    return failed_records

In [None]:
# Load holding history
# Read the CSV and convert both date-range columns to pandas datetimes
df_holding_history = pd.read_csv(holding_history_path).assign(
    min_date=lambda frame: pd.to_datetime(frame["min_date"]),
    max_date=lambda frame: pd.to_datetime(frame["max_date"]),
)

In [None]:
# Process in parallel
# Each holding row is handled by process_stock_data on a pool of 5 threads;
# the per-row failure lists are flattened into failed_records in row order
failed_records = []
with ThreadPoolExecutor(max_workers=5) as executor:
    results = executor.map(
        process_stock_data, df_holding_history.to_dict(orient="records")
    )
    failed_records.extend(
        failure for row_failures in results for failure in row_failures
    )

In [None]:
# Save failed records
# Write the collected failures as JSON to failed_records_path so they are
# kept after the notebook session ends
failed_records_path.parent.mkdir(parents=True, exist_ok=True)
with open(failed_records_path, "w", encoding="utf-8") as failed_records_file:
    json.dump(failed_records, failed_records_file, indent=4)

# Show the failures as a table in the notebook output
pd.DataFrame(failed_records)