## SOURCE TO BRONZE LAYER

### Process:

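> This notebook fetches daily stock data for every symbol in the holding history and stores it as one CSV file per symbol per month. For each month it reuses an already-downloaded file where possible, otherwise tries a copy from the project's GitHub repository, and finally falls back to the Yahoo Finance API (`yfinance`).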


In [None]:
# Import necessary libraries
import json
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
import yfinance as yf

from StockETL.datetimeutils import DateTimeUtil
from StockETL.globalpath import GlobalPath

In [None]:
# Run the shared utility notebook so that its definitions become available here.
# NOTE(review): helpers used below but not defined in this notebook
# (e.g. check_files_availability, align_with_datacontract) presumably come
# from this notebook - confirm.
%run ../COMMON/common_utility.ipynb

In [None]:
# Define file paths (each one is wrapped in the project's GlobalPath helper)
# JSON file that is loaded into the ticker-override dictionary
stock_tickers_config_path = GlobalPath("DATA/CONFIG/CONSTANTS/stock_tickers.json")
# Source-layer folder that is searched for the holding history files
holding_history_path = GlobalPath("DATA/SOURCE/Holding")
# Data contract passed to align_with_datacontract before a stock data CSV is written
stockdata_bronze_schema_file_path = GlobalPath(
    "DATA/CONFIG/DATA_CONTRACTS/BRONZE/StockData.json"
)
# NOTE(review): not referenced anywhere in the visible code - presumably the
# intended destination for the failed records; confirm.
failed_records_path = GlobalPath("DATA/FAILED/failed_records.json")

In [None]:
# Dictionary for stock ticker overrides: looked up by symbol when a ticker is
# created, with the ISIN used when the symbol has no entry
# Open and read the JSON file
OVERWRITE_TICKERS = {}
with open(stock_tickers_config_path, encoding="utf-8") as f:
    # Replace the empty default with the mapping parsed from the JSON file
    OVERWRITE_TICKERS = json.load(f)

In [None]:
def generate_date_list(start_date, end_date):
    """
    Build one DateTimeUtil per calendar month covered by a date range.

    Both ends of the range are capped at DateTimeUtil.today(), so the result
    never reaches past the current date.

    Args:
        start_date: Start of the range (the caller in this notebook passes
            datetime objects).
        end_date: End of the range (same kind of object as start_date).

    Returns:
        List[DateTimeUtil]: One entry, set to the first day of the month, for
        every month from the capped start through the capped end. Empty when
        the capped start lies after the capped end.
    """
    cursor = min(start_date, DateTimeUtil.today())
    last_date = min(end_date, DateTimeUtil.today())

    months = []
    while cursor <= last_date:
        months.append(DateTimeUtil(cursor.year, cursor.month, 1))
        # divmod gives (1, 0) for December, which rolls over to January of
        # the following year; every other month simply advances by one.
        year_carry, month_index = divmod(cursor.month, 12)
        cursor = cursor.replace(
            year=cursor.year + year_carry, month=month_index + 1, day=1
        )
    return months

In [None]:
def download_file_from_github(output_file, timeout=30):
    """
    Try to fetch a stock data file from the project's GitHub repository.

    The file is looked up by its name under DATA/BRONZE/StockData on the main
    branch and, when the server answers with HTTP 200, the response body is
    written to ``output_file``.

    Args:
        output_file: Local destination path; its ``name`` attribute selects
            the remote file and the path itself is opened for binary writing.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        bool: True if the file was downloaded and saved, False if the request
        failed or the server did not answer with HTTP 200.
    """
    # NOTE(review): only the file name is appended to the URL, while the local
    # files in this notebook live in a per-symbol sub-folder - confirm that
    # the remote layout really is flat.
    github_data_url = f"https://raw.githubusercontent.com/PtPrashantTripathi/PortfolioTracker/main/DATA/BRONZE/StockData/{output_file.name}"
    try:
        # Without a timeout a stalled connection would block the caller forever
        response = requests.get(github_data_url, timeout=timeout)
    except requests.RequestException:
        # A network problem is treated like "not available on GitHub" so the
        # caller can fall back to another data source.
        return False

    if response.status_code != 200:
        return False

    with open(output_file, "wb") as file:
        file.write(response.content)
    return True

In [None]:
# Functions to download data


def _try_github_download(output_file):
    """
    Attempt the GitHub download without letting an I/O error escape.

    Args:
        output_file: Destination path handed to download_file_from_github.

    Returns:
        bool: The result of download_file_from_github, or False if it raised
        an OSError.
    """
    try:
        return download_file_from_github(output_file)
    except OSError:
        # requests' exceptions derive from OSError, so connection failures and
        # local write errors both count as "not available from GitHub"; the
        # caller can then try the next source instead of aborting the symbol.
        return False


def _download_month_from_yfinance(stock_ticker, date, output_file):
    """
    Download one month of daily data from Yahoo Finance and write it as CSV.

    Args:
        stock_ticker (yf.Ticker): Ticker whose history is requested.
        date (DateTimeUtil): Month to download; its ``start_date`` and
            ``end_date`` give the requested window, with the end capped at
            DateTimeUtil.today().
        output_file: Path the resulting CSV is written to.

    Raises:
        ValueError: If the returned history is empty.
    """
    df = stock_ticker.history(
        start=date.start_date,
        end=min(date.end_date, DateTimeUtil.today()),
        interval="1d",
        actions=True,
        rounding=True,
    )

    if df.empty:
        raise ValueError("No data returned")

    df = df.reset_index()

    # Replace punctuation from column names for consistency
    df = replace_punctuation_from_columns(df)

    # Fix duplicate column names by appending numerical suffixes
    df = fix_duplicate_column_names(df)

    # Drop rows where all elements are NaN
    df = df.dropna(how="all")

    # Align DataFrame with DataContract
    df = align_with_datacontract(df, stockdata_bronze_schema_file_path)

    df.to_csv(output_file, index=False)


def process_stock_data(row: dict):
    """
    Make sure a monthly stock data CSV exists for every month of a holding.

    For each month between ``row["min_date"]`` and ``row["max_date"]`` the
    sources are tried in this order: an existing local file, a copy from
    GitHub, and finally a download from Yahoo Finance.

    Args:
        row (dict): One holding record with the keys "symbol", "isin",
            "min_date" and "max_date"; both dates must provide
            ``to_pydatetime()``.

    Returns:
        list[dict]: One record per processed month with the keys "symbol",
        "start_date", "end_date", "status" ("exists", "downloaded" or
        "failed"), "info" (the source, or the error text) and "file". If the
        symbol cannot be processed at all, a single "failed" record covering
        the whole date range is appended instead.
    """
    all_status = []
    try:
        stock_ticker = yf.Ticker(OVERWRITE_TICKERS.get(row["symbol"], row["isin"]))
        # Single quotes inside the f-string keep this valid before Python 3.12
        stockdata_bronze_layer_path = GlobalPath(
            f"DATA/BRONZE/StockData/{row['symbol']}"
        )
        date_list = generate_date_list(
            row["min_date"].to_pydatetime(), row["max_date"].to_pydatetime()
        )
        for date in date_list:
            status, info = None, None
            output_file = stockdata_bronze_layer_path.joinpath(
                f"{row['symbol']}_{date.year:04d}_{date.month:02d}.csv"
            )

            # Reuse the local file only when month_difference against today
            # is at least 1 (presumably so that the current month is always
            # fetched again - confirm against DateTimeUtil).
            if (
                output_file.exists()
                and date.month_difference(DateTimeUtil.today()) >= 1
            ):
                status = "exists"
            elif _try_github_download(output_file):
                status = "downloaded"
                info = "github"
            else:
                try:
                    _download_month_from_yfinance(stock_ticker, date, output_file)
                    status = "downloaded"
                    info = "yfinance"
                except Exception as e:
                    # A failed month is recorded and the remaining months are
                    # still processed.
                    status = "failed"
                    info = str(e)
            all_status.append(
                {
                    "symbol": row["symbol"],
                    "start_date": date.start_date,
                    "end_date": date.end_date,
                    "status": status,
                    "info": info,
                    "file": GlobalPath.relative(output_file),
                }
            )
    except Exception as e:
        # Symbol-level failure: same keys as the per-month records so the
        # combined status table has one schema; "error" is kept for
        # backward compatibility.
        all_status.append(
            {
                "symbol": row["symbol"],
                "start_date": row["min_date"].to_pydatetime(),
                "end_date": row["max_date"].to_pydatetime(),
                "status": "failed",
                "info": str(e),
                "error": str(e),
                "file": None,
            }
        )

    return all_status

In [None]:
# Locate the holding CSV files available in the source layer
file_paths = check_files_availability(
    holding_history_path, file_pattern="Holding_data.csv"
)

# Read every file, stack the frames, and parse both date columns
df_holding_history = pd.concat(
    [pd.read_csv(file_path) for file_path in file_paths]
).assign(
    min_date=lambda frame: pd.to_datetime(frame["min_date"]),
    max_date=lambda frame: pd.to_datetime(frame["max_date"]),
)

# Collapse to one row per (segment, exchange, symbol): earliest min_date,
# latest max_date, and the first ISIN seen for the group
df_holding_history = df_holding_history.groupby(
    ["segment", "exchange", "symbol"]
).agg(
    min_date=("min_date", "min"),
    max_date=("max_date", "max"),
    isin=("isin", "first"),
).reset_index()

In [None]:
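# Optional workaround, left disabled: presumably points Python/requests at the
# Homebrew CA bundle when HTTPS certificate verification fails on macOS.
# Uncomment the lines below only if that applies to your machine.
# import os

# os.environ["SSL_CERT_DIR"] = "/opt/homebrew/etc/ca-certificates"
# os.environ["SSL_CERT_FILE"] = "/opt/homebrew/etc/ca-certificates/cert.pem"
# os.environ["REQUESTS_CA_BUNDLE"] = "/opt/homebrew/etc/ca-certificates/cert.pem"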

In [None]:
# Run process_stock_data for every holding row on a pool of five threads and
# flatten the per-symbol status lists into one list (input order is kept)
all_status = []
holding_rows = df_holding_history.to_dict(orient="records")
with ThreadPoolExecutor(max_workers=5) as executor:
    for symbol_statuses in executor.map(process_stock_data, holding_rows):
        all_status += symbol_statuses

In [None]:
# Show the records that were neither already present nor downloaded
# NOTE(review): nothing is written to disk here; failed_records_path is defined
# earlier in the notebook but never used - confirm whether the failures
# should be saved there.
df = pd.DataFrame(all_status)
# With no status records the frame has no "status" column, so filter only when
# the column exists instead of raising a KeyError.
(
    df[~df["status"].isin(["exists", "downloaded"])]
    if "status" in df.columns
    else df
)