## SOURCE TO BRONZE LAYER

### Process:

> This notebook fetches daily stock data for each holding using the Yahoo Finance API (`yfinance`) and saves it to the bronze layer as one CSV file per symbol and month.


In [1]:
# Import third-party libraries and the project's ETL helper classes
import pandas as pd
import yfinance as yf
from ETLTools import DateTimeUtil, GlobalPath

In [2]:
# Run the shared utility notebook so that its helper functions are defined in this kernel
%run ../COMMON/common_utility.ipynb

In [3]:
# Define file paths
# Source-layer CSV holding the holdings history; it is read with pandas by the
# pipeline cell further down.
holdingshistory_source_layer_path = GlobalPath(
    "DATA/SOURCE/Holdings/HoldingsHistory_data.csv"
)
# Bronze-layer directory that receives one CSV file per symbol and month.
stockdata_bronze_layer_path = GlobalPath("DATA/BRONZE/StockData")

In [4]:
def process_file(
    stock_ticker,
    stock_name: str,
    date: DateTimeUtil,
    output_file: GlobalPath,
):
    """
    Download one window of daily price history for a ticker and write it to CSV.

    Args:
        stock_ticker: Object exposing ``history(start=, end=, interval=)``;
            the pipeline passes a ``yfinance.Ticker``.
        stock_name (str): Label for the stock, used only in the error message.
        date (DateTimeUtil): Provides ``start_date`` and ``end_date`` of the window.
        output_file (GlobalPath): Destination handed to ``DataFrame.to_csv``.

    Raises:
        Exception: If the history request returns an empty DataFrame.
    """
    # Cap the window at today so that no future dates are requested
    window_start = date.start_date
    window_end = min(date.end_date, DateTimeUtil.today())
    history = stock_ticker.history(
        start=window_start, end=window_end, interval="1d"
    )

    # Guard clause: nothing to save when the request came back empty
    if history.empty:
        raise Exception(
            f"No data fetched for {stock_name} from {date.start_date} to {date.end_date}"
        )

    # Move the index into a regular column, clean the column names with the
    # shared helpers (punctuation first, then duplicates), and round to 2 decimals
    prices = (
        history.reset_index()
        .pipe(replace_punctuation_from_columns)
        .pipe(fix_duplicate_column_names)
        .round(2)
    )

    # Persist the result without the positional index and report the location
    prices.to_csv(output_file, index=False)
    print(f"Data processed and saved to: {output_file}")

In [5]:
# Explicit Yahoo Finance tickers keyed by holdings symbol.
# Symbols that are not listed here are looked up by their ISIN instead.
overwrite_stock_ticker = {"BAJAJHFL": "BAJAJHFL.NS"}

In [6]:
print("Running data processing pipeline...")

# Load holdings data
df_holdings_history = pd.read_csv(holdingshistory_source_layer_path)
print(f"Loaded data from: {holdingshistory_source_layer_path}")

# Iterate over each stock holding record; a failure for one symbol is logged
# and does not stop the remaining symbols from being processed
for _, row in df_holdings_history.iterrows():
    try:
        print(f"Processing data for symbol {row['symbol']}:")
        # Use the explicit ticker override when one exists, otherwise fall back to the ISIN
        stock_ticker = yf.Ticker(
            overwrite_stock_ticker.get(row["symbol"], row["isin"]),
        )

        # Read ``info`` once and reuse it for both checks
        ticker_info = stock_ticker.info
        if not (ticker_info and ticker_info.get("regularMarketPrice")):
            # No usable quote came back for this ticker
            raise KeyError(f"Ticker Not Found: {row['symbol']}")

        # read_csv loads the date columns as plain strings, so convert them
        # explicitly before asking for ``to_pydatetime()``
        date_list = generate_date_list(
            pd.to_datetime(row["min_date"]).to_pydatetime(),
            pd.to_datetime(row["max_date"]).to_pydatetime(),
        )
        for date in date_list:
            # Determine the output file path in the bronze layer
            output_file = stockdata_bronze_layer_path.joinpath(
                f"{row['symbol']}_{date.year:04d}_{date.month:02d}.csv"
            )
            # Skip files that already exist when month_difference() against today
            # is at least 1; existing files below that threshold are downloaded again
            # NOTE(review): presumably this refreshes only the current month — confirm
            if output_file.exists():
                month_difference = date.month_difference(
                    DateTimeUtil.today()
                )
                if month_difference >= 1:
                    continue

            # Process and save the file
            process_file(
                stock_ticker=stock_ticker,
                stock_name=row["symbol"],
                date=date,
                output_file=output_file,
            )
    except Exception as e:
        # Log any errors encountered during processing
        print(f"Error processing:\n{e}")

Running data processing pipeline...
Loaded data from: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\SOURCE\Holdings\HoldingsHistory_data.csv
Processing data for symbol BAJAJHFL:
Error processing:
'Ticker Not Found: BAJAJHFL'
Processing data for symbol BHAGERIA:
Error processing:
'Ticker Not Found: BHAGERIA'
Processing data for symbol BPCL:
Error processing:
'Ticker Not Found: BPCL'
Processing data for symbol GOLDBEES:
Error processing:
HTTPSConnectionPool(host='guce.yahoo.com', port=443): Max retries exceeded with url: /consent (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1002)')))
Processing data for symbol HERANBA:
Error processing:
HTTPSConnectionPool(host='guce.yahoo.com', port=443): Max retries exceeded with url: /consent (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certific

In [7]:
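# NOTE: This cell is disabled (every line is commented out). It appears to be a
# one-off manual backfill for GOLDBEES — TODO confirm whether it is still needed.
# Before re-enabling: the pd.read_csv argument below is an unquoted URL, and the
# output layout (YYYY/MM/GOLDBEES.csv) differs from the SYMBOL_YYYY_MM.csv file
# names written by the pipeline cell above.
# import pandas as pd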

# # Read the CSV file
# df = pd.read_csv(
#     https://www.nseindia.com/get-quotes/equity?symbol=GOLDBEES
# )

# # Rename columns to match the desired format
# df.columns = [
#     "Date",
#     "series",
#     "OPEN",
#     "HIGH",
#     "LOW",
#     "PREV. CLOSE",
#     "ltp",
#     "close",
#     "vwap",
#     "52W H",
#     "52W L",
#     "VOLUME",
#     "VALUE",
#     "No of trades",
# ]

# # Select and rename columns as per the desired output
# df = df[["Date", "OPEN", "HIGH", "LOW", "close", "VOLUME"]].copy()

# # Rename columns to the new format
# df.columns = ["Date", "open", "high", "low", "close", "volume"]

# # Convert 'Date' column to datetime format and adjust timezone
# df["Date"] = pd.to_datetime(df["Date"], format="%d-%b-%Y")
# df["date"] = df["Date"].dt.strftime("%Y-%m-%d") + " 00:00:00+05:30"

# # Ensure 'volume' is an integer
# df["volume"] = df["volume"].str.replace(",", "").astype(float).astype(int)

# # Add placeholder columns for dividends and stock_splits
# df["dividends"] = 0.0
# df["stock_splits"] = 0.0

# year = 2021
# for month in range(2, 7):
#     output_file = stockdata_bronze_layer_path.joinpath(
#         f"{year:04d}/{month:02d}/GOLDBEES.csv"
#     )
#     df[df["Date"].dt.month == month][[
#         "date",
#         "open",
#         "high",
#         "low",
#         "close",
#         "volume",
#         "dividends",
#         "stock_splits",
#     ]].to_csv(output_file, index=False)