## SOURCE TO BRONZE LAYER

### Process:

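> This notebook reads the stock holding records and, for every holding and every month in its date range, fetches daily stock data using the Yahoo Finance API (`yfinance`) and saves it as one CSV file per symbol per month in the bronze layer.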


In [1]:
# Import necessary libraries and utility functions
import pandas as pd
import yfinance as yf
from MyModules.globalpath import GlobalPath
from MyModules.datetimeutils import DateTimeUtil
from MyModules.utilities import (
    fix_duplicate_column_names,
    replace_punctuation_from_columns,
)

In [2]:
# All project paths are resolved through a single GlobalPath instance
global_path = GlobalPath()

# Output root: per-month stock data CSVs are written under this directory
stockdata_bronze_layer_path = global_path.joinpath("DATA/BRONZE/StockData")

# Input: holdings records CSV that lists the symbols and date ranges to fetch
stock_holding_records_file_path = global_path.joinpath(
    "DATA/GOLD/Holdings/HoldingsRecords_data.csv"
)

In [3]:
def process_file(isin: str, symbol: str, date_obj: DateTimeUtil, output_file):
    """
    Download one period of daily price history for a stock and write it to CSV.

    The history is requested from Yahoo Finance using the ISIN as the ticker
    identifier; ``symbol`` is only used in log and error messages.

    Args:
        isin (str): ISIN code passed to ``yf.Ticker``.
        symbol (str): Human-readable symbol used in messages.
        date_obj (DateTimeUtil): Period descriptor; its ``year``, ``month``,
            ``start_date`` and ``end_date`` attributes are read here.
        output_file: Destination handed straight to ``DataFrame.to_csv``
            (a path string or path-like object).

    Raises:
        Exception: If Yahoo Finance returns no rows for the requested range.
    """
    # Announce which symbol and period are about to be fetched
    start_message = f"Starting data processing for:\nsymbol: {symbol}\nyear: {date_obj.year}\nmonth : {date_obj.month}\n"
    print(start_message)

    # Request daily bars; the upper bound is capped at today so that a period
    # reaching into the future is never requested.
    # NOTE(review): yfinance documents `end` as exclusive — confirm that
    # DateTimeUtil.end_date is chosen with that in mind.
    ticker = yf.Ticker(isin)
    history_kwargs = {
        "start": date_obj.start_date,
        "end": min(date_obj.end_date, DateTimeUtil.today()),
        "interval": "1d",
    }
    price_history = ticker.history(**history_kwargs)

    # Nothing came back: fail loudly so the caller can report it
    if price_history.empty:
        raise Exception(
            f"No data fetched for {symbol} from {date_obj.start_date} to {date_obj.end_date}"
        )

    # Move the index into a regular column, then run the project's column-name
    # helpers (punctuation replacement first, duplicate-name fixing second)
    # and finally round numeric values to 2 decimal places.
    cleaned = fix_duplicate_column_names(
        replace_punctuation_from_columns(price_history.reset_index())
    ).round(2)

    # Persist without the positional index
    cleaned.to_csv(output_file, index=False)

    # Report where the file was written
    print(f"Data processed and saved to\n{output_file}")

In [4]:
def generate_date_list(start_date, end_date):
    """
    Build one DateTimeUtil per calendar month covered by a date range.

    Both bounds are first capped at ``DateTimeUtil.today()`` so that no future
    month is produced. Months are then enumerated by (year, month) rather than
    by comparing full timestamps, so the month containing ``end_date`` is
    always included, even when ``end_date`` falls on the 1st of a month with an
    earlier time-of-day than ``start_date``.

    Args:
        start_date (datetime.date | datetime.datetime): Start of the range.
        end_date (datetime.date | datetime.datetime): End of the range
            (inclusive). Must be comparable with ``start_date`` and with the
            value returned by ``DateTimeUtil.today()``.

    Returns:
        List[DateTimeUtil]: ``DateTimeUtil(year, month, 1)`` for each month
        from the (capped) start month through the (capped) end month, in
        chronological order. Empty if the capped start is not on or before
        the capped end.
    """
    # Cap both bounds against the same "today" value
    today = DateTimeUtil.today()
    first = min(start_date, today)
    last = min(end_date, today)

    # Inverted (or otherwise non-comparable-as-ordered) range: nothing to do.
    # Written as `not <=` so values for which every comparison is False also
    # yield an empty list.
    if not first <= last:
        return []

    month_list = []
    year, month = first.year, first.month
    final_year_month = (last.year, last.month)

    # Walk month by month; tuple comparison ignores day and time-of-day
    while (year, month) <= final_year_month:
        month_list.append(DateTimeUtil(year, month, 1))
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
    return month_list

In [5]:
print("Running data processing pipeline...")

# Load the holdings records and parse both range-boundary columns as datetimes
df_stock_holding_records = pd.read_csv(stock_holding_records_file_path)
for date_column in ("min_date", "max_date"):
    df_stock_holding_records[date_column] = pd.to_datetime(
        df_stock_holding_records[date_column]
    )

# One holding per row; each holding expands into one fetch per month
for _, row in df_stock_holding_records.iterrows():
    range_start = row["min_date"].to_pydatetime()
    range_end = row["max_date"].to_pydatetime()
    date_list = generate_date_list(range_start, range_end)

    for each in date_list:
        try:
            # Bronze-layer destination: <year>/<month>/<symbol>.csv
            output_file = stockdata_bronze_layer_path.joinpath(
                f"{each.year:04d}/{each.month:02d}/{row['symbol']}.csv"
            )

            # An existing file is left alone once its month is at least two
            # months away from today; more recent months are re-fetched
            if (
                output_file.exists()
                and each.month_difference(DateTimeUtil.today()) >= 2
            ):
                continue

            # Fetch the month and write the CSV
            process_file(
                isin=row["isin"],
                symbol=row["symbol"],
                date_obj=each,
                output_file=output_file,
            )
        except Exception as error:
            # Best effort: report the failure and move on to the next month
            print(
                f"Error processing data for symbol {row['symbol']} with error:\n{error}"
            )

Running data processing pipeline...
Starting data processing for:
symbol: IRCTC
year: 2024
month : 7



Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/07/IRCTC.csv
Starting data processing for:
symbol: IRCTC
year: 2024
month : 8

Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/08/IRCTC.csv
Starting data processing for:
symbol: KPITTECH
year: 2024
month : 7



Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/07/KPITTECH.csv
Starting data processing for:
symbol: KPITTECH
year: 2024
month : 8

Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/08/KPITTECH.csv
Starting data processing for:
symbol: TATAMOTORS
year: 2024
month : 7



Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/07/TATAMOTORS.csv
Starting data processing for:
symbol: TATAMOTORS
year: 2024
month : 8

Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/08/TATAMOTORS.csv
Starting data processing for:
symbol: TATAPOWER
year: 2024
month : 7



Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/07/TATAPOWER.csv
Starting data processing for:
symbol: TATAPOWER
year: 2024
month : 8

Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/08/TATAPOWER.csv
Starting data processing for:
symbol: MIRAE_ASSET_ELSS_TAX_SAVER_FUND_DIRECT_PLAN_GROWTH
year: 2024
month : 7



Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/07/MIRAE_ASSET_ELSS_TAX_SAVER_FUND_DIRECT_PLAN_GROWTH.csv
Starting data processing for:
symbol: MIRAE_ASSET_ELSS_TAX_SAVER_FUND_DIRECT_PLAN_GROWTH
year: 2024
month : 8

Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/08/MIRAE_ASSET_ELSS_TAX_SAVER_FUND_DIRECT_PLAN_GROWTH.csv
Starting data processing for:
symbol: SBI_LONG_TERM_EQUITY_FUND_DIRECT_PLAN_GROWTH
year: 2024
month : 7



Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/07/SBI_LONG_TERM_EQUITY_FUND_DIRECT_PLAN_GROWTH.csv
Starting data processing for:
symbol: SBI_LONG_TERM_EQUITY_FUND_DIRECT_PLAN_GROWTH
year: 2024
month : 8

Data processed and saved to
/home/runner/work/PortfolioTracker/PortfolioTracker/DATA/BRONZE/StockData/2024/08/SBI_LONG_TERM_EQUITY_FUND_DIRECT_PLAN_GROWTH.csv


In [6]:
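# NOTE: everything in this cell is commented out (disabled one-off backfill).
# It reshapes a GOLDBEES price table (source: the NSE page referenced below)
# and would write one CSV per month for 2021-02 through 2021-06 under the
# stock data bronze layer path.
# import pandas as pd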

# # Read the CSV file
# df = pd.read_csv(
#     "https://www.nseindia.com/get-quotes/equity?symbol=GOLDBEES"
# )

# # Rename columns to match the desired format
# df.columns = [
#     "Date",
#     "series",
#     "OPEN",
#     "HIGH",
#     "LOW",
#     "PREV. CLOSE",
#     "ltp",
#     "close",
#     "vwap",
#     "52W H",
#     "52W L",
#     "VOLUME",
#     "VALUE",
#     "No of trades",
# ]

# # Select and rename columns as per the desired output
# df = df[["Date", "OPEN", "HIGH", "LOW", "close", "VOLUME"]].copy()

# # Rename columns to the new format
# df.columns = ["Date", "open", "high", "low", "close", "volume"]

# # Convert 'Date' column to datetime format and adjust timezone
# df["Date"] = pd.to_datetime(df["Date"], format="%d-%b-%Y")
# df["date"] = df["Date"].dt.strftime("%Y-%m-%d") + " 00:00:00+05:30"

# # Ensure 'volume' is an integer
# df["volume"] = df["volume"].str.replace(",", "").astype(float).astype(int)

# # Add placeholder columns for dividends and stock_splits
# df["dividends"] = 0.0
# df["stock_splits"] = 0.0

# year = 2021
# for month in range(2, 7):
#     output_file = stockdata_bronze_layer_path.joinpath(
#         f"{year:04d}/{month:02d}/GOLDBEES.csv"
#     )
#     df[df["Date"].dt.month == month][[
#         "date",
#         "open",
#         "high",
#         "low",
#         "close",
#         "volume",
#         "dividends",
#         "stock_splits",
#     ]].to_csv(output_file, index=False)