## BRONZE TO SILVER LAYER

### Silver Layer - Stock Events History


In [7]:
# Import necessary libraries and utility functions

import pandas as pd

from PortfolioTracker.globalpath import global_path
from PortfolioTracker.logger import logger
from PortfolioTracker.utilities import (
    check_files_availability,
    replace_punctuation_from_columns,
)

### Data Processing

This section handles the collection of data from the Bronze layer.

- Files are checked for availability.
- Data is read from each file, processed, and stored in a list of DataFrames.
- Stock event data from the Bronze layer is then transformed for the Silver layer.


In [8]:
# Load every Bronze-layer stock CSV and combine them into a single DataFrame.

# Per-file DataFrames are collected here and concatenated once after the loop
df_stock_events_list = []

# Generate file paths for available CSV files in the Bronze layer
file_paths = check_files_availability(
    global_path.stockdata_bronze_layer_path, file_pattern="*.csv"
)

# Process each CSV file found in the Bronze layer
for file_path in file_paths:
    logger.info(f"Processing file: {file_path}")

    # Read the CSV file into its own DataFrame (kept distinct from the
    # combined `df` built below)
    df_stock = pd.read_csv(file_path)

    # The stock name is the file name up to the first ".", upper-cased
    df_stock["stock_name"] = file_path.name.split(".")[0].upper().strip()

    df_stock_events_list.append(df_stock)

# pd.concat fails with an opaque "No objects to concatenate" ValueError on an
# empty list; raise the same exception type with an actionable message instead.
if not df_stock_events_list:
    raise ValueError(
        "No Bronze layer CSV files were loaded from: "
        f"{global_path.stockdata_bronze_layer_path}"
    )

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(df_stock_events_list, ignore_index=True)

# Harmonize column names to ensure consistency
# NOTE(review): the next cell expects names such as "date" and "stock_splits",
# so this helper presumably lower-cases and underscores them - confirm.
df = replace_punctuation_from_columns(df)

# Remove columns that contain only NA values (plain assignment, not inplace)
df = df.dropna(how="all", axis=1)

2024-08-10T03:37:01Z - INFO - Number of Files Detected: 211
2024-08-10T03:37:01Z - INFO - Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\StockData\2020\10\IDEA.csv
2024-08-10T03:37:01Z - INFO - Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\StockData\2020\10\PNB.csv
2024-08-10T03:37:01Z - INFO - Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\StockData\2020\10\TATAMOTORS.csv
2024-08-10T03:37:01Z - INFO - Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\StockData\2020\10\YESBANK.csv
2024-08-10T03:37:01Z - INFO - Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\StockData\2020\11\IDEA.csv
2024-08-10T03:37:01Z - INFO - Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\StockData\2020\11\PNB.csv
2024-08-10T03:37:01Z - INFO - Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\StockData\2020\

In [9]:
# Reshape the combined Bronze data into a long-format events table and save
# it to the Silver layer.

# Event columns that may be present in the Bronze data
expected_event_columns = ["dividends", "stock_splits", "capital_gains"]

# The loading cell drops columns that are entirely NA, so an event type with
# no data at all is absent here. Melt only the event columns that exist.
event_columns = [col for col in expected_event_columns if col in df.columns]
missing_event_columns = [
    col for col in expected_event_columns if col not in df.columns
]

# If no event column is present, `df` is not the Bronze frame this cell
# expects (for example, the cell was re-run on its own output). Fail before
# touching the Silver file rather than overwrite it with an empty table.
if not event_columns:
    raise KeyError(
        f"None of the expected event columns {expected_event_columns} "
        f"are present in the DataFrame; found columns: {list(df.columns)}"
    )
if missing_event_columns:
    logger.info(
        f"Event columns not present in Bronze data: {missing_event_columns}"
    )

# Convert datetime strings to date objects for consistency
df["date"] = pd.to_datetime(df["date"]).dt.date

# Unpivot the DataFrame: one row per (date, stock_name, event)
df = df.melt(
    id_vars=["date", "stock_name"],
    value_vars=event_columns,
    var_name="event",
    value_name="value",
)

# Convert event names to uppercase for uniformity
df["event"] = df["event"].str.upper()

# Treat missing values as "no event", then round to 2 decimal places
df["value"] = df["value"].fillna(0).round(2)

# Keep only rows with a positive value after rounding
df = df[df["value"] > 0]

# Sort the DataFrame by stock name, date, and event for easier analysis
df = df.sort_values(by=["stock_name", "date", "event"])

# Select relevant columns to export
df = df[["date", "stock_name", "event", "value"]]

# Save the cleaned and transformed DataFrame as a CSV file in the Silver layer
df.to_csv(global_path.stockevents_silver_file_path, index=False)
logger.info(
    "SILVER Layer CSV file for Stock Events history successfully created at:"
)
logger.info(global_path.stockevents_silver_file_path.resolve())

# Print a summary of the exported DataFrame for verification
df.info()

2024-08-10T03:37:02Z - INFO - SILVER Layer CSV file for Stock Events history successfully created at:
2024-08-10T03:37:02Z - INFO - C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\SILVER\StockEvents\StockEvents_data.csv


<class 'pandas.core.frame.DataFrame'>
Index: 17 entries, 1994 to 2824
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        17 non-null     object 
 1   stock_name  17 non-null     object 
 2   event       17 non-null     object 
 3   value       17 non-null     float64
dtypes: float64(1), object(3)
memory usage: 680.0+ bytes
