## BRONZE TO SILVER LAYER

### Read & Validate the Data from the Files


In [1]:
## Import necessary libraries and utility functions
import pandas as pd
from datetime import datetime
from pathlib import Path

from common_utilities import (
    check_files_availability,
    global_path,
    replace_punctuation_from_columns,
    find_correct_sheetname,
    find_correct_headers,
    logger,
)

In [2]:
# Locate the tax-report workbooks to ingest
file_paths = check_files_availability(
    Path("../DATA/SOURCE/TaxReport"),
    file_pattern="tax_report_*.xlsx",
)
# Sheets to extract from every workbook. Only the keys (sheet names) are used
# in the loop below; the mapped codes are kept so the variable is unchanged.
sheet_names = {"Equities": "EQ", "Future & Options": "FO"}
dfs = []
for file_path in file_paths:
    # sheet_name=None loads every sheet into a dict keyed by sheet name.
    # header=None keeps the raw rows -- presumably so find_correct_headers
    # can locate the real header row itself (verify against the helper).
    df_dict = pd.read_excel(
        file_path,
        sheet_name=None,
        header=None,
    )
    for sheet_name in sheet_names:
        sheet_df = find_correct_sheetname(df_dict, sheet_name)
        sheet_df = find_correct_headers(
            sheet_df, global_header_regex="scrip_name"
        )
        sheet_df = replace_punctuation_from_columns(sheet_df)
        dfs.append(sheet_df)

# Combine every extracted sheet into one DataFrame
df = pd.concat(dfs, ignore_index=True)

# Data cleanup: turn literal "nan" strings into real missing values.
# The dict form is used because `replace("nan", None)` was interpreted as a
# pad-fill (value=None ignored) by pandas versions older than 1.4.
df = df.replace({"nan": None})

df["sell_date"] = pd.to_datetime(
    df["sell_date"],
    format="%Y-%m-%d %H:%M:%S",
)

# Keep only rows that have a "scrip_code"; take an explicit copy so the
# column assignments below never operate on a view of the unfiltered frame
# (avoids SettingWithCopyWarning).
df = df[df["scrip_code"].notnull()].copy()

# Cast symbols to stripped strings, but keep missing symbols missing:
# a bare astype(str) would turn them into the text "None"/"nan".
df["symbol"] = (
    df["symbol"].astype(str).str.strip().where(df["symbol"].notna())
)

# Rename columns for clarity
df = df.rename(
    columns={
        "scrip_opt": "segment",
        "symbol": "stock_name",
        "sell_date": "date",
        "qty": "quantity",
        "total_pl": "pnl_amount",
        "buy_rate": "buy_price",
        "sell_rate": "sell_price",
        "buy_amt": "buy_amount",
        "sell_amt": "sell_amount",
    }
)

# Numeric columns that are cast to float and rounded to 2 decimal places
round_cols = [
    "quantity",
    "buy_price",
    "buy_amount",
    "sell_price",
    "sell_amount",
    "pnl_amount",
]

df[round_cols] = df[round_cols].astype(float).round(2)

2024-08-02T02:01:59Z - INFO - Number of Files Detected: 5
2024-08-02T02:01:59Z - INFO - Sheet name => Equities
2024-08-02T02:01:59Z - INFO - Sheet name => Future & Options
2024-08-02T02:01:59Z - INFO - Sheet name => Equities
2024-08-02T02:01:59Z - INFO - Sheet name => Future & Options
2024-08-02T02:01:59Z - INFO - Sheet name => Equities
2024-08-02T02:01:59Z - INFO - Sheet name => Future & Options
2024-08-02T02:01:59Z - INFO - Sheet name => Equities
2024-08-02T02:01:59Z - INFO - Sheet name => Future & Options
2024-08-02T02:01:59Z - INFO - Sheet name => Equities
2024-08-02T02:01:59Z - INFO - Sheet name => Future & Options


In [3]:
# Keep only the output columns, in their final order
df = df[
    [
        "date",
        "segment",
        "stock_name",
        "strike_price",
        "quantity",
        "buy_price",
        "buy_amount",
        "sell_price",
        "sell_amount",
        "pnl_amount",
    ]
]

# Sort the DataFrame by segment, stock name, date and strike price
df = df.sort_values(by=["segment", "stock_name", "date", "strike_price"])

# NOTE(review): the data is read from DATA/SOURCE and written to DATA/BRONZE,
# while the notebook is titled "BRONZE TO SILVER LAYER" -- confirm which
# layer this output is meant to land in.
output_path = Path("../DATA/BRONZE/TaxReport/TaxReport_data.csv")
# Create the target directory on first run so the export does not fail
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: do not write the (meaningless, post-sort) row index
df.to_csv(output_path, index=False)