## SOURCE TO BRONZE LAYER

### Read & Validate the Data from the Files


In [1]:
## Import necessary libraries and utility functions
import pandas as pd
from datetime import datetime
from pathlib import Path
from common_utilities import (
    check_files_availability,
    global_path,
    replace_punctuation_from_columns,
    find_correct_sheetname,
    find_correct_headers,
    logger,
)

In [2]:
# Collect every tax-report workbook matching the pattern in the source folder
file_paths = check_files_availability(
    Path("../DATA/SOURCE/TaxReport"),
    file_pattern="tax_report_*.xlsx",
)
# Sheet to look for in each workbook -> segment code stamped on its rows
sheet_names = {"Equities": "EQ", "Future & Options": "FO"}
dfs = []
for file_path in file_paths:
    # sheet_name=None loads all sheets into a dict keyed by sheet name;
    # header=None keeps the raw rows (no row is promoted to a header here,
    # header detection is delegated to find_correct_headers below)
    df_dict = pd.read_excel(
        file_path,
        sheet_name=None,
        header=None,
    )
    for sheet_name, lookup_value in sheet_names.items():
        # A dedicated name for the per-sheet frame keeps it distinct from
        # the combined `df` built after the loop
        sheet_df = find_correct_sheetname(df_dict, sheet_name)
        sheet_df = find_correct_headers(sheet_df, global_header_regex="scrip_name")
        sheet_df = replace_punctuation_from_columns(sheet_df)
        sheet_df = sheet_df.reset_index()
        sheet_df["segment"] = lookup_value
        dfs.append(sheet_df)

# Stack all sheets from all files into one DataFrame
df = pd.concat(dfs, ignore_index=True)

# Data cleanup: turn the literal string "nan" into a real missing value
df = df.replace("nan", None)

# Keep only rows that have a symbol. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning (or write into a temporary slice).
df = df[df["symbol"].notnull()].copy()

# Normalise the symbol to a stripped string
df["symbol"] = df["symbol"].astype(str).str.strip()

# Parse the sell/buy dates into proper datetime columns
df["close_datetime"] = pd.to_datetime(
    df["sell_date"],
    format="%Y-%m-%d %H:%M:%S",
)
df["open_datetime"] = pd.to_datetime(
    df["buy_date"],
    format="%Y-%m-%d %H:%M:%S",
)

# Rename columns for clarity. "scrip_opt" deliberately keeps its original
# name: concat_stock_name reads it later in this notebook.
df = df.rename(
    columns={
        "symbol": "stock_name",
        "qty": "quantity",
        "buy_rate": "open_price",
        "buy_amt": "open_amount",
        "sell_rate": "close_price",
        "sell_amt": "close_amount",
        "total_pl": "pnl_amount",
    }
)

2024-08-05T23:51:30Z - INFO - Number of Files Detected: 5
2024-08-05T23:51:31Z - INFO - Sheet name => Equities
2024-08-05T23:51:31Z - INFO - Sheet name => Future & Options
2024-08-05T23:51:31Z - INFO - Sheet name => Equities
2024-08-05T23:51:31Z - INFO - Sheet name => Future & Options
2024-08-05T23:51:31Z - INFO - Sheet name => Equities
2024-08-05T23:51:31Z - INFO - Sheet name => Future & Options
2024-08-05T23:51:31Z - INFO - Sheet name => Equities
2024-08-05T23:51:31Z - INFO - Sheet name => Future & Options
2024-08-05T23:51:31Z - INFO - Sheet name => Equities
2024-08-05T23:51:31Z - INFO - Sheet name => Future & Options


In [3]:
from datetime import datetime, timedelta

def next_thursday(given_date):
    """
    Return the first Thursday on or after the given date.

    If the given date is already a Thursday, the same date is returned.
    The time-of-day component (if any) is preserved.

    Parameters:
    given_date (datetime): The input date. ``pandas.Timestamp`` works as well.
        Missing values (``None`` / ``NaT`` / ``NaN``) are returned unchanged
        instead of raising, so the function is safe to use with
        ``Series.apply`` on columns containing unparsed dates.

    Returns:
    datetime: The Thursday on or after ``given_date`` (same type as the input),
        or the input itself when it is a missing value.
    """
    # NaT.weekday() yields NaN, which would make timedelta(days=...) raise;
    # pass missing values straight through instead.
    if pd.isna(given_date):
        return given_date

    # Thursday is weekday 3 in Python's datetime module (Monday == 0).
    thursday = 3

    # Modulo 7 maps Thursday itself to 0 days and every other weekday to the
    # 1-6 days remaining until the coming Thursday, so no special case is needed.
    days_until_thursday = (thursday - given_date.weekday()) % 7

    return given_date + timedelta(days=days_until_thursday)

# Derive an expiry for every row from its close timestamp (first Thursday on
# or after the close), then store it as text using the "%d%b%Y" pattern.
expiry_dates = pd.to_datetime(df["close_datetime"].apply(next_thursday))
df["expiry"] = expiry_dates.dt.strftime("%d%b%Y")

In [4]:

# Function to apply the conditional concatenation
def concat_stock_name(row: pd.Series) -> str:
    """
    Build the display name for one trade row.

    Rows whose "scrip_opt" is "CE" or "PE" get a composite name of the form
    <stock_name>-<CE|PE>-<strike_price>-<expiry>; every other row keeps its
    plain stock name. The result is stripped and upper-cased in all cases.

    Parameters:
    row (pd.Series): A DataFrame row providing "scrip_opt" and "stock_name",
        plus "strike_price" and "expiry" for CE/PE rows.

    Returns:
    str: The stripped, upper-cased stock name.
    """
    base_name = str(row["stock_name"])

    # Check the two option tags in the same order as a CE-then-PE if/elif chain
    for option_tag in ("CE", "PE"):
        if row["scrip_opt"] == option_tag:
            pieces = [
                base_name,
                option_tag,
                str(row["strike_price"]),
                str(row["expiry"]),
            ]
            return "-".join(pieces).strip().upper()

    # Not an option contract: the plain name is enough
    return base_name.strip().upper()

# Rebuild "stock_name" row by row (axis=1 hands each row to the function);
# CE/PE rows gain the option type, strike price and expiry in their name.
df["stock_name"] = df.apply(concat_stock_name, axis=1)


In [5]:

# Columns to normalise numerically: any column whose name contains one of
# these keywords (this also picks up e.g. "pnl_amount" via "amount").
numeric_keywords = ("quantity", "price", "amount")
round_cols = [
    column
    for column in df.columns
    if any(keyword in column for keyword in numeric_keywords)
]
# Cast to float and round to 2 decimal places
df[round_cols] = df[round_cols].astype(float).round(2)

# Keep only the columns that belong in the exported file, in output order
df = df[
    [
        "segment",
        "stock_name",
        "quantity",
        "open_datetime",
        "open_price",
        "open_amount",
        "close_datetime",
        "close_price",
        "close_amount",
        "days",
        "pnl_amount",
    ]
]
df = df.sort_values(by=["segment", "stock_name", "close_datetime", "open_datetime"])

# Write the result; create the target folder first so a fresh checkout does
# not fail with a missing-directory error.
output_path = Path("../DATA/BRONZE/TaxReport/TaxReport_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False is the documented way to omit the row index from the CSV
df.to_csv(output_path, index=False)
df

Unnamed: 0,segment,stock_name,quantity,open_datetime,open_price,open_amount,close_datetime,close_price,close_amount,days,pnl_amount
0,EQ,BHAGERIA,10.0,2020-05-05,116.55,1165.50,2020-06-15,113.00,1130.00,41,-35.50
1,EQ,BHAGERIA,5.0,2020-05-19,100.00,500.00,2020-06-15,113.00,565.00,27,65.00
50,EQ,BPCL,43.0,2021-08-06,457.00,19651.00,2022-08-10,328.85,14140.55,369,-5510.45
22,EQ,GOLDBEES,2.0,2021-02-09,41.82,83.64,2021-06-04,42.03,84.06,115,0.42
2,EQ,HERANBA,12.0,2021-03-04,627.00,7524.00,2021-03-05,820.00,9840.00,1,2316.00
...,...,...,...,...,...,...,...,...,...,...,...
192,FO,NIFTY-PE-24900-01AUG2024,25.0,2024-07-30,101.95,2548.75,2024-07-30,102.75,2568.75,0,20.00
193,FO,NIFTY-PE-24900-01AUG2024,25.0,2024-07-30,101.95,2548.75,2024-07-30,102.75,2568.75,0,20.00
194,FO,NIFTY-PE-24950-01AUG2024,25.0,2024-07-31,98.85,2471.25,2024-07-31,85.45,2136.25,0,-335.00
195,FO,NIFTY-PE-24950-01AUG2024,125.0,2024-08-01,7.00,875.00,2024-08-01,0.05,6.25,0,-868.75
