## BRONZE TO SILVER LAYER

### Read & Validate the Data from the Files


In [41]:
## Import necessary libraries and utility functions
import pandas as pd
import datetime
import pathlib

from common_utilities import (
    check_files_availability,
    global_path,
    replace_punctuation_from_columns,
    find_correct_sheetname,
    find_correct_headers,
    logger,
)

In [42]:
# Collect the tax-report workbooks that match the expected file-name pattern.
file_paths = check_files_availability(
    pathlib.Path("../DATA/SOURCE/TaxReport"),
    file_pattern="tax_report_*.xlsx",
)

# Sheets to extract from every workbook. Only the keys (sheet names) are used
# in this cell; the short codes are kept so the mapping itself stays unchanged.
sheet_names = {"Equities": "EQ", "Future & Options": "FO"}

dfs = []
for file_path in file_paths:
    # sheet_name=None loads every sheet into a dict keyed by sheet name;
    # header=None keeps the raw rows so the header row can be found afterwards.
    df_dict = pd.read_excel(
        file_path,
        sheet_name=None,
        header=None,
    )
    for sheet_name in sheet_names:
        # A dedicated name for the per-sheet frame avoids shadowing the final `df`.
        sheet_df = find_correct_sheetname(df_dict, sheet_name)
        # NOTE(review): presumably promotes the row containing "scrip_name" to
        # the header - confirm against common_utilities.
        sheet_df = find_correct_headers(sheet_df, global_header_regex="scrip_name")
        sheet_df = replace_punctuation_from_columns(sheet_df)
        dfs.append(sheet_df)

# Stack all sheets from all workbooks into a single frame.
df = pd.concat(dfs, ignore_index=True)

# Data cleanup: treat the literal string "nan" as missing. The mapping form is
# used because the scalar form `replace("nan", None)` was interpreted as
# "no value given" (pad/forward-fill) by older pandas releases.
df = df.replace({"nan": None})

# Parse trade dates; the explicit format makes unexpected layouts fail loudly.
df["buy_date"] = pd.to_datetime(
    df["buy_date"],
    format="%Y-%m-%d %H:%M:%S",
)

df["sell_date"] = pd.to_datetime(
    df["sell_date"],
    format="%Y-%m-%d %H:%M:%S",
)

# Rename columns for clarity
df = df.rename(
    columns={
        "scrip_opt": "segment",
        "avg_price": "open_price",
        "symbol": "stock_name",
    }
)

# Keep only rows that carry a scrip_code. The explicit copy makes the column
# assignments below operate on an independent frame (no SettingWithCopyWarning).
df = df[df["scrip_code"].notnull()].copy()

# Normalise the key columns to stripped strings.
df["scrip_code"] = df["scrip_code"].astype(str).str.strip()
# Missing stock names must stay missing: a bare astype(str) would turn them
# into the literal "nan"/"None" that the cleanup above just removed.
df["stock_name"] = (
    df["stock_name"].astype(str).str.strip().where(df["stock_name"].notna())
)
df

2024-08-01T19:29:32Z - INFO - Number of Files Detected: 5
2024-08-01T19:29:32Z - INFO - Sheet name => Equities
2024-08-01T19:29:32Z - INFO - Sheet name => Future & Options
2024-08-01T19:29:32Z - INFO - Sheet name => Equities
2024-08-01T19:29:32Z - INFO - Sheet name => Future & Options
2024-08-01T19:29:32Z - INFO - Sheet name => Equities
2024-08-01T19:29:32Z - INFO - Sheet name => Future & Options
2024-08-01T19:29:33Z - INFO - Sheet name => Equities
2024-08-01T19:29:33Z - INFO - Sheet name => Future & Options
2024-08-01T19:29:33Z - INFO - Sheet name => Equities
2024-08-01T19:29:33Z - INFO - Sheet name => Future & Options


Unnamed: 0,scrip_name,scrip_code,stock_name,isin,segment,qty,buy_date,buy_rate,buy_amt,sell_date,sell_rate,sell_amt,days,total_pl,short_term,long_term,speculation,turn_over,strike_price
0,BHAGERI IND,530803,BHAGERIA,INE354C01027,EQ,10,2020-05-05,116.55,1165.5,2020-06-15,113,1130,41,-35.5,-35.5,,,1130,
1,BHAGERI IND,530803,BHAGERIA,INE354C01027,EQ,5,2020-05-19,100,500,2020-06-15,113,565,27,65,65,,,565,
2,Heranba Industries Limited,543266,HERANBA,INE694N01015,EQ,12,2021-03-04,627,7524,2021-03-05,820,9840,1,2316,2316,,,9840,
3,Heranba Industries Limited,543266,HERANBA,INE694N01015,EQ,11,2021-03-04,627,6897,2021-03-09,775,8525,5,1628,1628,,,8525,
22,GOLDBEES,590095,GOLDBEES,INF204KB17I5,EQ,2,2021-02-09,41.82,83.64,2021-06-04,42.03,84.06,115,0.42,0.42,,,84.06,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,NIFTY,NIFTY,NIFTY,,PE,25,2024-07-29,125,3125,2024-07-29,83.9,2097.5,0,-1027.5,-1027.5,,,1027.5,24900
189,NIFTY,NIFTY,NIFTY,,PE,25,2024-07-30,101.95,2548.75,2024-07-30,102.75,2568.75,0,20,20,,,20,24900
190,NIFTY,NIFTY,NIFTY,,PE,25,2024-07-30,101.95,2548.75,2024-07-30,102.75,2568.75,0,20,20,,,20,24900
191,NIFTY,NIFTY,NIFTY,,PE,25,2024-07-31,98.85,2471.25,2024-07-31,85.45,2136.25,0,-335,-335,,,335,24950


In [43]:
# Replace stock_name with the symbol from the symbol master, matched on scrip_code.
df_Symbol = pd.read_csv(global_path.symbol_silver_file_path)
# Normalise the join key the same way as the trade frame: stripped strings.
df_Symbol["scrip_code"] = df_Symbol["scrip_code"].astype(str).str.strip()

# Keep one lookup row per scrip_code so the left join below cannot multiply
# trade rows if the symbol file happens to list a code more than once.
symbol_lookup = df_Symbol[["scrip_code", "symbol"]].drop_duplicates(
    subset="scrip_code"
)

# Left join keeps every trade row. Dropping a leftover "symbol" column first
# makes the cell safe to re-run (otherwise a second run yields
# symbol_x/symbol_y and the lookup of df["symbol"] below fails).
df = df.drop(columns="symbol", errors="ignore").merge(
    symbol_lookup,
    on="scrip_code",
    how="left",
)

# Prefer the symbol from the master; fall back to the name from the report
# when the scrip_code has no match.
df["stock_name"] = df["symbol"].combine_first(df["stock_name"])