## SOURCE TO BRONZE LAYER

> This Notebook reads the RAW files and performs data harmonization.


In [5]:
# Import necessary libraries and utility functions
import pandas as pd

from ETLTools import GlobalPath, utils

In [6]:
# Locations used by this notebook, wrapped in the project's GlobalPath helper.
# SOURCE-layer folder that is scanned for the raw Symbol CSV files.
symbol_source_layer_path = GlobalPath("DATA/SOURCE/Symbol")
# BRONZE-layer CSV file that receives the consolidated Symbol data.
symbol_bronze_layer_path = GlobalPath("DATA/BRONZE/Symbol/Symbol_data.csv")

### Define a function to read and process a CSV file


In [7]:
# Define a function to read and process a CSV file


def read_file(file_path: GlobalPath) -> pd.DataFrame:
    """
    Read a single SOURCE-layer CSV file and return it as a harmonized DataFrame.

    The file is loaded, its column names are passed through
    ``utils.replace_punctuation_from_columns``, rows without an ISIN are
    discarded, columns that contain no data at all are discarded, and every
    remaining value is cast to ``str``.

    Parameters
    ----------
    file_path : GlobalPath
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The cleaned data for this one file, with every column cast to ``str``.

    Raises
    ------
    KeyError
        If the file has no ``isin`` column after the column names have been
        harmonized.
    """
    # Log the reading of the file
    print(f"Processing file: {file_path}")

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Harmonize the DataFrame by replacing punctuation from column names
    df = utils.replace_punctuation_from_columns(df)

    # Drop rows where 'isin' is NaN or null
    df = df.dropna(subset=["isin"])

    # Drop columns where all elements are NaN. This has to happen BEFORE the
    # string cast below: once NaN has been stringified (e.g. to "nan") the
    # column no longer looks empty to dropna and would never be removed.
    df = df.dropna(how="all", axis=1)

    # Treat every remaining value as text so the files concatenate uniformly
    df = df.astype(str)
    return df

In [8]:
# Collect one harmonized DataFrame per successfully read file
df_symbol_list = []
# Generate file paths for available CSV files in the source layer
file_paths = utils.check_files_availability(
    symbol_source_layer_path,
    file_pattern="*.csv",
)

# Loop through all CSV files in the SOURCE layer folder
for file_path in file_paths:
    try:
        df_symbol_list.append(read_file(file_path))
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files
        print(f"Failed to read {file_path} due to error: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the actual problem.
if not df_symbol_list:
    raise ValueError(
        f"No Symbol CSV file could be read from: {symbol_source_layer_path}"
    )

# Concatenate all DataFrames into one
df = pd.concat(df_symbol_list, ignore_index=True)

# Keep only the columns that make up the BRONZE Symbol schema, in this order
df = df[
    [
        "instrument_type",
        "isin",
        "symbol",
        "scrip_name",
        "scrip_code",
        "isin_reinvestment",
    ]
]

# Save the result as a CSV file in the BRONZE layer (without the row index)
df.to_csv(symbol_bronze_layer_path, index=False)
print(
    f"Successfully created BRONZE Layer CSV file for Symbol at: {symbol_bronze_layer_path}"
)
# Log the DataFrame information
df.info()

Number of Files Detected: 3
Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\SOURCE\Symbol\AMFI_Reports.csv
Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\SOURCE\Symbol\EQUITY_Symbols.csv
Processing file: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\SOURCE\Symbol\INDEX_Symbols.csv
Successfully created BRONZE Layer CSV file for Symbol at: C:\Users\prashant.tripathi\Code\PortfolioTracker\DATA\BRONZE\Symbol\Symbol_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16913 entries, 0 to 16912
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   instrument_type    16913 non-null  object
 1   isin               16913 non-null  object
 2   symbol             4330 non-null   object
 3   scrip_name         16913 non-null  object
 4   scrip_code         4330 non-null   object
 5   isin_reinvestment  12583 non-null  object
dtypes: object(6)
memory usage: 792.9