In [None]:
# Importing necessary files and packages
import json
import os
import re
from datetime import datetime
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd

from StockETL.globalpath import GlobalPath

In [None]:
# Remove punctuation from a string and normalize it to a lowercase, underscore-separated form
def replace_punctuation_from_string(input_str):
    """Normalize arbitrary text into a lowercase, underscore-separated token.

    The input is converted with ``str()``; every character of the punctuation
    set below (ASCII punctuation without the underscore) is removed,
    surrounding whitespace is trimmed, each remaining space, newline, tab or
    carriage return becomes an underscore, the text is lower-cased and runs
    of underscores are collapsed into a single one.
    """
    punctuation_chars = r"""!"#$%&'()*+,-./:;<=>?@[\]^`{|}~"""
    # Mapping a code point to None in a translation table deletes it.
    deletion_table = dict.fromkeys(map(ord, punctuation_chars))
    trimmed = str(input_str).translate(deletion_table).strip()
    underscored = re.sub(r"[ \n\t\r]", "_", trimmed).lower()
    # Squash any run of two or more underscores down to one.
    return re.sub(r"_{2,}", "_", underscored)

In [None]:
def replace_punctuation_from_columns(df_pandas):
    """Normalize every column label of ``df_pandas`` in place.

    Each label is passed through ``replace_punctuation_from_string`` and the
    cleaned labels are assigned back onto the same DataFrame, which is then
    returned to the caller.
    """
    df_pandas.columns = [
        replace_punctuation_from_string(label) for label in df_pandas.columns
    ]
    return df_pandas

In [None]:
# Function to fix duplicate column names in a Pandas DataFrame
def fix_duplicate_column_names(df_pandas):
    """
    Normalize the column names of a DataFrame and make them unique.

    Every column label is cleaned with ``replace_punctuation_from_string``.
    When a cleaned name has already been used, an incremental suffix
    ('_1', '_2', ...) is appended until the name is unique. The suffixed
    candidate is also checked against names already taken, so a generated
    'a_1' can never collide with a column that is literally called 'a_1'.

    The column labels are replaced in place and the same DataFrame object
    is returned.

    Raises:
        ValueError: If the DataFrame has no columns at all.
    """
    result = []
    taken = set()
    counts = {}
    for raw_name in df_pandas.columns:
        base_name = replace_punctuation_from_string(str(raw_name))
        candidate = base_name
        # Keep bumping the per-name counter until the candidate is unused.
        while candidate in taken:
            counts[base_name] = counts.get(base_name, 0) + 1
            candidate = f"{base_name}_{counts[base_name]}"
        taken.add(candidate)
        result.append(candidate)
    df_pandas.columns = result

    # An empty result means the frame had no columns to begin with.
    if not result:
        raise ValueError("Duplicate column issue!")
    return df_pandas

In [None]:
# Helper to pick, from a dict of sheets, the first sheet whose name matches a regex
def find_correct_sheetname(df_pandas, sheet_name_regex):
    """
    Return the data of the first sheet whose name matches a regular expression.

    Parameters:
    df_pandas (dict): Mapping of sheet name to the data stored for that sheet.
    sheet_name_regex (str): Pattern searched (case-insensitively) in each sheet name.

    Returns:
    The value stored under the first matching sheet name, in the mapping's
    iteration order.

    Raises:
    ValueError: If no sheet name matches the pattern.
    """
    matcher = re.compile(sheet_name_regex, flags=re.IGNORECASE)

    for name, sheet_data in df_pandas.items():
        if matcher.search(name) is None:
            continue
        # First hit wins: report it and hand back its data.
        print(f"Sheet name => {name}")
        return sheet_data

    raise ValueError("Sheet name not found!")

In [None]:
# Function to locate the header row in raw sheet data and return the rows beneath it
def find_correct_headers(df_pandas, global_header_regex=None):
    """
    Locate the header row of a raw DataFrame and return the data below it.

    Rows are scanned top to bottom. The first row containing a cell whose
    normalized text (see ``replace_punctuation_from_string``) matches
    ``global_header_regex`` at its start (case-insensitive) is treated as the
    header row. The rows after it are returned with that row's values as
    column labels.

    Parameters:
    df_pandas (pd.DataFrame): Raw data in which the header row is searched.
    global_header_regex (str): Pattern identifying a header cell. Required,
        despite the ``None`` default.

    Returns:
    pd.DataFrame: The rows following the header row, labelled by the header row.

    Raises:
    TypeError: If ``global_header_regex`` is None.
    ValueError: If no row contains a matching cell.
    """
    if global_header_regex is None:
        raise TypeError("global_header_regex must be a regular expression, not None")

    pattern = re.compile(global_header_regex, re.IGNORECASE)
    # Track the row *position*: the index label yielded by iterrows() is not
    # a valid .iloc offset when the index is not the default 0..n-1 range.
    for position, (_, row) in enumerate(df_pandas.iterrows()):
        is_header_row = any(
            pattern.match(replace_punctuation_from_string(str(cell)))
            for cell in row.values
        )
        if is_header_row:
            # Copy so relabelling (and later edits by the caller) never
            # touch a view of the original frame.
            df = df_pandas.iloc[position + 1 :].copy()
            df.columns = df_pandas.iloc[position]
            return df
    raise ValueError("Header not found!")

In [None]:
def check_files_availability(
    dir_path: Union[str, Path],
    file_pattern: str = "*",
    timestamp: datetime = datetime.strptime("2000-01-01", "%Y-%m-%d"),
) -> List[Path]:
    """
    Checks for newly added or modified files in a directory after a specific timestamp.

    The directory is searched recursively. Only regular files whose
    modification time is strictly later than ``timestamp`` are returned.

    Args:
        dir_path (Union[str, Path]): The directory to check for files.
        file_pattern (str): The glob pattern used to filter files.
        timestamp (datetime): The timestamp to compare file modification times against.

    Returns:
        list: A list of paths to files that were added or modified after the given timestamp.

    Raises:
        FileNotFoundError: If no file matches the pattern and timestamp.
    """
    file_paths = [
        file_path
        for file_path in Path(dir_path).rglob(file_pattern)
        if file_path.is_file()
        and datetime.fromtimestamp(os.path.getmtime(file_path)) > timestamp
    ]

    if not file_paths:
        # Report the searched directory; no per-file name exists when
        # nothing matched.
        raise FileNotFoundError(
            f"No processable data available in the directory: {dir_path}"
        )

    # Log the number of detected files
    print(f"Number of Files Detected => {len(file_paths)}")
    return file_paths

In [None]:
def align_with_datacontract(
    df: pd.DataFrame, data_contract_path: "GlobalPath", rounding=True
) -> pd.DataFrame:
    """
    Aligns the DataFrame with the DataContract specified in a JSON file.
    This function casts DataFrame columns to the data types specified in the schema,
    creates missing columns with the correct data type, keeps only the schema
    columns (in schema order) and sorts the rows.

    The input DataFrame is not modified; all work happens on a copy.

    Args:
        df (pd.DataFrame): The input DataFrame to align.
        data_contract_path (GlobalPath): Path to the JSON file containing DataContract information.
            The JSON object may provide "data_schema" (a list of
            {"col_name", "data_type"} entries) and "order_by" (a list of
            column names).
        rounding (bool): When true, round the result to 2 decimal places.

    Returns:
        pd.DataFrame: The DataFrame aligned with the DataContract.
    """

    # Load DataContract from the JSON file
    with open(data_contract_path, encoding="utf-8") as schema_file:
        datacontract = json.load(schema_file)

    # Extract schema definitions and sort keys from the JSON
    data_schema = datacontract.get("data_schema", [])
    order_by = datacontract.get("order_by", [])

    # Work on a copy so the caller's DataFrame is not cast or extended in place.
    df = df.copy()

    # Iterate over the schema to align DataFrame columns
    for col_info in data_schema:
        col_name = col_info["col_name"]
        col_type = col_info["data_type"]

        if col_name in df.columns:
            # Cast column to the specified data type
            df[col_name] = df[col_name].astype(col_type)
        else:
            # Create the missing column as all-null values of the requested
            # dtype, built on df's own index so assignment needs no realignment.
            df[col_name] = pd.Series([None] * len(df), index=df.index, dtype=col_type)

    # Keep only the columns named by the schema, in schema order
    all_columns = [each["col_name"] for each in data_schema]
    df = df[all_columns]

    # Sort rows by the contract's "order_by" keys first, then by every
    # remaining schema column (duplicates removed, order preserved).
    sort_keys = list(dict.fromkeys(order_by + all_columns))
    df = df.sort_values(by=sort_keys).reset_index(drop=True)

    # Round numerical values to 2 decimal places
    if rounding:
        df = df.round(2)

    return df

In [None]:
# DataFrame to DataContract
def get_correct_datatype(input_datatype):
    """Map a data type alias to the canonical name used in this module.

    The input is converted to a string, lower-cased and stripped before the
    lookup. Unknown aliases are reported on stdout and returned in their
    normalized (lower-cased, stripped) form.
    """
    canonical_by_alias = {
        "date": "Date",
        "string": "string",
        "varchar": "string",
        "char": "string",
        "text": "string",
        "object": "string",
        "bigint": "Long",
        "int": "Long",
        "tinyint": "Long",
        "long": "Long",
        "timestamp": "Timestamp",
        "datetime": "Timestamp",
        "double": "Double",
        "float": "Double",
        "decimal": "Double",
        "bool": "Boolean",
        "boolean": "Boolean",
    }
    normalized = str(input_datatype).lower().strip()
    try:
        return canonical_by_alias[normalized]
    except KeyError:
        print(f"undefined data type => {normalized}")
        return normalized

In [None]:
# DataType	Description
# date	Store Date Only
# datetime	Store Date and Time
# string	String or character
# int	Represents 4-byte signed integer numbers. The range of numbers is from -2147483648 to 2147483647.
# long	Represents 8-byte signed integer numbers. The range of numbers is from -9223372036854775808 to 9223372036854775807
# bit	bit
# timestamp	Date with time detail in ISO format
# double	Represents 8-byte double-precision floating point numbers
# float	Represents 4-byte single-precision floating point numbers.
# boolean	True or False

In [None]:
def create_data_contract(df, schema_path):
    """Write a JSON data contract describing the columns of ``df``.

    The file holds a single object with a "data_schema" list; each entry
    records a column name ("col_name") and the string form of its pandas
    dtype ("data_type").

    Args:
        df (pd.DataFrame): DataFrame whose columns and dtypes are recorded.
        schema_path: Destination path of the JSON file (overwritten if present).
    """
    # Create the list of dictionaries
    data_schema = [
        {
            "col_name": col,
            "data_type": str(dtype),
        }
        for col, dtype in df.dtypes.to_dict().items()
    ]
    # Write explicitly as UTF-8, the encoding align_with_datacontract uses to
    # read contracts, rather than relying on the platform default.
    with open(schema_path, "w", encoding="utf-8") as json_file:
        json.dump({"data_schema": data_schema}, json_file, indent=4)

    print(f"Dictionary written to {schema_path}")

In [None]:
# Recursively replace NaN floats with empty strings in nested dicts and lists
def replace_nan_with_empty(data):
    """Return ``data`` with every float NaN swapped for an empty string.

    Dicts and lists are rebuilt recursively (dict keys are kept as they are);
    a float that is NaN becomes ""; any other value is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned[key] = replace_nan_with_empty(value)
        return cleaned
    if isinstance(data, list):
        return list(map(replace_nan_with_empty, data))
    # Only real floats are tested, so np.isnan never sees strings or None.
    is_nan_float = isinstance(data, float) and np.isnan(data)
    return "" if is_nan_float else data