In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

In [None]:
def import_file(file_path: str, **kwargs) -> pd.DataFrame:
    """Load a tabular file into a DataFrame, picking the reader from the extension.

    Supported extensions (case-insensitive): ``csv``, ``xlsx`` and ``parquet``.

    Args:
        file_path: Location of the file, as a string or ``pathlib.Path``.
        **kwargs: Forwarded unchanged to the selected pandas reader.

    Returns:
        The DataFrame produced by the pandas reader.

    Raises:
        ValueError: If the extension is not one of the supported ones.
        FileNotFoundError: If the file does not exist (re-raised after logging).
        pd.errors.ParserError: If pandas cannot parse the file (re-raised after logging).
        Exception: Any other reader failure is printed and re-raised unchanged.
    """
    readers = {
        "csv": pd.read_csv,
        "xlsx": pd.read_excel,
        "parquet": pd.read_parquet,
    }

    # Use the suffix of the final path component only, so a dot in a directory
    # name (e.g. "../data/file") is never mistaken for an extension.
    file_extension = Path(file_path).suffix.lstrip(".").lower()

    reader = readers.get(file_extension)
    if reader is None:
        # Raised outside the try block: this is a caller error, not an
        # "unexpected" failure, so it should not be reported as one.
        raise ValueError(f"Unsupported file extension: {file_extension}")

    try:
        return reader(file_path, **kwargs)
    except FileNotFoundError:
        print(f"Error: The file {file_path} does not exist.")
        raise
    except pd.errors.ParserError as e:
        print(f"Error: Parsing error for file {file_path} - {str(e)}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        raise

def month_diff(start_date, end_date):
    """Return the number of whole months between two dates.

    The difference is taken as ``relativedelta(end_date, start_date)``; its
    year component is converted to months and added to its month component.
    Any leftover days in the delta are ignored.
    """
    elapsed = relativedelta(end_date, start_date)
    return 12 * elapsed.years + elapsed.months

def adjust_composite_score(df: pd.DataFrame, composite_score_column: str) -> pd.DataFrame:
    """Add an ``adjCompositeScore`` column with the score normalised to (0, 100].

    Rules, applied in order:
      * score in (0, 100]  -> kept as is
      * score > 100        -> divided by 10
      * anything else (<= 0 or missing) -> NaN

    Args:
        df: Frame holding the raw score. It is modified in place.
        composite_score_column: Name of the column holding the raw score.

    Returns:
        The same ``df`` object, with ``adjCompositeScore`` added.
    """
    raw_score = df[composite_score_column]

    already_scaled = raw_score.between(0, 100, inclusive="right")
    over_scaled = raw_score > 100

    # np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
    df["adjCompositeScore"] = np.select(
        [already_scaled, over_scaled],
        [raw_score, raw_score / 10],
        default=np.nan,
    )
    return df

def assign_composite_rate(df: pd.DataFrame, composite_score_column) -> pd.DataFrame:
    """Add an ``adjCompositeRate`` column mapping a score to a rating grade.

    The grade scale (A, B1..B4, C1..C3) depends on the ``Portfolio`` code:
      * "N", "C" -> corporate scale
      * "E", "M" -> SME scale
      * "R"      -> retail SME scale
      * "P"      -> project finance scale

    Every band is closed on the left: a score equal to a lower bound receives
    that band's grade. Rows with a missing score or with a portfolio code not
    listed above get ``None``.

    Args:
        df: Frame with a ``Portfolio`` column and the score column. It is
            modified in place.
        composite_score_column: Name of the column holding the (adjusted)
            composite score used for grading.

    Returns:
        The same ``df`` object, with ``adjCompositeRate`` added.
    """
    grades = ["A", "B1", "B2", "B3", "B4", "C1", "C2", "C3"]

    # (portfolio codes, lower bounds for grades A..C2); scores below the last
    # bound are graded C3.
    rating_scales = [
        (["N", "C"], [92, 89, 86, 81, 78, 74, 70]),  # Corporate
        (["E", "M"], [96.323, 92.546, 88.769, 84.991, 81.214, 77.437, 73.660]),  # SMEs
        (["R"], [93, 86, 80, 72, 61, 45, 35]),  # Retail SMEs
        (["P"], [83, 78, 70, 65, 61, 58, 52]),  # Project Finance
    ]

    score = df[composite_score_column]
    has_score = score.notna()

    # Object array pre-filled with None so unmatched rows stay missing instead
    # of picking up a numeric placeholder.
    rate = np.full(len(df), None, dtype=object)

    for portfolio_codes, lower_bounds in rating_scales:
        gradable = (df["Portfolio"].isin(portfolio_codes) & has_score).to_numpy()

        # Start from the lowest grade and promote row by row: each pass
        # overwrites the grade of every row that clears the next lower bound.
        rate[gradable] = grades[-1]
        for grade, lower_bound in zip(reversed(grades[:-1]), reversed(lower_bounds)):
            rate[gradable & (score >= lower_bound).to_numpy()] = grade

    df["adjCompositeRate"] = rate

    return df

In [None]:
def create_additional_columns(df: pd.DataFrame, last_date="2024-03-31") -> pd.DataFrame:
    """Derive the date, default-flag and performance-period columns.

    Reads ``RatingDate``, ``NPLMonthAdj``, ``NextRatingDate`` and
    ``CustomerRefID`` and adds, in this order:

      * ``NextYear``: ``RatingDate`` plus one year.
      * ``DefaultFlag12M``: 1 when ``NPLMonthAdj`` is present and not later
        than ``NextYear``, else 0.
      * ``NPLMonthAdjEOM`` / ``RatingDateEOM`` / ``NextRatingDateEOM`` /
        ``NextYearEOM``: the corresponding date rolled forward to month end.
      * ``LastDate``: the observation cut-off (``last_date``).
      * ``ExistingDefaultFlag``: 1 when ``RatingDateEOM`` equals
        ``NPLMonthAdjEOM``, else 0.
      * ``EndofPeriod``: ``NextRatingDateEOM`` when a next rating exists,
        otherwise ``NextYearEOM``; in both cases capped at ``LastDate``.
      * ``PerformancePeriod``: ``month_diff(RatingDateEOM, EndofPeriod)``.
      * ``MoreThan12MFlag``: 1 when ``PerformancePeriod`` >= 12, else 0.
      * ``MaxMoreThan12MFlag``: per-customer maximum of ``MoreThan12MFlag``.

    Args:
        df: Input frame; it is modified in place.
        last_date: Observation cut-off, anything ``pd.to_datetime`` accepts.
            Defaults to the previously hard-coded ``"2024-03-31"``.

    Returns:
        The same ``df`` object with the columns above added.
    """
    # MonthEnd(n=0) rolls a date forward to its month end and leaves dates
    # that already fall on a month end untouched.
    to_month_end = pd.offsets.MonthEnd(n=0)

    df["NextYear"] = df["RatingDate"] + pd.DateOffset(years=1)
    df["DefaultFlag12M"] = np.where(
        (df["NPLMonthAdj"].notnull()) & (df["NextYear"] >= df["NPLMonthAdj"]), 1, 0
    )
    df["NPLMonthAdjEOM"] = df["NPLMonthAdj"] + to_month_end
    df["RatingDateEOM"] = df["RatingDate"] + to_month_end
    df["NextRatingDateEOM"] = df["NextRatingDate"] + to_month_end
    df["NextYearEOM"] = df["NextYear"] + to_month_end
    df["LastDate"] = pd.to_datetime(last_date)

    df["ExistingDefaultFlag"] = np.where(df["RatingDateEOM"] == df["NPLMonthAdjEOM"], 1, 0)

    # The period ends at the next rating if there is one, otherwise one year
    # after the rating date.
    uncapped_end = df["NextRatingDateEOM"].where(
        df["NextRatingDate"].notnull(), df["NextYearEOM"]
    )
    # Cap at the cut-off date. The trailing .where() restores NaT for rows
    # whose uncapped end is missing, which the cap would otherwise replace
    # with LastDate (a comparison against NaT is always False).
    df["EndofPeriod"] = uncapped_end.where(
        uncapped_end < df["LastDate"], df["LastDate"]
    ).where(uncapped_end.notnull())

    # Pairwise iteration instead of df.apply(axis=1): same values, no per-row
    # Series construction, and it also works on an empty frame.
    df["PerformancePeriod"] = [
        month_diff(period_start, period_end)
        for period_start, period_end in zip(df["RatingDateEOM"], df["EndofPeriod"])
    ]

    df["MoreThan12MFlag"] = np.where(df["PerformancePeriod"] >= 12, 1, 0)
    df["MaxMoreThan12MFlag"] = df.groupby(["CustomerRefID"])["MoreThan12MFlag"].transform(
        "max"
    )

    return df

def _rank_customer_observations(subset: pd.DataFrame, sort_keys: list) -> pd.DataFrame:
    """Sort ``subset`` descending on ``sort_keys`` and number each customer's rows."""
    ranked = subset.sort_values(sort_keys, ascending=False)
    # 1-based position of the row within its customer, in the sorted order.
    ranked["RequestIDOrder"] = ranked.groupby(["CustomerRefID"]).cumcount() + 1
    ranked["FlagBaseObs"] = np.where(ranked["RequestIDOrder"] == 1, 1, 0)
    return ranked


def split_by_performance_period(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split customers by ``MaxMoreThan12MFlag`` and flag one base row per customer.

    Rows with ``MaxMoreThan12MFlag == 1`` and rows with ``MaxMoreThan12MFlag == 0``
    are handled separately. Within each subset the rows are sorted in descending
    order and the first row of every ``CustomerRefID`` gets ``FlagBaseObs = 1``
    (all others 0). ``RequestIDOrder`` holds the 1-based position of the row
    within its customer.

    Sort keys (all descending):
      * flag == 1 subset: CustomerRefID, DefaultFlag, MoreThan12MFlag, RatingDate
      * flag == 0 subset: CustomerRefID, DefaultFlag, RatingDate

    NOTE(review): the sort relies on a ``DefaultFlag`` column that must already
    be present in ``df`` - confirm that this, rather than ``DefaultFlag12M``,
    is the intended key.

    Args:
        df: Frame containing the columns named above. It is not modified.

    Returns:
        A 2-tuple ``(cust_12m_df, cust_less_than_12m_df)`` of new DataFrames.
    """
    cust_12m_df = _rank_customer_observations(
        df.loc[df["MaxMoreThan12MFlag"] == 1],
        ["CustomerRefID", "DefaultFlag", "MoreThan12MFlag", "RatingDate"],
    )
    cust_less_than_12m_df = _rank_customer_observations(
        df.loc[df["MaxMoreThan12MFlag"] == 0],
        ["CustomerRefID", "DefaultFlag", "RatingDate"],
    )

    return cust_12m_df, cust_less_than_12m_df

In [None]:
# Input locations of the master base tables (corporate and SME).
base_corporate_path, base_sme_path = (
    "../data/processed/01_master_data/base_201910_202307_corporate.parquet",
    "../data/processed/01_master_data/base_201910_202307_sme.parquet",
)

In [None]:
# Corporate base: tag requests whose ID starts with "C" or "N" as portfolio "C".
base_corp_df = import_file(base_corporate_path)
corp_request_id = base_corp_df["RequestID"]
# NOTE(review): these two IDs carry an SME-style prefix but are tagged as
# corporate here - presumably manual overrides; confirm with the data owner.
is_corporate_request = corp_request_id.isin(["E20120016", "M23090003"]) | corp_request_id.str[
    0
].isin(["C", "N"])
# None (not np.NaN) as the fallback: np.NaN no longer exists in NumPy 2.0, and
# mixing a string with a float NaN in np.where stores the text "nan" on older
# NumPy instead of a real missing value.
base_corp_df["Portfolio"] = np.where(is_corporate_request, "C", None)

# SME base: tag requests whose ID starts with "M" or "E" as portfolio "M".
base_sme_df = import_file(base_sme_path)
base_sme_df["Portfolio"] = np.where(
    base_sme_df["RequestID"].str[0].isin(["M", "E"]), "M", None
)

print(base_corp_df.shape)
print(base_sme_df.shape)

In [None]:
# Corporate: normalise the raw composite score, then map it to a rating grade.
base_corp_df_1 = adjust_composite_score(base_corp_df, "CompositeScore")
base_corp_df_2 = assign_composite_rate(base_corp_df_1, "adjCompositeScore")

# SME: same two steps.
base_sme_df_1 = adjust_composite_score(base_sme_df, "CompositeScore")
base_sme_df_2 = assign_composite_rate(base_sme_df_1, "adjCompositeScore")

for rated_frame in (base_corp_df_2, base_sme_df_2):
    print(rated_frame.shape)

In [None]:
# Corporate: derive the period columns, rank each customer's observations and
# keep only the row flagged as the base observation.
base_corp_df_3 = create_additional_columns(base_corp_df_2)
base_corp_df_12m, base_corp_df_lt12m = split_by_performance_period(base_corp_df_3)
corp_ranked_df = pd.concat([base_corp_df_12m, base_corp_df_lt12m], ignore_index=True)
final_corp_data = corp_ranked_df.query("FlagBaseObs == 1")

# SME: same pipeline.
base_sme_df_3 = create_additional_columns(base_sme_df_2)
base_sme_df_12m, base_sme_df_lt12m = split_by_performance_period(base_sme_df_3)
sme_ranked_df = pd.concat([base_sme_df_12m, base_sme_df_lt12m], ignore_index=True)
final_sme_data = sme_ranked_df.query("FlagBaseObs == 1")

In [None]:
# Row/column counts of the sampled corporate and SME tables.
for sampled_frame in (final_corp_data, final_sme_data):
    print(sampled_frame.shape)

In [None]:
# Persist both sampled tables, first as parquet and then as CSV.
export_jobs = [
    (final_corp_data.to_parquet, "../data/processed/02_data_sampling/corporate_customer_data.parquet"),
    (final_sme_data.to_parquet, "../data/processed/02_data_sampling/sme_customer_data.parquet"),
    (final_corp_data.to_csv, "../data/processed/02_data_sampling/corporate_customer_data.csv"),
    (final_sme_data.to_csv, "../data/processed/02_data_sampling/sme_customer_data.csv"),
]
for write_table, destination in export_jobs:
    write_table(destination)