In [None]:
from pathlib import Path

import pandas as pd

In [None]:
def import_file(file_path: "str | Path", **kwargs) -> pd.DataFrame:
    """Read a CSV, Excel (.xlsx) or Parquet file into a DataFrame.

    The pandas reader is selected from the file's extension, compared
    case-insensitively.

    Args:
        file_path: Location of the file, given as a string or as a path-like
            object such as ``pathlib.Path``.
        **kwargs: Passed through unchanged to the selected pandas reader
            (``read_csv``, ``read_excel`` or ``read_parquet``).

    Returns:
        The DataFrame produced by the selected reader.

    Raises:
        ValueError: If the extension is not ``csv``, ``xlsx`` or ``parquet``.
        FileNotFoundError: If the reader cannot find the file.
        pandas.errors.ParserError: If the reader cannot parse the file.
        Exception: Any other failure. Every error is printed and then
            re-raised, so callers still see the original exception.
    """
    readers = {
        "csv": pd.read_csv,
        "xlsx": pd.read_excel,
        "parquet": pd.read_parquet,
    }

    try:
        # Path.suffix looks only at the final path component, so a dot in a
        # directory name is never mistaken for the extension, and path-like
        # inputs work as well as plain strings.
        file_extension = Path(file_path).suffix[1:].lower()

        reader = readers.get(file_extension)
        if reader is None:
            raise ValueError(f"Unsupported file extension: {file_extension}")

        return reader(file_path, **kwargs)

    except FileNotFoundError:
        print(f"Error: The file {file_path} does not exist.")
        raise
    except pd.errors.ParserError as e:
        print(f"Error: Parsing error for file {file_path} - {str(e)}")
        raise
    except Exception as e:
        print(f"An unexpected error occured: {str(e)}")
        raise
    
def summary_good_bad(df: pd.DataFrame, fruad_exclusion: bool) -> pd.DataFrame:
    """Count good and bad rows per composite rating and compute the bad rate.

    Only rows with ``ExistingDefaultFlag == 0`` are counted. When
    ``fruad_exclusion`` is truthy, rows whose ``FruadFlag`` is not null are
    dropped as well.

    ``DefaultFlag12M`` is read as 0 = good and 1 = bad. Both output columns
    are always present, so a segment that contains only goods (or only bads)
    yields a zero count instead of failing.

    Args:
        df: Frame with columns ``ExistingDefaultFlag``, ``adjCompositeRate``
            and ``DefaultFlag12M``, plus ``FruadFlag`` when
            ``fruad_exclusion`` is truthy.
        fruad_exclusion: Whether to drop rows that carry a fraud flag.

    Returns:
        One row per ``adjCompositeRate`` value, sorted by rating, with columns
        ``CompositeRate``, ``Good``, ``Bad`` and ``BadRate``
        (``Bad / (Good + Bad)``).

    Raises:
        ValueError: If ``DefaultFlag12M`` holds a value other than 0 or 1.
    """
    keep = df["ExistingDefaultFlag"] == 0
    if fruad_exclusion:
        keep = keep & df["FruadFlag"].isna()

    # Unstacking the counted Series avoids depending on the name pandas gives
    # the count column after reset_index(), which differs between versions.
    counts = (
        df.loc[keep, ["adjCompositeRate", "DefaultFlag12M"]]
        .value_counts(sort=False)
        .unstack("DefaultFlag12M")
        .sort_index()
    )

    unexpected = [label for label in counts.columns if label not in (0, 1)]
    if unexpected:
        raise ValueError(
            f"DefaultFlag12M must be coded 0 (good) / 1 (bad); found {unexpected}"
        )

    # Name the columns by flag value rather than by position, then reindex so
    # that a class absent from the data still gets a column.
    df_pivot = (
        counts.rename(columns={0: "Good", 1: "Bad"})
        .reindex(columns=["Good", "Bad"])
        .fillna(0)
    )
    df_pivot.index.name = "CompositeRate"
    df_pivot.columns.name = None
    df_pivot = df_pivot.reset_index()

    df_pivot["BadRate"] = df_pivot["Bad"] / (df_pivot["Good"] + df_pivot["Bad"])

    return df_pivot

In [None]:
# Load the three sampled customer segments (corporate, SME, RSME) in order.
corp_df, sme_df, rsme_df = map(
    import_file,
    (
        "../data/processed/02_data_sampling/corporate_customer_data.parquet",
        "../data/processed/02_data_sampling/sme_customer_data.parquet",
        "../data/processed/02_data_sampling/Retail_202005_202403.parquet",
    ),
)

# Fraud list: read the customer ID as text and the flag as an integer.
fruad_df = import_file(
    "../data/raw/Fruads/FruadList.xlsx",
    dtype=dict(CustomerRefID=str, FruadFlag=int),
)

# One (rows, columns) shape per line, in load order.
print(corp_df.shape, sme_df.shape, rsme_df.shape, fruad_df.shape, sep="\n")

In [None]:
# Join fraud flag.
# Keep a single row per customer in the fraud list first: a repeated
# CustomerRefID would fan out the left join and duplicate that customer's rows,
# which the good/bad summaries would then count more than once.
fruad_lookup = fruad_df.drop_duplicates(subset="CustomerRefID")

# validate="many_to_one" states the intended cardinality: many customer rows
# may match at most one fraud-list row.
corp_df_1 = corp_df.merge(
    fruad_lookup, how="left", on="CustomerRefID", validate="many_to_one"
)
sme_df_1 = sme_df.merge(
    fruad_lookup, how="left", on="CustomerRefID", validate="many_to_one"
)
rsme_df_1 = rsme_df.merge(
    fruad_lookup, how="left", on="CustomerRefID", validate="many_to_one"
)

In [None]:
# Summarise each segment twice: first with fraud-flagged rows kept
# (fruad_exclusion=False), then with them removed (fruad_exclusion=True).
corporate_summary_df, corporate_summary_ex_fruad_df = (
    summary_good_bad(corp_df_1, exclude) for exclude in (False, True)
)

sme_summary_df, sme_summary_ex_fruad_df = (
    summary_good_bad(sme_df_1, exclude) for exclude in (False, True)
)

rsme_summary_df, rsme_summary_ex_fruad_df = (
    summary_good_bad(rsme_df_1, exclude) for exclude in (False, True)
)

In [None]:
# Corporate good/bad counts and bad rate per rating, fraud-flagged rows included.
corporate_summary_df

In [None]:
# Corporate good/bad counts and bad rate per rating, fraud-flagged rows excluded.
corporate_summary_ex_fruad_df

In [None]:
# SME good/bad counts and bad rate per rating, fraud-flagged rows included.
sme_summary_df

In [None]:
# SME good/bad counts and bad rate per rating, fraud-flagged rows excluded.
sme_summary_ex_fruad_df

In [None]:
# RSME good/bad counts and bad rate per rating, fraud-flagged rows included.
rsme_summary_df

In [None]:
# RSME good/bad counts and bad rate per rating, fraud-flagged rows excluded.
rsme_summary_ex_fruad_df

In [None]:
# Persist every rating summary as Parquet.
SUMMARY_DIR = Path("../data/processed/03_rating_summary")

# Create the output folder if it is missing so the export also works on a
# fresh checkout; an existing folder is left untouched.
SUMMARY_DIR.mkdir(parents=True, exist_ok=True)

# File stem -> summary frame; each is written to SUMMARY_DIR/<stem>.parquet.
summaries_to_save = {
    "corporate_rating_summary": corporate_summary_df,
    "corporate_rating_summary_ex_fruad": corporate_summary_ex_fruad_df,
    "sme_rating_summary": sme_summary_df,
    "sme_rating_summary_ex_fruad": sme_summary_ex_fruad_df,
    "rsme_rating_summary": rsme_summary_df,
    "rsme_rating_summary_ex_fruad": rsme_summary_ex_fruad_df,
}

for file_stem, summary_df in summaries_to_save.items():
    summary_df.to_parquet(SUMMARY_DIR / f"{file_stem}.parquet")