# import package

In [None]:
import pandas as pd
import numpy as np
import math
import os
import warnings
from datetime import date
from scipy.stats import t
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
pd.options.mode.chained_assignment = None  # default='warn'

# function

In [None]:
# npl
def adjust_thai_year(date_str):
    """Convert a Buddhist-era date string to a Gregorian ISO date string.

    The input is expected to look like ``"YYYY-MM-DD"`` optionally followed
    by whitespace and a time part (only the first whitespace-separated token
    is used). 543 is subtracted from the year.

    Args:
        date_str: The date text to convert.

    Returns:
        The date formatted as ``"YYYY-MM-DD"``, or ``None`` when the input is
        not a string or cannot be parsed into three integer parts.
    """
    try:
        date_part = date_str.split()[0]
        year, month, day = map(int, date_part.split('-'))
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: non-string input (None, NaN);
        # IndexError: empty/blank string;
        # ValueError: wrong number of parts or non-numeric parts;
        # TypeError: input whose split() does not accept a str separator.
        return None
    return f"{year - 543:04d}-{month:02d}-{day:02d}"

def import_npl(npl_directory):
    """Load ``npl_list.parquet`` from *npl_directory* and normalise it.

    The ``NPLMonth`` column is renamed to ``NPL month_TH``; a new
    ``NPL month`` datetime column is derived from it via
    ``adjust_thai_year`` (unparseable values become NaT).
    ``CustomerRefID`` is converted to a string left-padded with zeros to
    7 characters (missing values become ``'0000000'``). The min/max
    ``NPL month`` range is printed.

    Returns:
        The loaded and normalised DataFrame.
    """
    npl = pd.read_parquet(os.path.join(npl_directory, 'npl_list.parquet'))
    npl = npl.rename(columns={'NPLMonth': 'NPL month_TH'})

    # Buddhist-era text -> Gregorian text -> datetime64
    gregorian_text = npl['NPL month_TH'].apply(adjust_thai_year)
    npl['NPL month'] = pd.to_datetime(gregorian_text, format="%Y-%m-%d", errors='coerce')

    npl_dates = npl['NPL month'].dt.date
    datemin = npl_dates.min()
    datemax = npl_dates.max()

    customer_ids = npl['CustomerRefID'].fillna('').astype(str)
    npl['CustomerRefID'] = customer_ids.apply(lambda ref: ref.zfill(7))

    datarange2 = str(datemin) + '_' + str(datemax)
    print('ระยะข้อมูล npl', datarange2)
    return npl

def data_preparation(df,mode,name): 
    """Filter a rating table down to usable, latest-per-request rows.

    Steps (a row count is printed after each one; the printed step labels
    are kept exactly as they were):
      * drop rows with a null ``RatingDate`` and parse it to datetime;
      * keep, for every ``RequestID``, only the rows carrying that
        request's maximum ``RatingDate`` (index is reset afterwards);
      * drop placeholder / missing ``CustomerRefID`` values;
      * drop rows whose ``RatingYear`` equals 0;
      * when ``name == 'df_Composite'``, additionally require non-null
        ``CompositeScore`` and ``CompositeRate``.

    Args:
        df: input DataFrame (not modified).
        mode: unused by this function.
        name: dataset tag; only ``'df_Composite'`` triggers the score filters.

    Returns:
        The filtered DataFrame.
    """
    print('1.Start: {}\n'.format(len(df)))
    rated = df.dropna(subset=['RatingDate'])
    print('2.drop RatingDate = null: {}\n'.format(len(rated)))

    # Parse dates, order newest first, then keep each request's newest rows.
    rated = rated.assign(RatingDate=pd.to_datetime(rated['RatingDate']))
    rated = rated.sort_values(by='RatingDate', ascending=False)
    newest_per_request = rated.groupby('RequestID')['RatingDate'].transform('max')
    df_filter = rated[rated['RatingDate'] == newest_per_request].copy()
    df_filter.reset_index(drop=True, inplace=True)
    print('3.keep latest RatingDate for each RequestID: {}\n'.format(len(df_filter)))

    # Reject missing customer ids and the known placeholder values.
    customer_ref = df_filter['CustomerRefID']
    usable = customer_ref.notnull()
    for placeholder in ('0', '0000000', '9999999', ''):
        usable = usable & (customer_ref != placeholder)
    retail_status = df_filter[usable]
    print('4.customerRefID != 0 null or 999999: {}\n'.format(len(retail_status)))

    retail_final = retail_status[retail_status['RatingYear'] != 0]
    print('6.keep Rating year!=0 : {}\n'.format(len(retail_final)))

    if name != 'df_Composite':
        return retail_final

    # Composite datasets must carry both a score and a rate.
    retail_final = retail_final[retail_final['CompositeScore'].notnull()]
    print('7.composite score not null : {}\n'.format(len(retail_final)))
    retail_final = retail_final[retail_final['CompositeRate'].notnull()]
    print('8.compositerate not null : {}\n'.format(len(retail_final)))
    return retail_final

def mask_flag12_requestID(df):
    """Add the ``flag`` and ``flag12`` indicator columns, grouped by RequestID.

    ``flag`` is 1 when ``RatingDate`` and ``NPL month`` fall in the same
    calendar year and month, else 0. It is written onto the passed-in
    *df* as well.

    ``flag12`` is 1 for a row when any ``NPL month`` within the same
    ``RequestID`` group lies in the closed interval
    [RatingDate, RatingDate + 12 months], else 0.

    Returns:
        A new DataFrame ordered by RequestID then RatingDate with both columns.
    """
    def _same_calendar_month(row):
        rated, defaulted = row['RatingDate'], row['NPL month']
        if rated.year == defaulted.year and rated.month == defaulted.month:
            return 1
        return 0

    df['flag'] = df.apply(_same_calendar_month, axis=1)
    ordered = df.sort_values(by=['RequestID', 'RatingDate'])

    flagged_groups = []
    for _, request_rows in ordered.groupby('RequestID'):
        request_rows['flag12'] = 0
        npl_months = request_rows['NPL month']
        for label, rating_date in zip(request_rows.index, request_rows['RatingDate']):
            window_end = rating_date + pd.DateOffset(months=12)
            # Any NPL month of this request inside the 12-month window?
            inside_window = (npl_months >= rating_date) & (npl_months <= window_end)
            if inside_window.any():
                request_rows.at[label, 'flag12'] = 1
        flagged_groups.append(request_rows)

    # Stitch the per-request pieces back together.
    return pd.concat(flagged_groups)
    
def request_prepare_RequestID(x,mode):
    """Build the request-level table with next-rating and NPL window columns.

    The input is cleaned with ``data_preparation`` (name 'df_Composite'),
    then for every customer the following columns are derived:
    ``nextRatingdate`` (the customer's following RatingDate), ``will npl``
    (1 where 'NPL month' is after RatingDate), and ``month_diff`` (whole
    calendar months between RatingDate and nextRatingdate). Per RequestID,
    rows whose NPL month falls outside [RatingDate, nextRatingdate] are
    removed, or, if none qualifies, only the last row is kept with its
    'NPL month' blanked. Finally ``mask_flag12_requestID`` adds ``flag``
    and ``flag12``.

    Args:
        x: DataFrame that already contains an 'NPL month' column (this
            function reads it but never creates it).
        mode: forwarded unchanged to ``data_preparation``.

    Returns:
        The prepared DataFrame.
    """
    
    merged_df1 = data_preparation(x,mode,'df_Composite')
    requestID_df= merged_df1 
    
    # Add nextRatingdate: within each customer, the RatingDate of the following row
    # (after sorting by RatingDate); the customer's last row gets NaT here.
    df_Com=requestID_df
    groupdf=df_Com.groupby('CustomerRefID')
    # NOTE(review): the local name `list` shadows the builtin inside this function.
    list=[]
    for name,group in groupdf:
        group=group.sort_values('RatingDate')
        group['nextRatingdate']=group['RatingDate'].shift(-1)
        list.append(group)
    requestID_df=pd.concat(list)

    # No merge happens here; the messages below report the size of the frame as received
    # (row count and number of distinct customers).
    data_merge_npl=requestID_df
    print('หลังmerge npl มีกี่row',len(data_merge_npl))
    print('จำนวนcustomerRefID',data_merge_npl['CustomerRefID'].nunique())

    groupdf=data_merge_npl.groupby('CustomerRefID')
    list=[]
    for n,group in groupdf:
        # 'mask' starts as 1 (keep) and is overwritten with 'drop' for rows to discard.
        group['mask']=1
        # NPL occurs after this rating -> the customer will default later.
        mask=  (group['NPL month'] > group['RatingDate'])
        group.loc[mask , 'will npl'] = 1
        # NPL occurred before this rating -> mark for dropping.
        mask= (group['NPL month'] < group['RatingDate']) 
        group.loc[mask , 'mask'] ='drop'
        # NPL occurred after the next rating -> mark for dropping.
        mask=  (group['NPL month'] > group['nextRatingdate'])
        group.loc[mask , 'mask'] = 'drop'
        # Replace any None entries in 'mask' with NaN.
        group['mask'] = group['mask'].apply(lambda x: np.nan if x is None else x)
        # Rows without a following rating use the hard-coded date 2024-03-31 as the window end.
        group['nextRatingdate'] = group['nextRatingdate'].fillna(pd.Timestamp('2024-03-31'))
        # Calendar-month difference (year*12 + month); days are ignored.
        # After the fill above nextRatingdate has no missing values, so the else branch is not expected to trigger.
        group['month_diff'] = group.apply(lambda row: (row['nextRatingdate'].year - row['RatingDate'].year) * 12 + (row['nextRatingdate'].month - row['RatingDate'].month) if pd.notnull(row['nextRatingdate']) else None, axis=1)
        inside=group.groupby('RequestID')
        list_inside=[]
        for i,p in inside:
            if (p['mask'] == 1).any():
                # At least one row is still marked keep: retain only those rows.
                p=p[p['mask']==1]
            else: 
                # Every row was marked 'drop': keep only the last row of the request
                p=p.iloc[-1:]
                # and clear its NPL month so it is not treated as a default.
                p['NPL month']=np.nan
            list_inside.append(p)
        group=pd.concat(list_inside)
        list.append(group)

    x=pd.concat(list)
    # The helper 'mask' column is no longer needed.
    x.drop(columns=['mask'],inplace=True)
    requestID_df=x
    print('ใส่ nextratingdate finish')
    # Add the same-month flag and the 12-month default flag.
    requestID_df=mask_flag12_requestID(requestID_df)
    print('flag12month finish')

    return requestID_df

def change_to_customer_level(requestID_df):
    """Collapse request-level rows to one row per (portfolio, customer).

    For every ``RequestID_Char`` value and every ``CustomerRefID`` within
    it, rows are sorted by ``RatingDate``, ``NPL month`` is forward-filled,
    and exactly one row is selected by priority:

      1. the last row with ``will npl == 1``;
      2. otherwise the last row with ``month_diff >= 12``;
      3. otherwise the last row.

    A ``defaultflag`` column is added: 1 where the selected row has a
    non-null ``NPL month``, else 0.

    Args:
        requestID_df: request-level DataFrame with the columns named above.

    Returns:
        A new DataFrame with one row per customer and portfolio. An input
        that yields no groups returns an empty frame with the same columns
        plus ``defaultflag``.
    """
    selected_rows = []
    for char in requestID_df['RequestID_Char'].unique():
        print(char)
        df_port = requestID_df[requestID_df['RequestID_Char'] == char]
        for _, df in df_port.groupby('CustomerRefID'):
            df = df.sort_values('RatingDate')
            # Series.ffill() replaces the deprecated fillna(method='ffill').
            df['NPL month'] = df['NPL month'].ffill()

            will_default = df[df['will npl'] == 1]
            long_window = df[df['month_diff'] >= 12]
            if len(will_default) != 0:
                # Has an upcoming NPL: take the most recent such rating.
                chosen = will_default
            elif len(long_window) != 0:
                # No NPL: prefer a rating with an observation window >= 12 months.
                chosen = long_window
            else:
                # No NPL and no long window: fall back to the most recent rating.
                chosen = df
            selected_rows.append(chosen.iloc[-1:])

    if selected_rows:
        customer_df = pd.concat(selected_rows)
    else:
        # pd.concat([]) raises; return an empty frame with the same schema instead.
        customer_df = requestID_df.iloc[0:0].copy()

    # Add the default flag.
    customer_df['defaultflag'] = 0
    customer_df.loc[customer_df['NPL month'].notnull(), 'defaultflag'] = 1
    print('finish change to customer level')
    return customer_df


# latest rating date for each customer
def data_preparation2(df,name):
    """Filter a rating table to usable rows, ordered newest RatingDate first.

    Steps (a row count is printed after each one; the printed step labels
    are kept exactly as they were):
      * drop rows with a null ``RatingDate``, parse it to datetime and
        sort descending by it;
      * drop placeholder / missing ``CustomerRefID`` values;
      * drop rows whose ``RatingYear`` equals 0;
      * when ``name == 'df_Composite'``, additionally require non-null
        ``CompositeScore`` and ``CompositeRate``.

    Args:
        df: input DataFrame (not modified).
        name: dataset tag; only ``'df_Composite'`` triggers the score filters.

    Returns:
        The filtered DataFrame (original index labels are retained).
    """
    print('1.Start: {}\n'.format(len(df)))
    rated = df.dropna(subset=['RatingDate'])
    print('2.drop RatingDate = null: {}\n'.format(len(rated)))

    # Parse the dates and order the rows newest first.
    rated = rated.assign(RatingDate=pd.to_datetime(rated['RatingDate']))
    rated = rated.sort_values(by='RatingDate', ascending=False)

    # Reject missing customer ids and the known placeholder values.
    customer_ref = rated['CustomerRefID']
    usable = customer_ref.notnull()
    for placeholder in ('0', '0000000', '9999999', ''):
        usable = usable & (customer_ref != placeholder)
    retail_status = rated[usable]
    print('4.customerRefID != 0 null or 999999: {}\n'.format(len(retail_status)))

    retail_final = retail_status[retail_status['RatingYear'] != 0]
    print('6.keep Rating year!=0 : {}\n'.format(len(retail_final)))

    if name != 'df_Composite':
        return retail_final

    # Composite datasets must carry both a score and a rate.
    retail_final = retail_final[retail_final['CompositeScore'].notnull()]
    print('7.composite score not null : {}\n'.format(len(retail_final)))
    retail_final = retail_final[retail_final['CompositeRate'].notnull()]
    print('8.compositerate not null : {}\n'.format(len(retail_final)))
    return retail_final

def mask_flag12(df):
    """Add the ``flag`` and ``flag12`` indicator columns row by row.

    ``flag`` is 1 when ``RatingDate`` and ``NPL month`` fall in the same
    calendar year and month, else 0. It is written onto the passed-in
    *df* as well.

    ``flag12`` is 1 when the row's own ``NPL month`` lies in the closed
    interval [RatingDate, RatingDate + 12 months], else 0.

    Returns:
        A new DataFrame ordered by CustomerRefID then RatingDate with both columns.
    """
    def _same_calendar_month(row):
        rated, defaulted = row['RatingDate'], row['NPL month']
        if rated.year == defaulted.year and rated.month == defaulted.month:
            return 1
        return 0

    df['flag'] = df.apply(_same_calendar_month, axis=1)
    ordered = df.sort_values(by=['CustomerRefID', 'RatingDate'])
    ordered['flag12'] = 0

    row_values = zip(ordered.index, ordered['RatingDate'], ordered['NPL month'])
    for label, rating_date, npl_month in row_values:
        window_end = rating_date + pd.DateOffset(months=12)
        # Default happened within 12 months after (or on) the rating date.
        if (npl_month >= rating_date) and (npl_month <= window_end):
            ordered.at[label, 'flag12'] = 1

    return ordered
    
def request_prepare(x):
    """Build the rating table with next-rating and NPL window columns.

    The input is cleaned with ``data_preparation2`` (name 'df_Composite'),
    then for every customer the following columns are derived:
    ``nextRatingdate`` (the customer's following RatingDate), ``will npl``
    (1 where 'NPL month' is after RatingDate), and ``month_diff`` (whole
    calendar months between RatingDate and nextRatingdate). Finally
    ``mask_flag12`` adds ``flag`` and ``flag12``.

    Args:
        x: DataFrame that already contains an 'NPL month' column (this
            function reads it but never creates it).

    Returns:
        The prepared DataFrame.
    """
    merged_df1 = data_preparation2(x,'df_Composite')
    requestID_df= merged_df1 
    
    # Add nextRatingdate: within each customer, the RatingDate of the following row
    # (after sorting by RatingDate); the customer's last row gets NaT here.
    df_Com=requestID_df
    groupdf=df_Com.groupby('CustomerRefID')
    # NOTE(review): the local name `list` shadows the builtin inside this function.
    list=[]
    for name,group in groupdf:
        group=group.sort_values('RatingDate')
        group['nextRatingdate']=group['RatingDate'].shift(-1)
        list.append(group)
    requestID_df=pd.concat(list)

    # No merge happens here; the messages below report the size of the frame as received
    # (row count and number of distinct customers).
    data_merge_npl=requestID_df
    print('หลังmerge npl มีกี่row',len(data_merge_npl))
    print('จำนวนcustomerRefID',data_merge_npl['CustomerRefID'].nunique())

    groupdf=data_merge_npl.groupby('CustomerRefID')
    list=[]
    for n,group in groupdf:
        # 'mask' is populated below but the column is dropped after the loop without being used for filtering.
        group['mask']=1
        # NPL occurs after this rating -> the customer will default later.
        mask=  (group['NPL month'] > group['RatingDate'])
        group.loc[mask , 'will npl'] = 1
        # NPL occurred before this rating.
        mask= (group['NPL month'] < group['RatingDate']) 
        group.loc[mask , 'mask'] ='drop'
        # NPL occurred after the next rating.
        mask=  (group['NPL month'] > group['nextRatingdate'])
        group.loc[mask , 'mask'] = 'drop'
        # Replace any None entries in 'mask' with NaN.
        group['mask'] = group['mask'].apply(lambda x: np.nan if x is None else x)
        # Rows without a following rating use the hard-coded date 2024-03-31 as the window end.
        group['nextRatingdate'] = group['nextRatingdate'].fillna(pd.Timestamp('2024-03-31'))
        # Calendar-month difference (year*12 + month); days are ignored.
        # After the fill above nextRatingdate has no missing values, so the else branch is not expected to trigger.
        group['month_diff'] = group.apply(lambda row: (row['nextRatingdate'].year - row['RatingDate'].year) * 12 + (row['nextRatingdate'].month - row['RatingDate'].month) if pd.notnull(row['nextRatingdate']) else None, axis=1)
        list.append(group)

    x=pd.concat(list)
    # The helper 'mask' column is discarded here.
    x.drop(columns=['mask'],inplace=True)
    requestID_df=x
    print('ใส่ nextratingdate finish')
    # Add the same-month flag and the 12-month default flag.
    requestID_df=mask_flag12(requestID_df)
    print('flag12month finish')
    return requestID_df

def remove_duplicated(cus_retail3_df):
    """Keep exactly one row per ``CustomerRefID``.

    For customers that appear more than once:
      * if any row has ``defaultflag == 1``, keep the defaulted row with
        the latest ``RatingDate``;
      * otherwise keep the row with the latest ``RatingDate``.

    Customers that appear once are passed through unchanged.

    Previously the per-customer sort result was discarded, so "latest"
    depended on the incoming row order; the group is now actually sorted.
    Selected rows are kept as one-row frames so column dtypes survive.

    Args:
        cus_retail3_df: DataFrame with ``CustomerRefID``, ``RatingDate``
            and ``defaultflag`` columns.

    Returns:
        A new DataFrame sorted by ``RatingDate``.
    """
    is_duplicated = cus_retail3_df.duplicated('CustomerRefID', keep=False)
    duplicated_df = cus_retail3_df[is_duplicated]

    keep = []
    for _, group in duplicated_df.groupby('CustomerRefID'):
        # Stable ascending sort; missing dates go first so they are never picked as "latest".
        group = group.sort_values('RatingDate', kind='stable', na_position='first')
        default = group[group['defaultflag'] == 1]
        candidates = default if len(default) > 0 else group
        keep.append(candidates.iloc[-1:])

    # Rows whose customer id occurs only once need no resolution.
    df_non_duplicated = cus_retail3_df[~is_duplicated]
    df_final = pd.concat([df_non_duplicated, *keep]).sort_values(by='RatingDate')
    return df_final

def change_criteria(df_com ):
    """Derive ``adjCompositeScore`` and re-grade it with the current criteria.

    Works on a copy with a fresh 0..n-1 index. ``adjCompositeScore`` is
    ``CompositeScore`` where that is positive, or the sum of
    ``BusinessScore``, ``IndustryScore`` and ``adjFinancialScore`` where
    ``CompositeScore`` is exactly 0. Adjusted scores above 100 are divided
    by 10. The frame is then passed to ``composite_criteria_change``.

    Returns:
        The DataFrame returned by ``composite_criteria_change``.
    """
    df_com = df_com.reset_index(drop=True)

    composite = df_com['CompositeScore']
    has_composite = composite > 0
    zero_composite = composite == 0
    component_total = (
        df_com["BusinessScore"] + df_com["IndustryScore"] + df_com["adjFinancialScore"]
    )
    df_com.loc[has_composite, 'adjCompositeScore'] = composite
    df_com.loc[zero_composite, 'adjCompositeScore'] = component_total

    # Scores recorded on the old scale are divided by 10.
    over_scale = df_com['adjCompositeScore'] > 100
    df_com.loc[over_scale, 'adjCompositeScore'] = df_com['adjCompositeScore'] / 10

    # Apply the new grading criteria.
    return composite_criteria_change(df_com)

def composite_criteria_change(df):
    """Assign ``adjCompositeRate`` grades from ``adjCompositeScore``.

    Grades run 'A', 'B1', 'B2', 'B3', 'B4', 'C1', 'C2', 'C3' and are
    determined per ``RequestID_Char`` portfolio ('C', 'M', 'R', 'P') by a
    descending list of lower cut-offs. Every band is half-open:
    ``cutoff <= score < next_higher_cutoff``; scores below the lowest
    cut-off get 'C3'.

    Previously the 'M' bands for C1 and C2 used an inclusive upper bound,
    so a score exactly equal to 81.214 or 77.437 was overwritten with the
    lower grade; all portfolios now use the same half-open rule.

    Rows with a missing score or an unlisted portfolio keep ``None``.

    Args:
        df: DataFrame with ``RequestID_Char`` and ``adjCompositeScore``;
            modified in place.

    Returns:
        The same DataFrame with the ``adjCompositeRate`` column (re)written.
    """
    grades = ('A', 'B1', 'B2', 'B3', 'B4', 'C1', 'C2', 'C3')
    # Lower cut-offs for grades A..C2, highest first; below the last one is C3.
    cutoffs_by_portfolio = {
        'C': (92, 89, 86, 81, 78, 74, 70),
        'M': (96.323, 92.546, 88.769, 84.991, 81.214, 77.437, 73.660),
        'R': (93, 86, 80, 72, 61, 45, 35),
        'P': (83, 78, 70, 65, 61, 58, 52),
    }

    df['adjCompositeRate'] = None
    score = df['adjCompositeScore']

    for portfolio, cutoffs in cutoffs_by_portfolio.items():
        in_portfolio = df['RequestID_Char'] == portfolio
        upper = None
        for cutoff, grade in zip(cutoffs, grades):
            band = in_portfolio & (score >= cutoff)
            if upper is not None:
                band = band & (score < upper)
            df.loc[band, 'adjCompositeRate'] = grade
            upper = cutoff
        df.loc[in_portfolio & (score < cutoffs[-1]), 'adjCompositeRate'] = grades[-1]

    return df


# import data

In [None]:
# file 1: composite score results (Dec 2022 - Mar 2024 extract)
Dec22_Mar24="../data/processed/01_master_data/ScoreResult_Composite_Dec22_Mar24.parquet"
Dec22_Mar24_df=pd.read_parquet(Dec22_Mar24)
# keep only requests whose RequestID starts with 'R'
retail_Dec22_Mar24_df=Dec22_Mar24_df[Dec22_Mar24_df['RequestID'].str[0]=='R']
# NOTE(review): this assigns to a filtered slice; the chained-assignment warning is silenced at the top of the file.
retail_Dec22_Mar24_df['RatingY']=retail_Dec22_Mar24_df['RatingDate'].dt.year
# keep ratings dated 2023-03-01 or later
retail_Mar23_Mar24_df= retail_Dec22_Mar24_df[retail_Dec22_Mar24_df['RatingDate']>='2023-03-01']


# NPL list file
npl_directory="../data/raw/NPLs"
npl=import_npl(npl_directory)
# fix the misspelled source column name
npl.rename(columns={'CusomterName':'CustomerName'},inplace=True)


# file 2: credit scoring Excel export; id/date columns are read as text so leading zeros and the Thai-era dates are preserved
credit_scoring="../data/raw/Credit Scoring_21022566.xlsx"
df_credit_retail=pd.read_excel(credit_scoring,converters={'รหัสลูกค้า':str,'เลขทะเบียนนิติบุคคล':str,'รหัสเอกสาร':str,'วันที่-เวลา':str})
# rename the Thai column headers to the names used by the rest of the pipeline
df_credit_retail.rename(columns={'รหัสลูกค้า':'CustomerRefID','ชื่อบริษัท':'CustomerName','วันที่-เวลา':'RatingDate_TH'},inplace=True)
# left-pad customer ids with zeros to 7 characters (missing ids become '0000000')
df_credit_retail['CustomerRefID'] = df_credit_retail['CustomerRefID'].fillna('').astype(str).apply(lambda x: x.zfill(7))
# map the single-letter score columns to descriptive names
df_credit_retail.rename(columns={'F':'adjFinancialScore','B':'BusinessScore','I':'IndustryScore','S':'CompositeScore','R':'CompositeRate'},inplace=True)
# convert the Buddhist-era date text to a Gregorian datetime and derive the rating year
df_credit_retail['RatingDate'] = df_credit_retail['RatingDate_TH'].apply(adjust_thai_year)
df_credit_retail['RatingDate'] =pd.to_datetime(df_credit_retail['RatingDate']) 
df_credit_retail['RatingYear']=df_credit_retail['RatingDate'].dt.year

# run python

In [None]:
# file 1: attach NPL months per customer, then build the request-level table
merge_df=retail_Mar23_Mar24_df.merge(npl[['CustomerRefID','NPL month']],on='CustomerRefID',how='left')
requestID_df=request_prepare_RequestID(merge_df,'s')
requestID_df['RequestID_Char']='R'
customer_df=change_to_customer_level(requestID_df)

# file 1, customer level: split into customers with and without an NPL month
npl_cus_df=customer_df[customer_df['NPL month'].notnull()]
not_npl_cus_df=customer_df[customer_df['NPL month'].isnull()]
# for non-NPL customers keep only rows whose WorkFlowStepName is 'Complete' or 'Final Rating'; NPL customers are kept regardless
not_npl_cus__filter_df=not_npl_cus_df[(not_npl_cus_df['WorkFlowStepName']== 'Complete') | (not_npl_cus_df['WorkFlowStepName']== 'Final Rating') ].copy()
cus_retail_df=pd.concat([not_npl_cus__filter_df,npl_cus_df])

# file 2: attach NPL months per customer, then build the prepared table
df_credit_retail1=df_credit_retail.merge(npl[['CustomerRefID','NPL month']],on='CustomerRefID',how='left')
df_credit_retail2=request_prepare(df_credit_retail1)
df_credit_retail2['RequestID_Char']='R'

# file 2, customer level: split by defaultflag
df_credit_retail2_filter=change_to_customer_level(df_credit_retail2)
non_npl_retail2_df=df_credit_retail2_filter[df_credit_retail2_filter['defaultflag']!=1]
npl_retail2_df=df_credit_retail2_filter[df_credit_retail2_filter['defaultflag']==1]
# for non-default customers keep only rows whose 'สถานะ' (status) column equals 'อนุมัติ' (approved); defaulted customers are kept regardless
non_npl_retail2_filter_df=non_npl_retail2_df[(non_npl_retail2_df['สถานะ']== 'อนุมัติ')  ].copy()
cus_retail2_df=pd.concat([non_npl_retail2_filter_df,npl_retail2_df])
cus_retail2_df['RatingY']=cus_retail2_df['RatingYear']

# combine the two files and resolve customers that appear in both
cus_retail3_df=pd.concat([cus_retail_df,cus_retail2_df])
df_final=remove_duplicated(cus_retail3_df)

# Export file
df_final['Portfolio']='R'
df_final=change_criteria(df_final)
# NOTE(review): change_criteria already calls composite_criteria_change, so this call recomputes the same column.
df_final=composite_criteria_change(df_final)
df_final.rename(columns={'flag':'ExistingDefaultFlag','flag12':'DefaultFlag12M'},inplace=True)
# helper / leftover columns removed before export; drop() raises if any of them is absent
co_drop=['Unnamed: 48','#',	'nextRatingdate','will npl','month_diff','RequestID_Char','No']
df_final2=df_final.drop(columns=co_drop)
file_path1="../data/processed/02_data_sampling/Retail_202005_202403.csv"
file_path2="../data/processed/02_data_sampling/Retail_202005_202403.parquet"
# utf-8-sig writes a BOM at the start of the CSV
df_final2.to_csv(file_path1, index=False, encoding='utf-8-sig')
df_final2.to_parquet(file_path2)
print('Finish')