In [None]:
# Part 1: Load LOS Data and Preprocess

import pandas as pd
import numpy as np
import sqlalchemy
MAX_SEQUENCE_LENGTH=8
TWO_CLASS_STATUS_MAP = {0: 'Paid', 1: 'Not Paid'}

In [2]:
# Database connection details
db_username = 'ml_db'
db_password = 'pass%401234'
db_host = '10.192.5.43'
db_port = '5432'
db_name = 'postgres'

In [3]:
# Create a connection string and engine
conn_string = f"postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = sqlalchemy.create_engine(conn_string)

In [4]:
try:
    print("--- Loading LOS Data from PostgreSQL ---")
    query = """
    SELECT *
    FROM "Ashirvad"
    WHERE "LOAN_DATE" BETWEEN '2024-01-01' AND '2024-12-31';
    """
    df_los = pd.read_sql_query(query, con=engine)
    print(f"LOS Data loaded. Total shape: {df_los.shape}")
except Exception as e:
    print(f"Database connection or query failed: {e}")
    df_los = pd.DataFrame()

--- Loading LOS Data from PostgreSQL ---
LOS Data loaded. Total shape: (1111834, 52)


In [5]:
df_los

Unnamed: 0,CUSTOMER_ID,CUSTOMER_NAME,LOAN_ID,DATE_OF_BIRTH,BRANCH_ID,BRANCH_NAME,TEMP_CUST_ID,PHONE1,PHONE2,HOUSE_NAME,...,APPLICATION_ID,LOAN_PURPOSE,CUSTOMER_GRADING_SCORE,TENURE_in_months,emi_paid,loan_paid_percentage,NPA_STATUS_UPDATED,NPA_STATUS_UPDATED_1,NPA_STATUS_UPDATED_2,YEAR
0,31660008040,RAKHI,1660UNSL010135,2003-01-01,1660,KHAIR,5778063,8.979595e+09,6457854515.0,SHIKARBAR MOHALLA,...,29067536,Agri - Dairy,43.0,24,8.0,33.33,UNKNOWN,UNKNOWN,UNKNOWN,2024
1,32841006775,MINATI JENA,2841UNSL003441,1995-01-01,2841,BASTA BRANCH,5860101,9.970539e+09,9970538839.0,JENA HOUSE,...,29169997,Agri - Agriculture,40.0,24,8.0,33.33,UNKNOWN,UNKNOWN,UNKNOWN,2024
2,32568003134,KUNI BEHERA,2568UNSL001544,1992-01-01,2568,NIRAKARPUR-GL,8709941,6.371762e+09,6371762480.0,KALAMATI,...,27960878,Agri - Agriculture,46.0,24,9.0,37.50,UNKNOWN,UNKNOWN,UNKNOWN,2024
3,1313000611,GODAWARI BAI,1313UNSL003113,1992-01-01,1313,TAKHATPUR,154489,8.817430e+09,8817430092.0,AMORA 54,...,29906362,Agri - Agriculture,76.0,36,7.0,19.44,UNKNOWN,UNKNOWN,UNKNOWN,2024
4,32509003508,PINKI BARIK,2509UNSL001915,1981-01-01,2509,ASTARANGA-GL,8869146,9.556289e+09,9556288648.0,ATRAULAPATANA NAYAK SAHI,...,29458988,Agri - Agriculture,31.0,24,7.0,29.17,UNKNOWN,UNKNOWN,UNKNOWN,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111829,32570004664,MINA MURAH,2570UNSL003140,1969-01-01,2570,MORAN-GL,8560965,9.957155e+09,9957154502.0,,...,26225208,Petty Shop,77.0,24,10.0,41.67,UNKNOWN,UNKNOWN,UNKNOWN,2024
1111830,32007008871,JEUTI DAS,2007UNSL003248,1986-01-01,2007,MIRZA,7976059,9.957497e+09,9957496741.0,,...,21882270,Agri - Agriculture,34.0,24,13.0,,BAD,BAD,BAD,2024
1111831,32523002808,RUPA MURMU,2523UNSL002693,1980-01-01,2523,DHANIAKHALI-GL,8155349,6.297364e+09,6297364131.0,,...,23981017,Agri - Agriculture,23.0,24,12.0,,BAD,BAD,BAD,2024
1111832,32512002679,RADHIKA GHORAI,2512UNSL001062,1985-01-01,2512,EGRA-GL,8923944,7.718757e+09,7718756628.0,,...,30007642,Agri - Agriculture,63.0,24,7.0,29.17,UNKNOWN,UNKNOWN,UNKNOWN,2024


In [6]:
df_los[df_los.duplicated()]

Unnamed: 0,CUSTOMER_ID,CUSTOMER_NAME,LOAN_ID,DATE_OF_BIRTH,BRANCH_ID,BRANCH_NAME,TEMP_CUST_ID,PHONE1,PHONE2,HOUSE_NAME,...,APPLICATION_ID,LOAN_PURPOSE,CUSTOMER_GRADING_SCORE,TENURE_in_months,emi_paid,loan_paid_percentage,NPA_STATUS_UPDATED,NPA_STATUS_UPDATED_1,NPA_STATUS_UPDATED_2,YEAR


In [7]:
df_los.columns

Index(['CUSTOMER_ID', 'CUSTOMER_NAME', 'LOAN_ID', 'DATE_OF_BIRTH', 'BRANCH_ID',
       'BRANCH_NAME', 'TEMP_CUST_ID', 'PHONE1', 'PHONE2', 'HOUSE_NAME',
       'LOCALITY', 'STREET', 'ALT_HOUSE_NAME', 'ALT_LOCALITY', 'ALT_STREET',
       'CENTER_ID', 'CENTER_NAME', 'PIN_CODE', 'ALT_PIN_CODE', 'TOTAL_INCOME',
       'TOTAL_EXPENSE', 'CUSTOMER_FLAG', 'MARITAL_STATUS',
       'MARITAL_STATUS_NAME', 'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE',
       'INTEREST_RATE', 'LOAN_STATUS', 'LOAN_STATUS_DESC', 'CLS_DT',
       'DISBURSED_AMOUNT', 'CYCLE', 'STATE_NAME', 'CIBIL_ID', 'CIBIL_SCORE',
       'NPA_FLAG', 'NPA_FROM_DATE', 'NPA_TO_DATE', 'NPA_STATUS',
       'OCCUPATION_ID', 'OCCUPATION_NAME', 'APPLICATION_ID', 'LOAN_PURPOSE',
       'CUSTOMER_GRADING_SCORE', 'TENURE_in_months', 'emi_paid',
       'loan_paid_percentage', 'NPA_STATUS_UPDATED', 'NPA_STATUS_UPDATED_1',
       'NPA_STATUS_UPDATED_2', 'YEAR'],
      dtype='object')

In [8]:
df_los.dtypes

CUSTOMER_ID                        int64
CUSTOMER_NAME                     object
LOAN_ID                           object
DATE_OF_BIRTH             datetime64[ns]
BRANCH_ID                          int64
BRANCH_NAME                       object
TEMP_CUST_ID                       int64
PHONE1                           float64
PHONE2                            object
HOUSE_NAME                        object
LOCALITY                          object
STREET                            object
ALT_HOUSE_NAME                    object
ALT_LOCALITY                      object
ALT_STREET                        object
CENTER_ID                          int64
CENTER_NAME                       object
PIN_CODE                         float64
ALT_PIN_CODE                     float64
TOTAL_INCOME                     float64
TOTAL_EXPENSE                    float64
CUSTOMER_FLAG                     object
MARITAL_STATUS                     int64
MARITAL_STATUS_NAME               object
LOAN_DATE       

In [9]:
df_los['PIN_CODE'] = df_los['PIN_CODE'].astype(str)

In [10]:
df_los.dtypes

CUSTOMER_ID                        int64
CUSTOMER_NAME                     object
LOAN_ID                           object
DATE_OF_BIRTH             datetime64[ns]
BRANCH_ID                          int64
BRANCH_NAME                       object
TEMP_CUST_ID                       int64
PHONE1                           float64
PHONE2                            object
HOUSE_NAME                        object
LOCALITY                          object
STREET                            object
ALT_HOUSE_NAME                    object
ALT_LOCALITY                      object
ALT_STREET                        object
CENTER_ID                          int64
CENTER_NAME                       object
PIN_CODE                          object
ALT_PIN_CODE                     float64
TOTAL_INCOME                     float64
TOTAL_EXPENSE                    float64
CUSTOMER_FLAG                     object
MARITAL_STATUS                     int64
MARITAL_STATUS_NAME               object
LOAN_DATE       

In [11]:
columns_to_drop = ['CUSTOMER_ID','CUSTOMER_NAME','BRANCH_ID','TEMP_CUST_ID','PHONE1', 'PHONE2','HOUSE_NAME','LOCALITY', 'STREET',
                   'ALT_HOUSE_NAME', 'ALT_LOCALITY','ALT_STREET','CENTER_ID','ALT_PIN_CODE','MARITAL_STATUS','LOAN_STATUS',
                   'LOAN_STATUS_DESC', 'CLS_DT','CIBIL_ID','NPA_FLAG', 'NPA_FROM_DATE', 'NPA_TO_DATE', 'NPA_STATUS',
                   'OCCUPATION_ID','APPLICATION_ID', 'TENURE_in_months', 'emi_paid','loan_paid_percentage', 'NPA_STATUS_UPDATED', 
                   'NPA_STATUS_UPDATED_1','NPA_STATUS_UPDATED_2','YEAR']

In [12]:
df_los.drop(columns=columns_to_drop, inplace=True)

In [13]:
df_los.columns

Index(['LOAN_ID', 'DATE_OF_BIRTH', 'BRANCH_NAME', 'CENTER_NAME', 'PIN_CODE',
       'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG', 'MARITAL_STATUS_NAME',
       'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE', 'INTEREST_RATE',
       'DISBURSED_AMOUNT', 'CYCLE', 'STATE_NAME', 'CIBIL_SCORE',
       'OCCUPATION_NAME', 'LOAN_PURPOSE', 'CUSTOMER_GRADING_SCORE'],
      dtype='object')

In [17]:
df_los.drop(columns=['CIBIL_SCORE','CUSTOMER_GRADING_SCORE'],inplace=True)

In [18]:
df_los.columns

Index(['LOAN_ID', 'DATE_OF_BIRTH', 'BRANCH_NAME', 'CENTER_NAME', 'PIN_CODE',
       'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG', 'MARITAL_STATUS_NAME',
       'LOAN_DATE', 'LOAN_AMOUNT', 'TENURE', 'INTEREST_RATE',
       'DISBURSED_AMOUNT', 'CYCLE', 'STATE_NAME', 'OCCUPATION_NAME',
       'LOAN_PURPOSE'],
      dtype='object')

In [19]:
df_los.shape

(1111834, 18)

In [20]:
from datetime import datetime
df_los['DATE_OF_BIRTH'] = pd.to_datetime(df_los['DATE_OF_BIRTH'], errors='coerce')
df_los['LOAN_DATE'] = pd.to_datetime(df_los['LOAN_DATE'], errors='coerce')

In [21]:
df_los['AGE'] = (df_los['LOAN_DATE'] - df_los['DATE_OF_BIRTH']).dt.days // 365.25

In [22]:
#df_los.drop(columns=['DATE_OF_BIRTH', 'LOAN_DATE'], inplace=True)
df_los.drop(columns=['DATE_OF_BIRTH'], inplace=True)

In [23]:
df_los.columns

Index(['LOAN_ID', 'BRANCH_NAME', 'CENTER_NAME', 'PIN_CODE', 'TOTAL_INCOME',
       'TOTAL_EXPENSE', 'CUSTOMER_FLAG', 'MARITAL_STATUS_NAME', 'LOAN_DATE',
       'LOAN_AMOUNT', 'TENURE', 'INTEREST_RATE', 'DISBURSED_AMOUNT', 'CYCLE',
       'STATE_NAME', 'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE'],
      dtype='object')

In [24]:
df_los.shape

(1111834, 18)

In [26]:
df_los.columns

Index(['LOAN_ID', 'BRANCH_NAME', 'CENTER_NAME', 'PIN_CODE', 'TOTAL_INCOME',
       'TOTAL_EXPENSE', 'CUSTOMER_FLAG', 'MARITAL_STATUS_NAME', 'LOAN_DATE',
       'LOAN_AMOUNT', 'TENURE', 'INTEREST_RATE', 'DISBURSED_AMOUNT', 'CYCLE',
       'STATE_NAME', 'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE'],
      dtype='object')

In [27]:
df_los = df_los[df_los['CUSTOMER_FLAG'] != 'X'].copy()

In [28]:
df_los.dtypes

LOAN_ID                        object
BRANCH_NAME                    object
CENTER_NAME                    object
PIN_CODE                       object
TOTAL_INCOME                  float64
TOTAL_EXPENSE                 float64
CUSTOMER_FLAG                  object
MARITAL_STATUS_NAME            object
LOAN_DATE              datetime64[ns]
LOAN_AMOUNT                     int64
TENURE                          int64
INTEREST_RATE                 float64
DISBURSED_AMOUNT                int64
CYCLE                           int64
STATE_NAME                     object
OCCUPATION_NAME                object
LOAN_PURPOSE                   object
AGE                           float64
dtype: object

In [29]:
df_los.drop(['BRANCH_NAME', 'CENTER_NAME'],axis=1,inplace=True)

In [32]:
df_los.isna().sum()

LOAN_ID                 0
PIN_CODE                0
TOTAL_INCOME            0
TOTAL_EXPENSE          27
CUSTOMER_FLAG           0
MARITAL_STATUS_NAME     0
LOAN_DATE               0
LOAN_AMOUNT             0
TENURE                  0
INTEREST_RATE           0
DISBURSED_AMOUNT        0
CYCLE                   0
STATE_NAME              0
OCCUPATION_NAME        14
LOAN_PURPOSE            0
AGE                     0
dtype: int64

In [None]:
df_los["TOTAL_INCOME"]= df_los["TOTAL_INCOME"].fillna(20000.0)

In [33]:
df_los["TOTAL_EXPENSE"]=df_los["TOTAL_EXPENSE"].fillna(1760.0)

In [34]:
df_los["OCCUPATION_NAME"]=df_los["OCCUPATION_NAME"].fillna("UNKNOWN")

In [35]:
df_los['MARITAL_STATUS_NAME'] = df_los['MARITAL_STATUS_NAME'].replace(['UNMARRIED', 'SINGLE'], 'UNMARRIED/SINGLE')

In [36]:
df_los.isna().sum()

LOAN_ID                0
PIN_CODE               0
TOTAL_INCOME           0
TOTAL_EXPENSE          0
CUSTOMER_FLAG          0
MARITAL_STATUS_NAME    0
LOAN_DATE              0
LOAN_AMOUNT            0
TENURE                 0
INTEREST_RATE          0
DISBURSED_AMOUNT       0
CYCLE                  0
STATE_NAME             0
OCCUPATION_NAME        0
LOAN_PURPOSE           0
AGE                    0
dtype: int64

In [40]:
import os
from sklearn.preprocessing import StandardScaler
import joblib
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
NEW_LMS_FOLDER_PATH = "TestData_Oct2025prediction"

In [None]:
import glob
all_lms_dfs = []
lms_files = sorted(glob.glob(os.path.join(NEW_LMS_FOLDER_PATH, 'test_*.xlsx')))
    
if not lms_files:
    raise FileNotFoundError(f"No Excel files found matching 'test_*.xlsx' in {NEW_LMS_FOLDER_PATH}.")
    
print(f"--- Loading {len(lms_files)} LMS Excel file(s) ---")
for file_name in lms_files:
    try:
        df = pd.read_excel(file_name)
        all_lms_dfs.append(df)
        print(f"   ✅ Loaded: {os.path.basename(file_name)}")
    except Exception as e:
        print(f"   ❌ Error loading {os.path.basename(file_name)}: {e}")
            
df_lms = pd.concat(all_lms_dfs, ignore_index=True)

--- Loading 24 LMS Excel file(s) ---
   ✅ Loaded: test_001.xlsx
   ✅ Loaded: test_002.xlsx
   ✅ Loaded: test_003.xlsx
   ✅ Loaded: test_004.xlsx
   ✅ Loaded: test_005.xlsx
   ✅ Loaded: test_006.xlsx
   ✅ Loaded: test_007.xlsx
   ✅ Loaded: test_008.xlsx
   ✅ Loaded: test_009.xlsx
   ✅ Loaded: test_010.xlsx
   ✅ Loaded: test_011.xlsx
   ✅ Loaded: test_012.xlsx
   ✅ Loaded: test_013.xlsx
   ✅ Loaded: test_014.xlsx
   ✅ Loaded: test_015.xlsx
   ✅ Loaded: test_016.xlsx
   ✅ Loaded: test_017.xlsx
   ✅ Loaded: test_018.xlsx
   ✅ Loaded: test_019.xlsx
   ✅ Loaded: test_020.xlsx
   ✅ Loaded: test_021.xlsx
   ✅ Loaded: test_022.xlsx
   ✅ Loaded: test_023.xlsx
   ✅ Loaded: test_024.xlsx


NameError: name 'optimize_df_memory' is not defined

In [43]:
df_lms['DUE_DATE'] = pd.to_datetime(df_lms['DUE_DATE'], errors='coerce')
df_lms['PAID_DT'] = pd.to_datetime(df_lms['PAID_DT'], errors='coerce')
df_lms.loc[df_lms['STATUS'] == 1, 'PAID_DT'] = pd.NaT 
df_lms = df_lms.sort_values(by=['LOAN_ID', 'INSTALLMENT_NO']).reset_index(drop=True)

In [44]:
df_lms['DAYS_LATE'] = (df_lms['PAID_DT'] - df_lms['DUE_DATE']).dt.days
df_lms['DAYS_BETWEEN_DUE_DATES'] = df_lms.groupby('LOAN_ID')['DUE_DATE'].diff().dt.days

In [45]:
df_lms['REPAYMENT_SCHEDULE_CAT'] = np.select(
    [
        df_lms['DAYS_BETWEEN_DUE_DATES'].isnull(), # Condition for the first installment
        df_lms['DAYS_BETWEEN_DUE_DATES'].isin([28, 29, 30, 31]),
        df_lms['DAYS_BETWEEN_DUE_DATES'] == 7,
        df_lms['DAYS_BETWEEN_DUE_DATES'] == 14,
        df_lms['DAYS_BETWEEN_DUE_DATES'] == 56
    ],
    [
        'Initial', # Category for the first installment
        'Monthly',
        'Weekly',
        'Bi-Weekly',
        'Bi-Monthly'
    ],
    default='Other' # Assigns 'Other' to any values not specified
)

In [46]:
has_weekly = df_lms.groupby('LOAN_ID')['REPAYMENT_SCHEDULE_CAT'].transform(
    lambda x: 'Weekly' in x.unique()
)
has_monthly = df_lms.groupby('LOAN_ID')['REPAYMENT_SCHEDULE_CAT'].transform(
    lambda x: 'Monthly' in x.unique()
)

is_hybrid = (has_weekly) & (has_monthly)

mode_schedule = df_lms.groupby('LOAN_ID')['REPAYMENT_SCHEDULE_CAT'].transform(
    lambda x: x.mode()[0] if not x.mode().empty else 'Initial'
)

df_lms['LOAN_SCHEDULE_TYPE'] = np.where(
    is_hybrid,
    'Hybrid',
    mode_schedule
)

print("Vectorized computation complete. Final distribution:")
print(df_lms['LOAN_SCHEDULE_TYPE'].value_counts())

Vectorized computation complete. Final distribution:
LOAN_SCHEDULE_TYPE
Monthly    18329364
Hybrid       525358
Weekly          819
Initial           6
Name: count, dtype: int64


In [47]:
df_lms['IS_UNPAID'] = np.where(df_lms['STATUS'] == 1, 1, 0)
df_lms['IS_DAYS_LATE_MISSING'] = df_lms['DAYS_LATE'].isna().astype(int)
df_lms['DAYS_LATE'] = df_lms['DAYS_LATE'].fillna(0)
df_lms['PAID_RATIO'] = df_lms['PAID_AMOUNT'] / df_lms['INSTALLMENT_AMOUNT']
df_lms['PAID_RATIO'] = df_lms['PAID_RATIO'].clip(upper=1.0)
df_lms['DELTA_DAYS_LATE'] = df_lms.groupby('LOAN_ID')['DAYS_LATE'].diff().fillna(0)
#df_lms.loc[df_lms['IS_UNPAID'] == 1, 'DELTA_DAYS_LATE'] = 0 

In [48]:
GRACE_PERIOD_DAYS = 2 
PAID_PERCENTAGE_THRESHOLD = 0.90
conditions_behavior = [
    (df_lms['STATUS'] == 1) | 
    ((df_lms['STATUS'] == 0) & (df_lms['DAYS_LATE'] > GRACE_PERIOD_DAYS)) | 
    ((df_lms['STATUS'] == 2) & (
        (df_lms['DAYS_LATE'] > GRACE_PERIOD_DAYS) | 
        (df_lms['PAID_AMOUNT'] / df_lms['INSTALLMENT_AMOUNT'] < PAID_PERCENTAGE_THRESHOLD)
    )),
    (df_lms['STATUS'] == 0) & (df_lms['DAYS_LATE'] <= GRACE_PERIOD_DAYS),
    (df_lms['STATUS'] == 2) & (df_lms['DAYS_LATE'] <= GRACE_PERIOD_DAYS) & 
    (df_lms['PAID_AMOUNT'] / df_lms['INSTALLMENT_AMOUNT'] >= PAID_PERCENTAGE_THRESHOLD)
]
choices_behavior = [1, 0, 0]
df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] = np.select(conditions_behavior, choices_behavior, default=-1).astype(np.int8)

In [49]:
df_lms['REMAINING_EMI_RATIO'] = 1 - df_lms["PAID_RATIO"]
df_lms['COMPOSITE_RISK'] = df_lms['DAYS_LATE'] + (df_lms['REMAINING_EMI_RATIO'] * 10)
conditions_score = [
    df_lms['IS_UNPAID'] == 1,
    (df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] == 1) & (df_lms['IS_UNPAID'] == 0),
    (df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] == 0) & (df_lms['DAYS_LATE'] > 0) & 
    (df_lms['DAYS_LATE'] <= GRACE_PERIOD_DAYS) & (df_lms['IS_UNPAID'] == 0),
    (df_lms['CURRENT_EMI_BEHAVIOR_LABEL'] == 0) & (df_lms['DAYS_LATE'] <= 0) & 
    (df_lms['IS_UNPAID'] == 0)
]
    
choices_score = [
    -100,
    np.maximum(0.0, np.minimum(0.30, 0.30 - (df_lms['COMPOSITE_RISK'] * 0.03))),
    1.0 / (1 + df_lms['COMPOSITE_RISK']),
    1.5 + (np.abs(df_lms['DAYS_LATE']) / 10)
]
df_lms['PAYMENT_SCORE'] = np.select(conditions_score, choices_score, default=0).astype(np.float32)

In [50]:
rank_choices = [4, 3, 2, 1]
df_lms['PAYMENT_SCORE_RANK'] = np.select(conditions_score, rank_choices).astype(np.int8)

In [51]:
# --- VALIDATION MODE CONSTANT ---
TARGET_BACKTEST_EMI = 1 
actual_targets = df_lms.groupby('LOAN_ID').tail(TARGET_BACKTEST_EMI).copy()
actual_targets = actual_targets[['LOAN_ID', 'CURRENT_EMI_BEHAVIOR_LABEL']].rename(
    columns={'CURRENT_EMI_BEHAVIOR_LABEL': 'ACTUAL_NEXT_EMI_ISSUE'}
)

In [52]:
df_lms

Unnamed: 0,LOAN_ID,INSTALLMENT_NO,DUE_DATE,INSTALLMENT_AMOUNT,INTEREST_AMOUNT,STATUS,PAID_DT,LOAN_DATE,LOAN_AMOUNT,TENURE,...,LOAN_SCHEDULE_TYPE,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL,REMAINING_EMI_RATIO,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK
0,1001UNSL004655,1,2024-02-27,2010,1507,0,2024-02-27 09:18:56,2024-01-04,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
1,1001UNSL004655,2,2024-03-26,2010,757,0,2024-03-26 10:46:21,2024-01-04,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
2,1001UNSL004655,3,2024-04-23,2010,733,0,2024-04-23 11:54:33,2024-01-04,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
3,1001UNSL004655,4,2024-05-21,2010,709,0,2024-05-21 19:34:59,2024-01-04,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
4,1001UNSL004655,5,2024-06-18,2010,684,0,2024-06-18 12:07:14,2024-01-04,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18855542,4415UNSL000212,9,2025-07-10,1990,547,0,2025-07-10 07:55:28,2024-10-18,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
18855543,4415UNSL000212,10,2025-08-07,1990,521,0,2025-08-07 08:58:05,2024-10-18,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
18855544,4415UNSL000212,11,2025-09-04,1990,494,0,2025-09-04 08:26:08,2024-10-18,40000,2,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
18855545,4415UNSL000212,12,2025-10-02,1990,466,0,2025-09-29 16:33:23,2024-10-18,40000,2,...,Monthly,0,0,1.0,-3.0,0,0.0,-3.0,1.8,1


In [53]:
actual_targets

Unnamed: 0,LOAN_ID,ACTUAL_NEXT_EMI_ISSUE
21,1001UNSL004655,1
43,1001UNSL004656,1
65,1001UNSL004657,1
87,1001UNSL004658,0
109,1001UNSL004659,0
...,...,...
18855494,4415UNSL000206,0
18855507,4415UNSL000207,0
18855520,4415UNSL000208,0
18855533,4415UNSL000211,0


In [54]:
RENAME_MAP = {
    'LOAN_AMOUNT': 'LOAN_AMOUNT_LMS',
    'TENURE': 'TENURE_LMS',
    'INTEREST_RATE': 'INTEREST_RATE_LMS'
}

df_lms.rename(columns=RENAME_MAP, inplace=True)

In [55]:
df_lms.shape

(18855547, 26)

In [56]:
df_lms.columns

Index(['LOAN_ID', 'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT',
       'INTEREST_AMOUNT', 'STATUS', 'PAID_DT', 'LOAN_DATE', 'LOAN_AMOUNT_LMS',
       'TENURE_LMS', 'INTEREST_RATE_LMS', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT',
       'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'REMAINING_EMI_RATIO',
       'COMPOSITE_RISK', 'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK'],
      dtype='object')

In [57]:
df_lms_full, df_actual_targets = df_lms, actual_targets

In [None]:
indices_to_drop = df_lms_full.groupby('LOAN_ID').tail(TARGET_BACKTEST_EMI).index
df_lms = df_lms_full.drop(indices_to_drop).reset_index(drop=True)

In [58]:
df_actual_targets

Unnamed: 0,LOAN_ID,ACTUAL_NEXT_EMI_ISSUE
21,1001UNSL004655,1
43,1001UNSL004656,1
65,1001UNSL004657,1
87,1001UNSL004658,0
109,1001UNSL004659,0
...,...,...
18855494,4415UNSL000206,0
18855507,4415UNSL000207,0
18855520,4415UNSL000208,0
18855533,4415UNSL000211,0


In [61]:
indices_to_drop = df_lms_full.groupby('LOAN_ID').tail(TARGET_BACKTEST_EMI).index
df_lms = df_lms_full.drop(indices_to_drop).reset_index(drop=True)

In [63]:
print("df_lms_full shape : ",df_lms_full.shape)
print("df_lms  shape : ",df_lms.shape)

df_lms_full shape :  (18855547, 26)
df_lms  shape :  (17831866, 26)


In [64]:
18855547-17831866

1023681

In [65]:
df_lms_full["LOAN_ID"].nunique()

1023681

In [67]:
RENAME_MAP_LOS = {
    'LOAN_AMOUNT': 'LOAN_AMOUNT_STATIC', 
    'TENURE': 'TENURE_STATIC', 
    'INTEREST_RATE': 'INTEREST_RATE_STATIC'
}
df_los.rename(columns=RENAME_MAP_LOS, inplace=True)

In [68]:
df_combined = pd.merge(df_los, df_lms, on='LOAN_ID', how='inner', suffixes=('_static', '_lms'))

In [69]:
df_combined

Unnamed: 0,LOAN_ID,PIN_CODE,TOTAL_INCOME,TOTAL_EXPENSE,CUSTOMER_FLAG,MARITAL_STATUS_NAME,LOAN_DATE_static,LOAN_AMOUNT_STATIC,TENURE_STATIC,INTEREST_RATE_STATIC,...,LOAN_SCHEDULE_TYPE,IS_UNPAID,IS_DAYS_LATE_MISSING,PAID_RATIO,DELTA_DAYS_LATE,CURRENT_EMI_BEHAVIOR_LABEL,REMAINING_EMI_RATIO,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK
0,1660UNSL010135,202138.0,22500.0,1590.0,A,MARRIED,2024-05-22,60000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
1,1660UNSL010135,202138.0,22500.0,1590.0,A,MARRIED,2024-05-22,60000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
2,1660UNSL010135,202138.0,22500.0,1590.0,A,MARRIED,2024-05-22,60000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
3,1660UNSL010135,202138.0,22500.0,1590.0,A,MARRIED,2024-05-22,60000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
4,1660UNSL010135,202138.0,22500.0,1590.0,A,MARRIED,2024-05-22,60000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17557664,2512UNSL001062,721429.0,21000.0,780.0,A,MARRIED,2024-06-19,40000,2,25.0,...,Monthly,0,0,1.0,12.0,0,0.0,-16.0,3.1,1
17557665,2512UNSL001062,721429.0,21000.0,780.0,A,MARRIED,2024-06-19,40000,2,25.0,...,Monthly,0,0,1.0,16.0,0,0.0,0.0,1.5,1
17557666,2512UNSL001062,721429.0,21000.0,780.0,A,MARRIED,2024-06-19,40000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1
17557667,2512UNSL001062,721429.0,21000.0,780.0,A,MARRIED,2024-06-19,40000,2,25.0,...,Monthly,0,0,1.0,0.0,0,0.0,0.0,1.5,1


In [72]:
df_combined[df_combined["LOAN_ID"]=="2512UNSL001062"][["LOAN_ID","DAYS_LATE","DELTA_DAYS_LATE","COMPOSITE_RISK","PAYMENT_SCORE","PAYMENT_SCORE_RANK"]]

Unnamed: 0,LOAN_ID,DAYS_LATE,DELTA_DAYS_LATE,COMPOSITE_RISK,PAYMENT_SCORE,PAYMENT_SCORE_RANK
17557653,2512UNSL001062,0.0,0.0,0.0,1.5,1
17557654,2512UNSL001062,-28.0,-28.0,-28.0,4.3,1
17557655,2512UNSL001062,-28.0,0.0,-28.0,4.3,1
17557656,2512UNSL001062,-28.0,0.0,-28.0,4.3,1
17557657,2512UNSL001062,-28.0,0.0,-28.0,4.3,1
17557658,2512UNSL001062,-28.0,0.0,-28.0,4.3,1
17557659,2512UNSL001062,-28.0,0.0,-28.0,4.3,1
17557660,2512UNSL001062,-28.0,0.0,-28.0,4.3,1
17557661,2512UNSL001062,-28.0,0.0,-28.0,4.3,1
17557662,2512UNSL001062,-28.0,0.0,-28.0,4.3,1


In [73]:
df_combined.columns

Index(['LOAN_ID', 'PIN_CODE', 'TOTAL_INCOME', 'TOTAL_EXPENSE', 'CUSTOMER_FLAG',
       'MARITAL_STATUS_NAME', 'LOAN_DATE_static', 'LOAN_AMOUNT_STATIC',
       'TENURE_STATIC', 'INTEREST_RATE_STATIC', 'DISBURSED_AMOUNT', 'CYCLE',
       'STATE_NAME', 'OCCUPATION_NAME', 'LOAN_PURPOSE', 'AGE',
       'INSTALLMENT_NO', 'DUE_DATE', 'INSTALLMENT_AMOUNT', 'INTEREST_AMOUNT',
       'STATUS', 'PAID_DT', 'LOAN_DATE_lms', 'LOAN_AMOUNT_LMS', 'TENURE_LMS',
       'INTEREST_RATE_LMS', 'PRINCIPAL_AMOUNT', 'PAID_AMOUNT', 'DAYS_LATE',
       'DAYS_BETWEEN_DUE_DATES', 'REPAYMENT_SCHEDULE_CAT',
       'LOAN_SCHEDULE_TYPE', 'IS_UNPAID', 'IS_DAYS_LATE_MISSING', 'PAID_RATIO',
       'DELTA_DAYS_LATE', 'CURRENT_EMI_BEHAVIOR_LABEL', 'REMAINING_EMI_RATIO',
       'COMPOSITE_RISK', 'PAYMENT_SCORE', 'PAYMENT_SCORE_RANK'],
      dtype='object')

In [74]:
df_combined.isna().sum()

LOAN_ID                             0
PIN_CODE                            0
TOTAL_INCOME                        0
TOTAL_EXPENSE                       0
CUSTOMER_FLAG                       0
MARITAL_STATUS_NAME                 0
LOAN_DATE_static                    0
LOAN_AMOUNT_STATIC                  0
TENURE_STATIC                       0
INTEREST_RATE_STATIC                0
DISBURSED_AMOUNT                    0
CYCLE                               0
STATE_NAME                          0
OCCUPATION_NAME                     0
LOAN_PURPOSE                        0
AGE                                 0
INSTALLMENT_NO                      0
DUE_DATE                            0
INSTALLMENT_AMOUNT                  0
INTEREST_AMOUNT                     0
STATUS                              0
PAID_DT                       2506490
LOAN_DATE_lms                       0
LOAN_AMOUNT_LMS                     0
TENURE_LMS                          0
INTEREST_RATE_LMS                   0
PRINCIPAL_AM

In [75]:
df_combined.rename(columns={
    "LOAN_DATE_static": "LOAN_DATE", 
    "LOAN_AMOUNT_STATIC": "LOAN_AMOUNT", 
    "TENURE_STATIC": "TENURE", 
    "INTEREST_RATE_STATIC": "INTEREST_RATE"
}, inplace=True)

In [76]:
cols_to_drop = [
    c for c in ['LOAN_DATE_lms', 'LOAN_AMOUNT_lms', 'TENURE_lms', 'INTEREST_RATE_lms', 
                'DISBURSED_AMOUNT', 'PIN_CODE', 'IS_DAYS_LATE_MISSING', 'STATUS', 
                'PAID_DT', 'PAID_AMOUNT', 'REMAINING_EMI_RATIO', 'TENURE', 'INTEREST_RATE'] 
    if c in df_combined.columns
]
df_combined.drop(columns=cols_to_drop, errors='ignore', inplace=True)

In [77]:
df_combined["DAYS_BETWEEN_DUE_DATES"].fillna(0, inplace=True)

In [None]:
print("   Creating RECENT_PAYMENT_SCORE (rolling window feature)...")
ROLLING_WINDOW_SIZE = 3
df_combined['RECENT_PAYMENT_SCORE'] = df_combined.groupby('LOAN_ID')['PAYMENT_SCORE'].rolling(
    window=ROLLING_WINDOW_SIZE, min_periods=1
).mean().reset_index(level=0, drop=True).astype(np.float32)
df_combined['RECENT_PAYMENT_SCORE'] = df_combined.groupby('LOAN_ID')['RECENT_PAYMENT_SCORE'].shift(1)
#overall_mean_score = df_combined['PAYMENT_SCORE'].mean()
df_combined['RECENT_PAYMENT_SCORE'].fillna(-39.098625, inplace=True)

In [None]:
FLAG_ORDER = {'A': 4, 'B': 3, 'C': 2, 'D': 1}
#df_combined['CUSTOMER_FLAG_ENCODED'] = df_combined['CUSTOMER_FLAG'].astype(str).map(FLAG_ORDER).fillna(0).astype(np.int8)
df_combined['CUSTOMER_FLAG_ENCODED'] = df_combined['CUSTOMER_FLAG'].map(FLAG_ORDER).astype(np.int8)

In [None]:
category_mappings = joblib.load('embedding_category_mappings.pkl')

In [None]:
if category_mappings['occupation'] and category_mappings['purpose']:
    # Create reverse mappings (category -> code)
    occupation_to_code = {v: k+1 for k, v in category_mappings['occupation'].items()}
    purpose_to_code = {v: k+1 for k, v in category_mappings['purpose'].items()}
        
    # Apply encoding with training mappings, use 0 for unknown categories
    df_combined['OCCUPATION_NAME_ENCODED'] = df_combined['OCCUPATION_NAME'].astype(str).map(
        occupation_to_code).fillna(0).astype(np.int16)
    df_combined['LOAN_PURPOSE_ENCODED'] = df_combined['LOAN_PURPOSE'].astype(str).map(
        purpose_to_code).fillna(0).astype(np.int16)

In [None]:
df_combined.drop(columns=['CUSTOMER_FLAG', 'OCCUPATION_NAME', 'LOAN_PURPOSE', 'LOAN_DATE'], errors='ignore', inplace=True)

In [None]:
df_combined = pd.get_dummies(df_combined, columns=['REPAYMENT_SCHEDULE_CAT'], prefix='REPAYMENT_CAT')

In [78]:
# 8. Filter for the latest sequence for prediction (up to EMI N-1)
print("   Filtering to keep only necessary sequence (last MAX_SEQUENCE_LENGTH EMIs)...")
df_combined['SEQUENCE_COUNT'] = df_combined.groupby('LOAN_ID').cumcount() + 1
df_combined['REVERSE_SEQUENCE_COUNT'] = df_combined.groupby('LOAN_ID')['SEQUENCE_COUNT'].transform(
    'max') - df_combined['SEQUENCE_COUNT']
df_combined_filtered = df_combined[df_combined['REVERSE_SEQUENCE_COUNT'] < MAX_SEQUENCE_LENGTH].copy()
df_combined_filtered.drop(columns=['SEQUENCE_COUNT', 'REVERSE_SEQUENCE_COUNT'], inplace=True)

   Filtering to keep only necessary sequence (last MAX_SEQUENCE_LENGTH EMIs)...


KeyboardInterrupt: 

In [None]:
# 9. Merge Actual Target for Validation
df_combined_filtered = pd.merge(df_combined_filtered, df_actual_targets, on='LOAN_ID', how='left')
df_combined_filtered['ACTUAL_NEXT_EMI_ISSUE'] = df_combined_filtered.groupby(
    'LOAN_ID')['ACTUAL_NEXT_EMI_ISSUE'].transform('max')
df_combined_filtered.dropna(subset=['ACTUAL_NEXT_EMI_ISSUE'], inplace=True)

print(f"✅ Data preparation complete. Final shape: {df_combined_filtered.shape}")
print(f"   Unique loans: {df_combined_filtered['LOAN_ID'].nunique()}")

In [None]:
preprocessor = joblib.load('preprocessor.pkl')
model = load_model('hybrid_lstm_model.h5', compile=False) 

In [None]:
# FIX #5: Alignment of OHE Columns with Training
expected_features = list(preprocessor.feature_names_in_)
df_for_transform = df_combined_filtered.reindex(columns=expected_features, fill_value=0)

# 1. Define exactly what the model expects (from the joblib object)
expected_features = list(preprocessor.feature_names_in_)

# 2. Reindex: This is your safety net
# It aligns df_combined_filtered to match expected_features perfectly
df_aligned = df_combined_filtered.reindex(columns=expected_features, fill_value=0)

# 3. Transform
X_scaled_ohe = preprocessor.transform(df_aligned)

# 4. Reconstruct the DataFrame for the LSTM Reshaping
ALL_FINAL_COLS = list(preprocessor.get_feature_names_out())
X_predict_df = pd.DataFrame(X_scaled_ohe, columns=ALL_FINAL_COLS, index=df_combined_filtered.index)

# 5. Bring back IDs and Metadata for the final results table
X_predict_df['LOAN_ID'] = df_combined_filtered['LOAN_ID'].values
X_predict_df['ACTUAL_NEXT_EMI_ISSUE'] = df_combined_filtered['ACTUAL_NEXT_EMI_ISSUE'].values
for col in STATIC_EMBEDDING_COLS:
    X_predict_df[col] = df_combined_filtered[col].values

# training_ohe_cols = [c for c in preprocessor.feature_names_in_ if c.startswith('REPAYMENT_CAT_')]
# print(f"   Aligning OHE columns ({len(training_ohe_cols)} expected from training)...")
# for col in training_ohe_cols:
#     if col not in df_full.columns:
#         df_full[col] = 0
#         print(f"      Added missing column: {col}")

In [None]:
SEQUENTIAL_COLS_NUMERICAL = [
    'INSTALLMENT_NO', 'INSTALLMENT_AMOUNT', 'DAYS_LATE', 'DAYS_BETWEEN_DUE_DATES',
    'PAID_RATIO', 'DELTA_DAYS_LATE', 'PAYMENT_SCORE', 'COMPOSITE_RISK', 
    'RECENT_PAYMENT_SCORE', 'PAYMENT_SCORE_RANK', 'IS_UNPAID', 'CURRENT_EMI_BEHAVIOR_LABEL'
]

STATIC_COLS_NUMERICAL = ['TOTAL_INCOME', 'TOTAL_EXPENSE', 'LOAN_AMOUNT', 'AGE', 'CYCLE']
STATIC_COLS_OHE = ['MARITAL_STATUS_NAME', 'STATE_NAME', 'LOAN_SCHEDULE_TYPE']
STATIC_EMBEDDING_COLS = ['OCCUPATION_NAME_ENCODED', 'LOAN_PURPOSE_ENCODED']
#TARGET_COL = 'NEXT_EMI_LABEL'

In [None]:
NUMERICAL_FEATURES_FINAL = SEQUENTIAL_COLS_NUMERICAL + STATIC_COLS_NUMERICAL

In [None]:
# Add CUSTOMER_FLAG_ENCODED to static numerical if not present
if 'CUSTOMER_FLAG_ENCODED' not in STATIC_COLS_NUMERICAL:
    STATIC_COLS_NUMERICAL_FINAL = STATIC_COLS_NUMERICAL + ['CUSTOMER_FLAG_ENCODED']
else:
    STATIC_COLS_NUMERICAL_FINAL = STATIC_COLS_NUMERICAL

In [None]:
NUMERICAL_FEATURES_WITH_OHE = SEQUENTIAL_COLS_NUMERICAL + STATIC_COLS_NUMERICAL_FINAL + training_ohe_cols
ALL_TRANSFORM_COLS = NUMERICAL_FEATURES_WITH_OHE + STATIC_COLS_OHE

In [None]:
missing_cols = set(expected_features) - set(df_full.columns)
print(missing_cols)

In [None]:
if missing_cols:
    print(f"   ⚠️ Adding {len(missing_cols)} missing columns with zeros")
    for col in missing_cols:
        df_full[col] = 0

In [None]:
# Ensure column order matches training
df_for_transform = df_combined_filtered[expected_features].copy()
    
# FIX #7: Apply ColumnTransformer
print("   Applying StandardScaler and OneHotEncoder...")
X_scaled_ohe = preprocessor.transform(df_for_transform)
ALL_FINAL_COLS = list(preprocessor.get_feature_names_out())
    
# Reconstruct DataFrame
X_predict_df = pd.DataFrame(X_scaled_ohe, columns=ALL_FINAL_COLS, index=df_full.index)

In [None]:
# Add back non-transformed columns
X_predict_df['LOAN_ID'] = df_full['LOAN_ID'].values
X_predict_df['ACTUAL_NEXT_EMI_ISSUE'] = df_full['ACTUAL_NEXT_EMI_ISSUE'].values
for col in STATIC_EMBEDDING_COLS:
    X_predict_df[col] = df_full[col].values

# Define Final Input Feature Sets with correct prefixes
LSTM_INPUT_COLS_FINAL = [f"num__{c}" for c in SEQUENTIAL_COLS_NUMERICAL] + \
                        [f"num__{c}" for c in training_ohe_cols]
    
STATIC_DENSE_COLS_FINAL = [f"num__{c}" for c in STATIC_COLS_NUMERICAL_FINAL] + \
                            [c for c in ALL_FINAL_COLS if c.startswith('cat__')]

print(f"   LSTM input features: {len(LSTM_INPUT_COLS_FINAL)}")
print(f"   Static dense features: {len(STATIC_DENSE_COLS_FINAL)}")
print(f"   Embedding features: {len(STATIC_EMBEDDING_COLS)}")

In [None]:
# Sequence Reshaping Function
def reshape_for_prediction(X_df, lstm_cols, static_dense_cols, embedding_cols, max_len):
    grouped = X_df.groupby('LOAN_ID')
    loan_ids = list(grouped.groups.keys())
        
    X_lstm = np.zeros((len(loan_ids), max_len, len(lstm_cols)), dtype=np.float32)
    X_static_dense = np.zeros((len(loan_ids), len(static_dense_cols)), dtype=np.float32)
    X_static_embed = np.zeros((len(loan_ids), len(embedding_cols)), dtype=np.int16)
    y_actual = np.zeros((len(loan_ids),), dtype=np.int8)
        
    for i, loan_id in enumerate(loan_ids):
        loan_data = grouped.get_group(loan_id)
        sequence = loan_data[lstm_cols].values
            
        # Pad or truncate sequence
        if len(sequence) >= max_len:
            X_lstm[i, :, :] = sequence[-max_len:]
        else:
            X_lstm[i, -len(sequence):, :] = sequence

        # Get static features from last record
        last_record = loan_data.iloc[-1]
        X_static_dense[i, :] = last_record[static_dense_cols].values
        X_static_embed[i, :] = last_record[embedding_cols].values.astype(np.int16)
        y_actual[i] = last_record['ACTUAL_NEXT_EMI_ISSUE'].astype(np.int8)

    return X_lstm, X_static_dense, X_static_embed, loan_ids, y_actual

    
# Execute reshaping
print("   Reshaping sequences for model input...")
X_predict_lstm, X_predict_static_dense, X_predict_static_embed, prediction_loan_ids, y_actual = \
    reshape_for_prediction(X_predict_df, LSTM_INPUT_COLS_FINAL, STATIC_DENSE_COLS_FINAL, 
                            STATIC_EMBEDDING_COLS, MAX_SEQUENCE_LENGTH)

print(f"   Reshaped data shapes:")
print(f"      LSTM: {X_predict_lstm.shape}")
print(f"      Static Dense: {X_predict_static_dense.shape}")
print(f"      Embedding: {X_predict_static_embed.shape}")

In [None]:
# Model Inference
print("\n   Running model inference...")
prediction_inputs = {
    'lstm_input': X_predict_lstm, 
    'static_dense_input': X_predict_static_dense, 
    'embedding_input': X_predict_static_embed
}

y_pred_proba = model.predict(prediction_inputs, verbose=0)[:, 0]
y_pred_class = (y_pred_proba > 0.5).astype(np.int8)

# Inverse Transform Installment Number
print("   Extracting and inverse-transforming installment numbers...")
INSTALLMENT_NO_COL_NAME_SCALED = 'num__INSTALLMENT_NO'
CURRENT_EMI_BEHAVIOR_LABEL_COL_NAME_SCALED = 'num__CURRENT_EMI_BEHAVIOR_LABEL'
    
try:
    INSTALLMENT_NO_IDX = LSTM_INPUT_COLS_FINAL.index(INSTALLMENT_NO_COL_NAME_SCALED)
    CURRENT_EMI_BEHAVIOR_LABEL_IDX = LSTM_INPUT_COLS_FINAL.index(CURRENT_EMI_BEHAVIOR_LABEL_COL_NAME_SCALED)
except ValueError as e:
    print(f"   ❌ ERROR: Required column not found in LSTM features: {e}")
    raise

In [None]:
LAST_TIME_STEP_IDX = X_predict_lstm.shape[1] - 1
    
# Get scaler parameters for inverse transform
scaler = preprocessor.named_transformers_['num']
try:
    original_feature_index = list(preprocessor.feature_names_in_).index('INSTALLMENT_NO')
except ValueError:
    print("   ⚠️ WARNING: INSTALLMENT_NO not found in preprocessor features. Using index 0.")
    original_feature_index = 0
    
install_no_mean = scaler.mean_[original_feature_index]
install_no_scale = scaler.scale_[original_feature_index]

# Inverse transform
scaled_install_no = X_predict_lstm[:, LAST_TIME_STEP_IDX, INSTALLMENT_NO_IDX]
unscaled_install_no = (scaled_install_no * install_no_scale) + install_no_mean
install_no_final_unscaled = np.round(unscaled_install_no + 1).astype(np.int16)

# Extract Current Behavior
behavior_label_final = X_predict_lstm[:, LAST_TIME_STEP_IDX, CURRENT_EMI_BEHAVIOR_LABEL_IDX].astype(np.int8)

# Final Output DataFrame
results_df = pd.DataFrame({
    'LOAN_ID': prediction_loan_ids,
    'LAST_EMI_USED_IN_PREDICTION': np.round(unscaled_install_no).astype(np.int16),
    'PREDICTED_EMI_NO': install_no_final_unscaled,
    'LAST_EMI_BEHAVIOR_USED_DESC': [TWO_CLASS_STATUS_MAP.get(lbl, 'Unknown') for lbl in behavior_label_final],
    'PROBABILITY_OF_EMI_ISSUE': y_pred_proba.astype(np.float32),
    'PREDICTED_EMI_ISSUE': y_pred_class,
    'PREDICTED_EMI_STATUS': [TWO_CLASS_STATUS_MAP.get(pred, 'Unknown') for pred in y_pred_class],
    'ACTUAL_EMI_ISSUE': y_actual,
    'ACTUAL_EMI_STATUS': [TWO_CLASS_STATUS_MAP.get(actual, 'Unknown') for actual in y_actual]
})
    
#return results_df, y_pred_class, y_actual

In [None]:
# --- Performance Calculation ---
print("\n--- Calculating Performance Metrics ---")
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy = accuracy_score(y_actual, y_pred_class)
cm = confusion_matrix(y_actual, y_pred_class)
report = classification_report(y_actual, y_pred_class, target_names=['Paid (0)', 'Not Paid (1)'], output_dict=True)

# Export Results
OUTPUT_FILE = 'Oct2025_Backtest_Predictions_Results.xlsx'
results_df.to_excel(OUTPUT_FILE, index=False)
        
print("\n" + "="*80)
print("✅ BACK-TESTING COMPLETE!")
print("="*80)
print(f"Total Loans Evaluated: {results_df.shape[0]}")
print(f"Output saved to: {OUTPUT_FILE}")

print("\n### 📈 MODEL PERFORMANCE ON BACK-TEST DATA ###")
print(f"Overall Accuracy: {accuracy:.4f}\n")
        
print("Confusion Matrix:")
cm_df = pd.DataFrame(cm, index=['Actual Paid (0)', 'Actual Not Paid (1)'],  columns=['Predicted Paid (0)', 'Predicted Not Paid (1)'])
print(cm_df)
        
print("\n Classification Report (Key Metrics):")
print(f"Precision (Not Paid): {report['Not Paid (1)']['precision']:.4f}")
print(f"Recall (Not Paid): {report['Not Paid (1)']['recall']:.4f}")
print(f"F1-Score (Not Paid): {report['Not Paid (1)']['f1-score']:.4f}")

print("\n" + "="*80)
print("\nSample Results (First 10 Predictions):")
print("="*80)
display_cols = ['LOAN_ID', 'PREDICTED_EMI_NO', 'PROBABILITY_OF_EMI_ISSUE', 'PREDICTED_EMI_STATUS', 'ACTUAL_EMI_STATUS']
print(results_df[display_cols].head(10).to_string(index=False))
        
print("\n✅ All operations completed successfully!")